qemu/block/nvme.c
/*
 * NVMe block driver based on vfio
 *
 * Copyright 2016 - 2018 Red Hat, Inc.
 *
 * Authors:
 *   Fam Zheng <famz@redhat.com>
 *   Paolo Bonzini <pbonzini@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include <linux/vfio.h>
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qstring.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/cutils.h"
#include "qemu/option.h"
#include "qemu/vfio-helpers.h"
#include "block/block_int.h"
#include "trace.h"

#include "block/nvme.h"

#define NVME_SQ_ENTRY_BYTES 64
#define NVME_CQ_ENTRY_BYTES 16
#define NVME_QUEUE_SIZE 128
#define NVME_BAR_SIZE 8192

typedef struct {
    int32_t  head, tail;
    uint8_t  *queue;
    uint64_t iova;
    /* Hardware MMIO register */
    volatile uint32_t *doorbell;
} NVMeQueue;

typedef struct {
    BlockCompletionFunc *cb;
    void *opaque;
    int cid;
    void *prp_list_page;
    uint64_t prp_list_iova;
    bool busy;
} NVMeRequest;

typedef struct {
    CoQueue     free_req_queue;
    QemuMutex   lock;

    /* Fields protected by BQL */
    int         index;
    uint8_t     *prp_list_pages;

    /* Fields protected by @lock */
    NVMeQueue   sq, cq;
    int         cq_phase;
    NVMeRequest reqs[NVME_QUEUE_SIZE];
    bool        busy;
    int         need_kick;
    int         inflight;
} NVMeQueuePair;

/* Memory mapped registers */
typedef volatile struct {
    uint64_t cap;
    uint32_t vs;
    uint32_t intms;
    uint32_t intmc;
    uint32_t cc;
    uint32_t reserved0;
    uint32_t csts;
    uint32_t nssr;
    uint32_t aqa;
    uint64_t asq;
    uint64_t acq;
    uint32_t cmbloc;
    uint32_t cmbsz;
    uint8_t  reserved1[0xec0];
    uint8_t  cmd_set_specific[0x100];
    uint32_t doorbells[];
} NVMeRegs;

QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);

typedef struct {
    AioContext *aio_context;
    QEMUVFIOState *vfio;
    NVMeRegs *regs;
    /* The submission/completion queue pairs.
     * [0]: admin queue.
     * [1..]: io queues.
     */
    NVMeQueuePair **queues;
    int nr_queues;
    size_t page_size;
    /* How many uint32_t elements each doorbell entry takes. */
    size_t doorbell_scale;
    bool write_cache_supported;
    EventNotifier irq_notifier;

    uint64_t nsze; /* Namespace size reported by identify command */
    int nsid;      /* The namespace id to read/write data. */
    int blkshift;

    uint64_t max_transfer;
    bool plugged;

    CoMutex dma_map_lock;
    CoQueue dma_flush_queue;

    /* Total size of mapped qiov, accessed under dma_map_lock */
    int dma_map_count;

    /* PCI address (required for nvme_refresh_filename()) */
    char *device;
} BDRVNVMeState;

#define NVME_BLOCK_OPT_DEVICE "device"
#define NVME_BLOCK_OPT_NAMESPACE "namespace"

static QemuOptsList runtime_opts = {
    .name = "nvme",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = NVME_BLOCK_OPT_DEVICE,
            .type = QEMU_OPT_STRING,
            .help = "NVMe PCI device address",
        },
        {
            .name = NVME_BLOCK_OPT_NAMESPACE,
            .type = QEMU_OPT_NUMBER,
            .help = "NVMe namespace",
        },
        { /* end of list */ }
    },
};

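/* Allocate a zeroed, page-aligned buffer for one submission or completion
 * queue and map it for DMA so the device can access it. */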
static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
                            int nentries, int entry_bytes, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    size_t bytes;
    int r;

    bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
    q->head = q->tail = 0;
    q->queue = qemu_try_blockalign0(bs, bytes);

    if (!q->queue) {
        error_setg(errp, "Cannot allocate queue");
        return;
    }
    r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
    if (r) {
        error_setg(errp, "Cannot map queue");
    }
}

static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
{
    qemu_vfree(q->prp_list_pages);
    qemu_vfree(q->sq.queue);
    qemu_vfree(q->cq.queue);
    qemu_mutex_destroy(&q->lock);
    g_free(q);
}

static void nvme_free_req_queue_cb(void *opaque)
{
    NVMeQueuePair *q = opaque;

    qemu_mutex_lock(&q->lock);
    while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
        /* Retry all pending requests */
    }
    qemu_mutex_unlock(&q->lock);
}

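/* Create a submission/completion queue pair. The PRP list pages (one page
 * per request slot) are allocated and DMA-mapped up front so that submitting
 * a request never has to allocate memory in the I/O path. */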
static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
                                             int idx, int size,
                                             Error **errp)
{
    int i, r;
    BDRVNVMeState *s = bs->opaque;
    Error *local_err = NULL;
    NVMeQueuePair *q = g_new0(NVMeQueuePair, 1);
    uint64_t prp_list_iova;

    qemu_mutex_init(&q->lock);
    q->index = idx;
    qemu_co_queue_init(&q->free_req_queue);
    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
                          s->page_size * NVME_QUEUE_SIZE,
                          false, &prp_list_iova);
    if (r) {
        goto fail;
    }
    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
        NVMeRequest *req = &q->reqs[i];
        req->cid = i + 1;
        req->prp_list_page = q->prp_list_pages + i * s->page_size;
        req->prp_list_iova = prp_list_iova + i * s->page_size;
    }
    nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto fail;
    }
    q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale];

    nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        goto fail;
    }
    q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale];

    return q;
fail:
    nvme_free_queue_pair(bs, q);
    return NULL;
}

/* With q->lock */
static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
{
    if (s->plugged || !q->need_kick) {
        return;
    }
    trace_nvme_kick(s, q->index);
    assert(!(q->sq.tail & 0xFF00));
    /* Fence the write to submission queue entry before notifying the device. */
    smp_wmb();
    *q->sq.doorbell = cpu_to_le32(q->sq.tail);
    q->inflight += q->need_kick;
    q->need_kick = 0;
}

/* Find a free request element if any; otherwise:
 * a) if in coroutine context, wait for one to become available;
 * b) if not in coroutine context, return NULL.
 */
static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
{
    int i;
    NVMeRequest *req = NULL;

    qemu_mutex_lock(&q->lock);
    while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
        /* We have to leave one slot empty as that is the full queue case (head
         * == tail + 1). */
        if (qemu_in_coroutine()) {
            trace_nvme_free_req_queue_wait(q);
            qemu_co_queue_wait(&q->free_req_queue, &q->lock);
        } else {
            qemu_mutex_unlock(&q->lock);
            return NULL;
        }
    }
    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
        if (!q->reqs[i].busy) {
            q->reqs[i].busy = true;
            req = &q->reqs[i];
            break;
        }
    }
    /* We have checked inflight and need_kick while holding q->lock, so one
     * free req must be available. */
    assert(req);
    qemu_mutex_unlock(&q->lock);
    return req;
}

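/* Map the status code of a completion queue entry to a negative errno value. */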
static inline int nvme_translate_error(const NvmeCqe *c)
{
    uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
    if (status) {
        trace_nvme_error(le32_to_cpu(c->result),
                         le16_to_cpu(c->sq_head),
                         le16_to_cpu(c->sq_id),
                         le16_to_cpu(c->cid),
                         le16_to_cpu(status));
    }
    switch (status) {
    case 0:
        return 0;
    case 1:
        return -ENOSYS;
    case 2:
        return -EINVAL;
    default:
        return -EIO;
    }
}

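/* Reap completed requests: a completion entry whose phase bit differs from
 * q->cq_phase is new. Each completed request is looked up by CID, marked
 * free, and its callback is run with q->lock temporarily dropped; finally
 * the CQ doorbell tells the device which entries have been consumed. */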
/* With q->lock */
static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
{
    bool progress = false;
    NVMeRequest *preq;
    NVMeRequest req;
    NvmeCqe *c;

    trace_nvme_process_completion(s, q->index, q->inflight);
    if (q->busy || s->plugged) {
        trace_nvme_process_completion_queue_busy(s, q->index);
        return false;
    }
    q->busy = true;
    assert(q->inflight >= 0);
    while (q->inflight) {
        uint16_t cid;
        c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
        if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
            break;
        }
        q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
        if (!q->cq.head) {
            q->cq_phase = !q->cq_phase;
        }
        cid = le16_to_cpu(c->cid);
        if (cid == 0 || cid > NVME_QUEUE_SIZE) {
            fprintf(stderr, "Unexpected CID in completion queue: %" PRIu16 "\n",
                    cid);
            continue;
        }
        assert(cid <= NVME_QUEUE_SIZE);
        trace_nvme_complete_command(s, q->index, cid);
        preq = &q->reqs[cid - 1];
        req = *preq;
        assert(req.cid == cid);
        assert(req.cb);
        preq->busy = false;
        preq->cb = preq->opaque = NULL;
        qemu_mutex_unlock(&q->lock);
        req.cb(req.opaque, nvme_translate_error(c));
        qemu_mutex_lock(&q->lock);
        q->inflight--;
        progress = true;
    }
    if (progress) {
        /* Notify the device so it can post more completions. */
        smp_mb_release();
        *q->cq.doorbell = cpu_to_le32(q->cq.head);
        if (!qemu_co_queue_empty(&q->free_req_queue)) {
            aio_bh_schedule_oneshot(s->aio_context, nvme_free_req_queue_cb, q);
        }
    }
    q->busy = false;
    return progress;
}

static void nvme_trace_command(const NvmeCmd *cmd)
{
    int i;

    for (i = 0; i < 8; ++i) {
        uint8_t *cmdp = (uint8_t *)cmd + i * 8;
        trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
                                      cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
    }
}

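/* Copy the command into the submission queue tail slot, ring the SQ doorbell
 * (unless plugged) and opportunistically reap any pending completions. */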
static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
                                NVMeRequest *req,
                                NvmeCmd *cmd, BlockCompletionFunc cb,
                                void *opaque)
{
    assert(!req->cb);
    req->cb = cb;
    req->opaque = opaque;
    cmd->cid = cpu_to_le16(req->cid);

    trace_nvme_submit_command(s, q->index, req->cid);
    nvme_trace_command(cmd);
    qemu_mutex_lock(&q->lock);
    memcpy((uint8_t *)q->sq.queue +
           q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
    q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
    q->need_kick++;
    nvme_kick(s, q);
    nvme_process_completion(s, q);
    qemu_mutex_unlock(&q->lock);
}

static void nvme_cmd_sync_cb(void *opaque, int ret)
{
    int *pret = opaque;
    *pret = ret;
    aio_wait_kick();
}

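/* Submit a command and poll the AioContext until it completes. Used here
 * only for admin commands, e.g. during controller initialization. */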
static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
                         NvmeCmd *cmd)
{
    NVMeRequest *req;
    BDRVNVMeState *s = bs->opaque;
    int ret = -EINPROGRESS;
    req = nvme_get_free_req(q);
    if (!req) {
        return -EBUSY;
    }
    nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);

    BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
    return ret;
}

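/* Issue IDENTIFY CONTROLLER and IDENTIFY NAMESPACE admin commands to learn
 * the controller's write cache support and maximum transfer size, and the
 * namespace's size and block size. */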
static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    NvmeIdCtrl *idctrl;
    NvmeIdNs *idns;
    NvmeLBAF *lbaf;
    uint8_t *resp;
    int r;
    uint64_t iova;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_IDENTIFY,
        .cdw10 = cpu_to_le32(0x1),
    };

    resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
    if (!resp) {
        error_setg(errp, "Cannot allocate buffer for identify response");
        goto out;
    }
    idctrl = (NvmeIdCtrl *)resp;
    idns = (NvmeIdNs *)resp;
    r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
    if (r) {
        error_setg(errp, "Cannot map buffer for DMA");
        goto out;
    }
    cmd.prp1 = cpu_to_le64(iova);

    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to identify controller");
        goto out;
    }

    if (le32_to_cpu(idctrl->nn) < namespace) {
        error_setg(errp, "Invalid namespace");
        goto out;
    }
    s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1;
    s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
    /* For now the page list buffer per command is one page, to hold at most
     * s->page_size / sizeof(uint64_t) entries. */
    s->max_transfer = MIN_NON_ZERO(s->max_transfer,
                          s->page_size / sizeof(uint64_t) * s->page_size);

    memset(resp, 0, 4096);

    cmd.cdw10 = 0;
    cmd.nsid = cpu_to_le32(namespace);
    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to identify namespace");
        goto out;
    }

    s->nsze = le64_to_cpu(idns->nsze);
    lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];

    if (lbaf->ms) {
        error_setg(errp, "Namespaces with metadata are not yet supported");
        goto out;
    }

    if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
        (1 << lbaf->ds) > s->page_size)
    {
        error_setg(errp, "Namespace has unsupported block size (2^%d)",
                   lbaf->ds);
        goto out;
    }

    s->blkshift = lbaf->ds;
out:
    qemu_vfio_dma_unmap(s->vfio, resp);
    qemu_vfree(resp);
}

static bool nvme_poll_queues(BDRVNVMeState *s)
{
    bool progress = false;
    int i;

    for (i = 0; i < s->nr_queues; i++) {
        NVMeQueuePair *q = s->queues[i];
        qemu_mutex_lock(&q->lock);
        while (nvme_process_completion(s, q)) {
            /* Keep polling */
            progress = true;
        }
        qemu_mutex_unlock(&q->lock);
    }
    return progress;
}

static void nvme_handle_event(EventNotifier *n)
{
    BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);

    trace_nvme_handle_event(s);
    event_notifier_test_and_clear(n);
    nvme_poll_queues(s);
}

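/* Allocate a new I/O queue pair locally, then ask the controller to create
 * the completion queue and the submission queue with admin commands. */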
static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    int n = s->nr_queues;
    NVMeQueuePair *q;
    NvmeCmd cmd;
    int queue_size = NVME_QUEUE_SIZE;

    q = nvme_create_queue_pair(bs, n, queue_size, errp);
    if (!q) {
        return false;
    }
    cmd = (NvmeCmd) {
        .opcode = NVME_ADM_CMD_CREATE_CQ,
        .prp1 = cpu_to_le64(q->cq.iova),
        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
        .cdw11 = cpu_to_le32(0x3),
    };
    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to create io queue [%d]", n);
        nvme_free_queue_pair(bs, q);
        return false;
    }
    cmd = (NvmeCmd) {
        .opcode = NVME_ADM_CMD_CREATE_SQ,
        .prp1 = cpu_to_le64(q->sq.iova),
        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
        .cdw11 = cpu_to_le32(0x1 | (n << 16)),
    };
    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
        error_setg(errp, "Failed to create io queue [%d]", n);
        nvme_free_queue_pair(bs, q);
        return false;
    }
    s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
    s->queues[n] = q;
    s->nr_queues++;
    return true;
}

static bool nvme_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
    bool progress = false;

    trace_nvme_poll_cb(s);
    progress = nvme_poll_queues(s);
    return progress;
}

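/* Open the controller with VFIO, reset and enable it following the
 * initialization sequence of the NVMe spec, set up the admin queue and MSI-X
 * interrupt, identify the namespace, and create the first I/O queue pair. */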
static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
                     Error **errp)
{
    BDRVNVMeState *s = bs->opaque;
    int ret;
    uint64_t cap;
    uint64_t timeout_ms;
    uint64_t deadline, now;
    Error *local_err = NULL;

    qemu_co_mutex_init(&s->dma_map_lock);
    qemu_co_queue_init(&s->dma_flush_queue);
    s->device = g_strdup(device);
    s->nsid = namespace;
    s->aio_context = bdrv_get_aio_context(bs);
    ret = event_notifier_init(&s->irq_notifier, 0);
    if (ret) {
        error_setg(errp, "Failed to init event notifier");
        return ret;
    }

    s->vfio = qemu_vfio_open_pci(device, errp);
    if (!s->vfio) {
        ret = -EINVAL;
        goto out;
    }

    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp);
    if (!s->regs) {
        ret = -EINVAL;
        goto out;
    }

    /* Perform the initialization sequence as described in NVMe spec section
     * "7.6.1 Initialization". */

    cap = le64_to_cpu(s->regs->cap);
    if (!(cap & (1ULL << 37))) {
        error_setg(errp, "Device doesn't support NVMe command set");
        ret = -EINVAL;
        goto out;
    }

    s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
    s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
    bs->bl.opt_mem_alignment = s->page_size;
    timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);

    /* Reset device to get a clean state. */
    s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
    /* Wait for CSTS.RDY = 0. */
    deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL;
    while (le32_to_cpu(s->regs->csts) & 0x1) {
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
            error_setg(errp, "Timeout while waiting for device to reset (%"
                             PRIu64 " ms)",
                       timeout_ms);
            ret = -ETIMEDOUT;
            goto out;
        }
    }

    /* Set up admin queue. */
    s->queues = g_new(NVMeQueuePair *, 1);
    s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
    if (!s->queues[0]) {
        ret = -EINVAL;
        goto out;
    }
    s->nr_queues = 1;
    QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
    s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
    s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
    s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);

    /* After setting up all control registers we can enable device now. */
    s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
                              (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
                              0x1);
    /* Wait for CSTS.RDY = 1. */
    now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    deadline = now + timeout_ms * 1000000;
    while (!(le32_to_cpu(s->regs->csts) & 0x1)) {
        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
            error_setg(errp, "Timeout while waiting for device to start (%"
                             PRIu64 " ms)",
                       timeout_ms);
            ret = -ETIMEDOUT;
            goto out;
        }
    }

    ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
                                 VFIO_PCI_MSIX_IRQ_INDEX, errp);
    if (ret) {
        goto out;
    }
    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                           false, nvme_handle_event, nvme_poll_cb);

    nvme_identify(bs, namespace, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EIO;
        goto out;
    }

    /* Set up command queues. */
    if (!nvme_add_io_queue(bs, errp)) {
        ret = -EIO;
    }
out:
    /* Cleaning up is done in nvme_file_open() upon error. */
    return ret;
}

/* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
 *
 *     nvme://0000:44:00.0/1
 *
 * where "nvme://" is the fixed protocol prefix, the middle part is the PCI
 * address, and the last part is the namespace number, which starts from 1
 * according to the NVMe spec. */
static void nvme_parse_filename(const char *filename, QDict *options,
                                Error **errp)
{
    int pref = strlen("nvme://");

    if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
        const char *tmp = filename + pref;
        char *device;
        const char *namespace;
        unsigned long ns;
        const char *slash = strchr(tmp, '/');
        if (!slash) {
            qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
            return;
        }
        device = g_strndup(tmp, slash - tmp);
        qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
        g_free(device);
        namespace = slash + 1;
        if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
            error_setg(errp, "Invalid namespace '%s', positive number expected",
                       namespace);
            return;
        }
        qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
                      *namespace ? namespace : "1");
    }
}

static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
                                           Error **errp)
{
    int ret;
    BDRVNVMeState *s = bs->opaque;
    NvmeCmd cmd = {
        .opcode = NVME_ADM_CMD_SET_FEATURES,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32(0x06),
        .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
    };

    ret = nvme_cmd_sync(bs, s->queues[0], &cmd);
    if (ret) {
        error_setg(errp, "Failed to configure NVMe write cache");
    }
    return ret;
}

static void nvme_close(BlockDriverState *bs)
{
    int i;
    BDRVNVMeState *s = bs->opaque;

    for (i = 0; i < s->nr_queues; ++i) {
        nvme_free_queue_pair(bs, s->queues[i]);
    }
    g_free(s->queues);
    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                           false, NULL, NULL);
    event_notifier_cleanup(&s->irq_notifier);
    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
    qemu_vfio_close(s->vfio);

    g_free(s->device);
}

static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp)
{
    const char *device;
    QemuOpts *opts;
    int namespace;
    int ret;
    BDRVNVMeState *s = bs->opaque;

    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options, &error_abort);
    device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
    if (!device) {
        error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
        qemu_opts_del(opts);
        return -EINVAL;
    }

    namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
    ret = nvme_init(bs, device, namespace, errp);
    qemu_opts_del(opts);
    if (ret) {
        goto fail;
    }
    if (flags & BDRV_O_NOCACHE) {
        if (!s->write_cache_supported) {
            error_setg(errp,
                       "NVMe controller doesn't support write cache configuration");
            ret = -EINVAL;
        } else {
            ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
                                                  errp);
        }
        if (ret) {
            goto fail;
        }
    }
    bs->supported_write_flags = BDRV_REQ_FUA;
    return 0;
fail:
    nvme_close(bs);
    return ret;
}

static int64_t nvme_getlength(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    return s->nsze << s->blkshift;
}

static uint32_t nvme_get_blocksize(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
    return UINT32_C(1) << s->blkshift;
}

static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    uint32_t blocksize = nvme_get_blocksize(bs);
    bsz->phys = blocksize;
    bsz->log = blocksize;
    return 0;
}

/* Called with s->dma_map_lock */
static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
                                            QEMUIOVector *qiov)
{
    int r = 0;
    BDRVNVMeState *s = bs->opaque;

    s->dma_map_count -= qiov->size;
    if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
        r = qemu_vfio_dma_reset_temporary(s->vfio);
        if (!r) {
            qemu_co_queue_restart_all(&s->dma_flush_queue);
        }
    }
    return r;
}

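/* DMA-map each iovec element and fill the request's PRP page list with the
 * resulting IOVAs. prp1 always points at the first page; prp2 is either
 * zero, the second page, or the address of the rest of the page list. If the
 * IOVA space is exhausted, wait for in-flight requests to unmap their
 * buffers (or reset temporary mappings) and retry once per iovec. */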
/* Called with s->dma_map_lock */
static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
                                          NVMeRequest *req, QEMUIOVector *qiov)
{
    BDRVNVMeState *s = bs->opaque;
    uint64_t *pagelist = req->prp_list_page;
    int i, j, r;
    int entries = 0;

    assert(qiov->size);
    assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
    assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
    for (i = 0; i < qiov->niov; ++i) {
        bool retry = true;
        uint64_t iova;
try_map:
        r = qemu_vfio_dma_map(s->vfio,
                              qiov->iov[i].iov_base,
                              qiov->iov[i].iov_len,
                              true, &iova);
        if (r == -ENOMEM && retry) {
            retry = false;
            trace_nvme_dma_flush_queue_wait(s);
            if (s->dma_map_count) {
                trace_nvme_dma_map_flush(s);
                qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
            } else {
                r = qemu_vfio_dma_reset_temporary(s->vfio);
                if (r) {
                    goto fail;
                }
            }
            goto try_map;
        }
        if (r) {
            goto fail;
        }

        for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
            pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
        }
        trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
                                    qiov->iov[i].iov_len / s->page_size);
    }

    s->dma_map_count += qiov->size;

    assert(entries <= s->page_size / sizeof(uint64_t));
    switch (entries) {
    case 0:
        abort();
    case 1:
        cmd->prp1 = pagelist[0];
        cmd->prp2 = 0;
        break;
    case 2:
        cmd->prp1 = pagelist[0];
        cmd->prp2 = pagelist[1];
        break;
    default:
        cmd->prp1 = pagelist[0];
        cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
        break;
    }
    trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
    for (i = 0; i < entries; ++i) {
        trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
    }
    return 0;
fail:
    /* No need to unmap [0 - i) iovs even if we've failed, since we don't
     * increment s->dma_map_count. This is okay for fixed mapping memory areas
     * because they are already mapped before calling this function; for
     * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
     * calling qemu_vfio_dma_reset_temporary when necessary. */
    return r;
}

typedef struct {
    Coroutine *co;
    int ret;
    AioContext *ctx;
} NVMeCoData;

static void nvme_rw_cb_bh(void *opaque)
{
    NVMeCoData *data = opaque;
    qemu_coroutine_enter(data->co);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NVMeCoData *data = opaque;
    data->ret = ret;
    if (!data->co) {
        /* The rw coroutine hasn't yielded, don't try to enter. */
        return;
    }
    aio_bh_schedule_oneshot(data->ctx, nvme_rw_cb_bh, data);
}

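/* Fast path for page-aligned requests: build an NVMe READ or WRITE command
 * (setting the FUA bit if requested), DMA-map the qiov, submit on the I/O
 * queue, and yield until nvme_rw_cb() reschedules this coroutine. */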
static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            bool is_write,
                                            int flags)
{
    int r;
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[1];
    NVMeRequest *req;

    uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
                       (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
    NvmeCmd cmd = {
        .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
        .cdw12 = cpu_to_le32(cdw12),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
    assert(s->nr_queues > 1);
    req = nvme_get_free_req(ioq);
    assert(req);

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        req->busy = false;
        return r;
    }
    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_unmap_qiov(bs, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        return r;
    }

    trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
    return data.ret;
}

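/* Return true if every iovec element is aligned to the device page size and
 * can therefore be DMA-mapped directly, without a bounce buffer. */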
static inline bool nvme_qiov_aligned(BlockDriverState *bs,
                                     const QEMUIOVector *qiov)
{
    int i;
    BDRVNVMeState *s = bs->opaque;

    for (i = 0; i < qiov->niov; ++i) {
        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
            trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
                                      qiov->iov[i].iov_len, s->page_size);
            return false;
        }
    }
    return true;
}

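/* Common read/write entry point: take the zero-copy path when the qiov is
 * page-aligned, otherwise bounce the request through a temporary aligned
 * buffer. */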
static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
                       QEMUIOVector *qiov, bool is_write, int flags)
{
    BDRVNVMeState *s = bs->opaque;
    int r;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;

    assert(QEMU_IS_ALIGNED(offset, s->page_size));
    assert(QEMU_IS_ALIGNED(bytes, s->page_size));
    assert(bytes <= s->max_transfer);
    if (nvme_qiov_aligned(bs, qiov)) {
        return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
    }
    trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
    buf = qemu_try_blockalign(bs, bytes);

    if (!buf) {
        return -ENOMEM;
    }
    qemu_iovec_init(&local_qiov, 1);
    if (is_write) {
        qemu_iovec_to_buf(qiov, 0, buf, bytes);
    }
    qemu_iovec_add(&local_qiov, buf, bytes);
    r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
    qemu_iovec_destroy(&local_qiov);
    if (!r && !is_write) {
        qemu_iovec_from_buf(qiov, 0, buf, bytes);
    }
    qemu_vfree(buf);
    return r;
}

static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
                                       uint64_t offset, uint64_t bytes,
                                       QEMUIOVector *qiov, int flags)
{
    return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
}

static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
                                        uint64_t offset, uint64_t bytes,
                                        QEMUIOVector *qiov, int flags)
{
    return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
}

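/* Flush the device's volatile write cache by submitting an NVMe FLUSH
 * command on the I/O queue. */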
static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[1];
    NVMeRequest *req;
    NvmeCmd cmd = {
        .opcode = NVME_CMD_FLUSH,
        .nsid = cpu_to_le32(s->nsid),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    assert(s->nr_queues > 1);
    req = nvme_get_free_req(ioq);
    assert(req);
    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);

    data.co = qemu_coroutine_self();
    if (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    return data.ret;
}


static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp)
{
    return 0;
}

static void nvme_refresh_filename(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
             s->device, s->nsid);
}

static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVNVMeState *s = bs->opaque;

    bs->bl.opt_mem_alignment = s->page_size;
    bs->bl.request_alignment = s->page_size;
    bs->bl.max_transfer = s->max_transfer;
}

static void nvme_detach_aio_context(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;

    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
                           false, NULL, NULL);
}

static void nvme_attach_aio_context(BlockDriverState *bs,
                                    AioContext *new_context)
{
    BDRVNVMeState *s = bs->opaque;

    s->aio_context = new_context;
    aio_set_event_notifier(new_context, &s->irq_notifier,
                           false, nvme_handle_event, nvme_poll_cb);
}

static void nvme_aio_plug(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    assert(!s->plugged);
    s->plugged = true;
}

static void nvme_aio_unplug(BlockDriverState *bs)
{
    int i;
    BDRVNVMeState *s = bs->opaque;
    assert(s->plugged);
    s->plugged = false;
    for (i = 1; i < s->nr_queues; i++) {
        NVMeQueuePair *q = s->queues[i];
        qemu_mutex_lock(&q->lock);
        nvme_kick(s, q);
        nvme_process_completion(s, q);
        qemu_mutex_unlock(&q->lock);
    }
}

static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    int ret;
    BDRVNVMeState *s = bs->opaque;

    ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL);
    if (ret) {
        /* FIXME: we may run out of IOVA addresses after repeated
         * bdrv_register_buf/bdrv_unregister_buf, because qemu_vfio_dma_unmap
         * doesn't reclaim addresses for fixed mappings. */
        error_report("nvme_register_buf failed: %s", strerror(-ret));
    }
}

static void nvme_unregister_buf(BlockDriverState *bs, void *host)
{
    BDRVNVMeState *s = bs->opaque;

    qemu_vfio_dma_unmap(s->vfio, host);
}

static const char *const nvme_strong_runtime_opts[] = {
    NVME_BLOCK_OPT_DEVICE,
    NVME_BLOCK_OPT_NAMESPACE,

    NULL
};

static BlockDriver bdrv_nvme = {
    .format_name              = "nvme",
    .protocol_name            = "nvme",
    .instance_size            = sizeof(BDRVNVMeState),

    .bdrv_parse_filename      = nvme_parse_filename,
    .bdrv_file_open           = nvme_file_open,
    .bdrv_close               = nvme_close,
    .bdrv_getlength           = nvme_getlength,
    .bdrv_probe_blocksizes    = nvme_probe_blocksizes,

    .bdrv_co_preadv           = nvme_co_preadv,
    .bdrv_co_pwritev          = nvme_co_pwritev,
    .bdrv_co_flush_to_disk    = nvme_co_flush,
    .bdrv_reopen_prepare      = nvme_reopen_prepare,

    .bdrv_refresh_filename    = nvme_refresh_filename,
    .bdrv_refresh_limits      = nvme_refresh_limits,
    .strong_runtime_opts      = nvme_strong_runtime_opts,

    .bdrv_detach_aio_context  = nvme_detach_aio_context,
    .bdrv_attach_aio_context  = nvme_attach_aio_context,

    .bdrv_io_plug             = nvme_aio_plug,
    .bdrv_io_unplug           = nvme_aio_unplug,

    .bdrv_register_buf        = nvme_register_buf,
    .bdrv_unregister_buf      = nvme_unregister_buf,
};

static void bdrv_nvme_init(void)
{
    bdrv_register(&bdrv_nvme);
}

block_init(bdrv_nvme_init);