qemu/hw/block/nvme.c
/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.1, 1.0e
 *
 *  http://www.nvmexpress.org/resources/
 */

/**
 * Usage: add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
 */

#include <hw/block/block.h>
#include <hw/hw.h>
#include <hw/pci/msix.h>
#include <hw/pci/pci.h>

#include "nvme.h"

static void nvme_process_sq(void *opaque);

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            pci_irq_pulse(&n->parent_obj);
        }
    }
}

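/*
 * Translate a PRP pair into a QEMUSGList.  PRP1 may begin at any offset
 * inside a page; PRP2 is either a second data pointer (for transfers of up
 * to two pages) or the bus address of a PRP list, which is read and chained
 * one page of entries at a time.
 */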
static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
    uint32_t len, NvmeCtrl *n)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    int num_prps = (len >> n->page_bits) + 1;

    trans_len = MIN(len, trans_len);
    if (!prp1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
    qemu_sglist_add(qsg, prp1, trans_len);
    len -= trans_len;
    if (len) {
        if (!prp2) {
            goto unmap;
        }
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            nents = (len + n->page_size - 1) >> n->page_bits;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            pci_dma_read(&n->parent_obj, prp2, (void *)prp_list, prp_trans);
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == n->max_prp_ents - 1 && len > n->page_size) {
                    if (!prp_ent || prp_ent & (n->page_size - 1)) {
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
                    pci_dma_read(&n->parent_obj, prp_ent, (void *)prp_list,
                        prp_trans);
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (!prp_ent || prp_ent & (n->page_size - 1)) {
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                qemu_sglist_add(qsg, prp_ent, trans_len);
                len -= trans_len;
                i++;
            }
        } else {
            if (prp2 & (n->page_size - 1)) {
                goto unmap;
            }
            qemu_sglist_add(qsg, prp2, len);
        }
    }
    return NVME_SUCCESS;

 unmap:
    qemu_sglist_destroy(qsg);
    return NVME_INVALID_FIELD | NVME_DNR;
}

static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
    uint64_t prp1, uint64_t prp2)
{
    QEMUSGList qsg;

    if (nvme_map_prp(&qsg, prp1, prp2, len, n)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (dma_buf_read(ptr, len, &qsg)) {
        qemu_sglist_destroy(&qsg);
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    return NVME_SUCCESS;
}

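/*
 * Timer callback: drain completed requests onto the completion queue.
 * Entries are written to guest memory at the current CQ tail with the
 * current phase bit, and each request is recycled onto its SQ free list.
 */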
static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        QTAILQ_REMOVE(&cq->req_list, req, entry);
        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        nvme_inc_cq_tail(cq);
        pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
            sizeof(req->cqe));
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    nvme_isr_notify(n, cq);
}

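/*
 * Queue a finished request for completion posting; the 500 ns timer delay
 * lets back-to-back completions be batched before nvme_post_cqes runs.
 */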
static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeSQueue *sq = req->sq;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    bdrv_acct_done(n->conf.bs, &req->acct);
    if (!ret) {
        req->status = NVME_SUCCESS;
    } else {
        req->status = NVME_INTERNAL_DEV_ERROR;
    }

    qemu_sglist_destroy(&req->qsg);
    nvme_enqueue_req_completion(cq, req);
}

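/*
 * Handle an I/O read or write: validate the LBA range against the namespace
 * size, map the command's PRPs, and submit the DMA-assisted block request.
 * Completion is reported asynchronously through nvme_rw_cb.
 */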
static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
    uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint64_t prp1 = le64_to_cpu(rw->prp1);
    uint64_t prp2 = le64_to_cpu(rw->prp2);

    uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
    uint64_t data_size = nlb << data_shift;
    uint64_t aio_slba  = slba << (data_shift - BDRV_SECTOR_BITS);
    int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;

    if ((slba + nlb) > ns->id_ns.nsze) {
        return NVME_LBA_RANGE | NVME_DNR;
    }
    if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    assert((nlb << data_shift) == req->qsg.size);

    dma_acct_start(n->conf.bs, &req->acct, &req->qsg, is_write ?
        BDRV_ACCT_WRITE : BDRV_ACCT_READ);
    req->aiocb = is_write ?
        dma_bdrv_write(n->conf.bs, &req->qsg, aio_slba, nvme_rw_cb, req) :
        dma_bdrv_read(n->conf.bs, &req->qsg, aio_slba, nvme_rw_cb, req);

    return NVME_NO_COMPLETE;
}

static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(cmd->nsid);

    if (nsid == 0 || nsid > n->num_namespaces) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = &n->namespaces[nsid - 1];
    switch (cmd->opcode) {
    case NVME_CMD_FLUSH:
        return NVME_SUCCESS;
    case NVME_CMD_WRITE:
    case NVME_CMD_READ:
        return nvme_rw(n, ns, cmd, req);
    default:
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
}

static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
    n->sq[sq->sqid] = NULL;
    timer_del(sq->timer);
    timer_free(sq->timer);
    g_free(sq->io_req);
    if (sq->sqid) {
        g_free(sq);
    }
}

static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
    NvmeRequest *req, *next;
    NvmeSQueue *sq;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (!qid || nvme_check_sqid(n, qid)) {
        return NVME_INVALID_QID | NVME_DNR;
    }

    sq = n->sq[qid];
    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
        req = QTAILQ_FIRST(&sq->out_req_list);
        assert(req->aiocb);
        bdrv_aio_cancel(req->aiocb);
    }
    if (!nvme_check_cqid(n, sq->cqid)) {
        cq = n->cq[sq->cqid];
        QTAILQ_REMOVE(&cq->sq_list, sq, entry);

        nvme_post_cqes(cq);
        QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
            if (req->sq == sq) {
                QTAILQ_REMOVE(&cq->req_list, req, entry);
                QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
            }
        }
    }

    nvme_free_sq(sq, n);
    return NVME_SUCCESS;
}

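/*
 * Initialize a submission queue: preallocate one NvmeRequest per entry,
 * create the per-queue processing timer, and attach the queue to its
 * completion queue and to the controller's SQ table.
 */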
static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
    uint16_t sqid, uint16_t cqid, uint16_t size)
{
    int i;
    NvmeCQueue *cq;

    sq->ctrl = n;
    sq->dma_addr = dma_addr;
    sq->sqid = sqid;
    sq->size = size;
    sq->cqid = cqid;
    sq->head = sq->tail = 0;
    sq->io_req = g_malloc(sq->size * sizeof(*sq->io_req));

    QTAILQ_INIT(&sq->req_list);
    QTAILQ_INIT(&sq->out_req_list);
    for (i = 0; i < sq->size; i++) {
        sq->io_req[i].sq = sq;
        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
    }
    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);

    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
    n->sq[sqid] = sq;
}

static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeSQueue *sq;
    NvmeCreateSq *c = (NvmeCreateSq *)cmd;

    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t sqid = le16_to_cpu(c->sqid);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->sq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    if (!cqid || nvme_check_cqid(n, cqid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (!sqid || (sqid && !nvme_check_sqid(n, sqid))) {
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (!prp1 || prp1 & (n->page_size - 1)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (!(NVME_SQ_FLAGS_PC(qflags))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    sq = g_malloc0(sizeof(*sq));
    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
    return NVME_SUCCESS;
}

static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
    n->cq[cq->cqid] = NULL;
    timer_del(cq->timer);
    timer_free(cq->timer);
    msix_vector_unuse(&n->parent_obj, cq->vector);
    if (cq->cqid) {
        g_free(cq);
    }
}

static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (!qid || nvme_check_cqid(n, qid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }

    cq = n->cq[qid];
    if (!QTAILQ_EMPTY(&cq->sq_list)) {
        return NVME_INVALID_QUEUE_DEL;
    }
    nvme_free_cq(cq, n);
    return NVME_SUCCESS;
}

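/*
 * Initialize a completion queue: the phase tag starts at 1, the MSI-X
 * vector is marked in use, and a timer is created to post pending
 * completion entries.
 */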
static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
    uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
{
    cq->ctrl = n;
    cq->cqid = cqid;
    cq->size = size;
    cq->dma_addr = dma_addr;
    cq->phase = 1;
    cq->irq_enabled = irq_enabled;
    cq->vector = vector;
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
    msix_vector_use(&n->parent_obj, cq->vector);
    n->cq[cqid] = cq;
    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}

static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeCQueue *cq;
    NvmeCreateCq *c = (NvmeCreateCq *)cmd;
    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t vector = le16_to_cpu(c->irq_vector);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->cq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    if (!cqid || (cqid && !nvme_check_cqid(n, cqid))) {
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (!prp1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (vector > n->num_queues) {
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (!(NVME_CQ_FLAGS_PC(qflags))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    cq = g_malloc0(sizeof(*cq));
    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
        NVME_CQ_FLAGS_IEN(qflags));
    return NVME_SUCCESS;
}

static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)cmd;
    uint32_t cns  = le32_to_cpu(c->cns);
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint64_t prp1 = le64_to_cpu(c->prp1);
    uint64_t prp2 = le64_to_cpu(c->prp2);

    if (cns) {
        return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
            prp1, prp2);
    }
    if (nsid == 0 || nsid > n->num_namespaces) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = &n->namespaces[nsid - 1];
    return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
        prp1, prp2);
}

static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);

    switch (dw10) {
    case NVME_NUMBER_OF_QUEUES:
        req->cqe.result = cpu_to_le32(n->num_queues);
        break;
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    return NVME_SUCCESS;
}

static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);

    switch (dw10) {
    case NVME_NUMBER_OF_QUEUES:
        req->cqe.result = cpu_to_le32(n->num_queues);
        break;
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    return NVME_SUCCESS;
}

static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    switch (cmd->opcode) {
    case NVME_ADM_CMD_DELETE_SQ:
        return nvme_del_sq(n, cmd);
    case NVME_ADM_CMD_CREATE_SQ:
        return nvme_create_sq(n, cmd);
    case NVME_ADM_CMD_DELETE_CQ:
        return nvme_del_cq(n, cmd);
    case NVME_ADM_CMD_CREATE_CQ:
        return nvme_create_cq(n, cmd);
    case NVME_ADM_CMD_IDENTIFY:
        return nvme_identify(n, cmd);
    case NVME_ADM_CMD_SET_FEATURES:
        return nvme_set_feature(n, cmd, req);
    case NVME_ADM_CMD_GET_FEATURES:
        return nvme_get_feature(n, cmd, req);
    default:
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
}

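/*
 * Timer callback: fetch commands from a submission queue while it is
 * non-empty and free request slots remain, dispatching each to the admin or
 * I/O handler.  Commands that do not complete inline are finished later by
 * their AIO callback.
 */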
static void nvme_process_sq(void *opaque)
{
    NvmeSQueue *sq = opaque;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    uint16_t status;
    hwaddr addr;
    NvmeCmd cmd;
    NvmeRequest *req;

    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
        addr = sq->dma_addr + sq->head * n->sqe_size;
        pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
        nvme_inc_sq_head(sq);

        req = QTAILQ_FIRST(&sq->req_list);
        QTAILQ_REMOVE(&sq->req_list, req, entry);
        QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
        memset(&req->cqe, 0, sizeof(req->cqe));
        req->cqe.cid = cmd.cid;

        status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
            nvme_admin_cmd(n, &cmd, req);
        if (status != NVME_NO_COMPLETE) {
            req->status = status;
            nvme_enqueue_req_completion(cq, req);
        }
    }
}

static void nvme_clear_ctrl(NvmeCtrl *n)
{
    int i;

    for (i = 0; i < n->num_queues; i++) {
        if (n->sq[i] != NULL) {
            nvme_free_sq(n->sq[i], n);
        }
    }
    for (i = 0; i < n->num_queues; i++) {
        if (n->cq[i] != NULL) {
            nvme_free_cq(n->cq[i], n);
        }
    }

    bdrv_flush(n->conf.bs);
    n->bar.cc = 0;
}

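/*
 * Enable the controller: sanity-check the admin queue addresses, page size,
 * and queue entry sizes programmed through CC/AQA/ASQ/ACQ, then bring up the
 * admin submission and completion queues.  Returns -1 on an invalid
 * configuration so the caller can set CSTS to failed.
 */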
static int nvme_start_ctrl(NvmeCtrl *n)
{
    uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
    uint32_t page_size = 1 << page_bits;

    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
            n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
            NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
            NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
            NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
            NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
            NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
            NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
            !NVME_AQA_ASQS(n->bar.aqa) || NVME_AQA_ASQS(n->bar.aqa) > 4095 ||
            !NVME_AQA_ACQS(n->bar.aqa) || NVME_AQA_ACQS(n->bar.aqa) > 4095) {
        return -1;
    }

    n->page_bits = page_bits;
    n->page_size = page_size;
    n->max_prp_ents = n->page_size / sizeof(uint64_t);
    n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
    n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
    nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
        NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
    nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
        NVME_AQA_ASQS(n->bar.aqa) + 1);

    return 0;
}

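/*
 * Handle writes to the controller register space: interrupt mask set/clear,
 * CC enable/shutdown transitions, and the admin queue attribute and address
 * registers.
 */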
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
    unsigned size)
{
    switch (offset) {
    case 0xc:
        n->bar.intms |= data & 0xffffffff;
        n->bar.intmc = n->bar.intms;
        break;
    case 0x10:
        n->bar.intms &= ~(data & 0xffffffff);
        n->bar.intmc = n->bar.intms;
        break;
    case 0x14:
        if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
            n->bar.cc = data;
            if (nvme_start_ctrl(n)) {
                n->bar.csts = NVME_CSTS_FAILED;
            } else {
                n->bar.csts = NVME_CSTS_READY;
            }
        } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
            nvme_clear_ctrl(n);
            n->bar.csts &= ~NVME_CSTS_READY;
        }
        if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
            nvme_clear_ctrl(n);
            n->bar.cc = data;
            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
        } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
            n->bar.cc = data;
        }
        break;
    case 0x24:
        n->bar.aqa = data & 0xffffffff;
        break;
    case 0x28:
        n->bar.asq = data;
        break;
    case 0x2c:
        n->bar.asq |= data << 32;
        break;
    case 0x30:
        n->bar.acq = data;
        break;
    case 0x34:
        n->bar.acq |= data << 32;
        break;
    default:
        break;
    }
}

static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    uint8_t *ptr = (uint8_t *)&n->bar;
    uint64_t val = 0;

    if (addr < sizeof(n->bar)) {
        memcpy(&val, ptr + addr, size);
    }
    return val;
}

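/*
 * Decode a doorbell write.  Doorbells start at offset 0x1000 and come in
 * pairs of 32-bit registers per queue: even registers are SQ tail doorbells,
 * odd registers are CQ head doorbells.  Out-of-range queue ids or pointer
 * values are silently ignored.
 */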
static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
{
    uint32_t qid;

    if (addr & ((1 << 2) - 1)) {
        return;
    }

    if (((addr - 0x1000) >> 2) & 1) {
        uint16_t new_head = val & 0xffff;
        int start_sqs;
        NvmeCQueue *cq;

        qid = (addr - (0x1000 + (1 << 2))) >> 3;
        if (nvme_check_cqid(n, qid)) {
            return;
        }

        cq = n->cq[qid];
        if (new_head >= cq->size) {
            return;
        }

        start_sqs = nvme_cq_full(cq) ? 1 : 0;
        cq->head = new_head;
        if (start_sqs) {
            NvmeSQueue *sq;
            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
                timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
            }
            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
        }

        if (cq->tail != cq->head) {
            nvme_isr_notify(n, cq);
        }
    } else {
        uint16_t new_tail = val & 0xffff;
        NvmeSQueue *sq;

        qid = (addr - 0x1000) >> 3;
        if (nvme_check_sqid(n, qid)) {
            return;
        }

        sq = n->sq[qid];
        if (new_tail >= sq->size) {
            return;
        }

        sq->tail = new_tail;
        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
    }
}

static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
    unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    if (addr < sizeof(n->bar)) {
        nvme_write_bar(n, addr, data, size);
    } else if (addr >= 0x1000) {
        nvme_process_db(n, addr, data);
    }
}

static const MemoryRegionOps nvme_mmio_ops = {
    .read = nvme_mmio_read,
    .write = nvme_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};

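/*
 * PCIDeviceClass init hook: validate the backing drive and serial, set up
 * the PCIe endpoint, register the MMIO BAR and MSI-X vectors, and populate
 * the Identify Controller and Identify Namespace structures for a single
 * namespace spanning the whole drive.
 */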
static int nvme_init(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeIdCtrl *id = &n->id_ctrl;

    int i;
    int64_t bs_size;
    uint8_t *pci_conf;

    if (!(n->conf.bs)) {
        return -1;
    }

    bs_size = bdrv_getlength(n->conf.bs);
    if (bs_size <= 0) {
        return -1;
    }

    blkconf_serial(&n->conf, &n->serial);
    if (!n->serial) {
        return -1;
    }

    pci_conf = pci_dev->config;
    pci_conf[PCI_INTERRUPT_PIN] = 1;
    pci_config_set_prog_interface(pci_dev->config, 0x2);
    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
    pcie_endpoint_cap_init(&n->parent_obj, 0x80);

    n->num_namespaces = 1;
    n->num_queues = 64;
    n->reg_size = 1 << qemu_fls(0x1004 + 2 * (n->num_queues + 1) * 4);
    n->ns_size = bs_size / (uint64_t)n->num_namespaces;

    n->namespaces = g_malloc0(sizeof(*n->namespaces) * n->num_namespaces);
    n->sq = g_malloc0(sizeof(*n->sq) * n->num_queues);
    n->cq = g_malloc0(sizeof(*n->cq) * n->num_queues);

    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
                          "nvme", n->reg_size);
    pci_register_bar(&n->parent_obj, 0,
        PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
        &n->iomem);
    msix_init_exclusive_bar(&n->parent_obj, n->num_queues, 4);

    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
    strpadcpy((char *)id->sn, sizeof(id->sn), n->serial, ' ');
    id->rab = 6;
    id->ieee[0] = 0x00;
    id->ieee[1] = 0x02;
    id->ieee[2] = 0xb3;
    id->oacs = cpu_to_le16(0);
    id->frmw = 7 << 1;
    id->lpa = 1 << 0;
    id->sqes = (0x6 << 4) | 0x6;
    id->cqes = (0x4 << 4) | 0x4;
    id->nn = cpu_to_le32(n->num_namespaces);
    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);

    n->bar.cap = 0;
    NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
    NVME_CAP_SET_CQR(n->bar.cap, 1);
    NVME_CAP_SET_AMS(n->bar.cap, 1);
    NVME_CAP_SET_TO(n->bar.cap, 0xf);
    NVME_CAP_SET_CSS(n->bar.cap, 1);

    n->bar.vs = 0x00010001;
    n->bar.intmc = n->bar.intms = 0;

    for (i = 0; i < n->num_namespaces; i++) {
        NvmeNamespace *ns = &n->namespaces[i];
        NvmeIdNs *id_ns = &ns->id_ns;
        id_ns->nsfeat = 0;
        id_ns->nlbaf = 0;
        id_ns->flbas = 0;
        id_ns->mc = 0;
        id_ns->dpc = 0;
        id_ns->dps = 0;
        id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
        id_ns->ncap  = id_ns->nuse = id_ns->nsze =
            cpu_to_le64(n->ns_size >>
                id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)].ds);
    }
    return 0;
}

static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);

    nvme_clear_ctrl(n);
    g_free(n->namespaces);
    g_free(n->cq);
    g_free(n->sq);
    msix_uninit_exclusive_bar(pci_dev);
    memory_region_destroy(&n->iomem);
}

static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
    DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
    DEFINE_PROP_END_OF_LIST(),
};

static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};

static void nvme_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);

    pc->init = nvme_init;
    pc->exit = nvme_exit;
    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
    pc->vendor_id = PCI_VENDOR_ID_INTEL;
    pc->device_id = 0x5845;
    pc->revision = 1;
    pc->is_express = 1;

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    dc->desc = "Non-Volatile Memory Express";
    dc->props = nvme_props;
    dc->vmsd = &nvme_vmstate;
}

static const TypeInfo nvme_info = {
    .name          = "nvme",
    .parent        = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .class_init    = nvme_class_init,
};

static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
}

type_init(nvme_register_types)