qemu/hw/block/nvme.c
   1/*
   2 * QEMU NVM Express Controller
   3 *
   4 * Copyright (c) 2012, Intel Corporation
   5 *
   6 * Written by Keith Busch <keith.busch@intel.com>
   7 *
   8 * This code is licensed under the GNU GPL v2 or later.
   9 */
  10
  11/**
  12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
  13 *
  14 *  https://nvmexpress.org/developers/nvme-specification/
  15 *
  16 *
  17 * Notes on coding style
  18 * ---------------------
  19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
   20 * NVMe subsystem uses the format from the NVMe specifications in the comments
   21 * (i.e. 'h' suffix instead of '0x' prefix).
  22 *
  23 * Usage
  24 * -----
  25 * See docs/system/nvme.rst for extensive documentation.
  26 *
  27 * Add options:
  28 *      -drive file=<file>,if=none,id=<drive_id>
  29 *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
  30 *      -device nvme,serial=<serial>,id=<bus_name>, \
  31 *              cmb_size_mb=<cmb_size_mb[optional]>, \
  32 *              [pmrdev=<mem_backend_file_id>,] \
  33 *              max_ioqpairs=<N[optional]>, \
  34 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
  35 *              mdts=<N[optional]>,vsl=<N[optional]>, \
  36 *              zoned.zasl=<N[optional]>, \
  37 *              subsys=<subsys_id>
  38 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
  39 *              zoned=<true|false[optional]>, \
  40 *              subsys=<subsys_id>,detached=<true|false[optional]>
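 *
 * For example, a minimal concrete invocation might look like the following
 * (the IDs, serial and image name are illustrative placeholders only):
 *      -drive file=nvm.img,if=none,id=nvm
 *      -device nvme-subsys,id=subsys0,nqn=subsys0
 *      -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 *      -device nvme-ns,drive=nvm,bus=nvme0,nsid=1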
  41 *
  42 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  43 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
  44 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
  45 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
  46 *
  47 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  48 * For example:
  49 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
  50 *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
  51 *
  52 * The PMR will use BAR 4/5 exclusively.
  53 *
   54 * To attach controller(s) and namespace(s) to a subsystem, provide the
   55 * nvme-subsys device as shown above.
  56 *
  57 * nvme subsystem device parameters
  58 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  59 * - `nqn`
  60 *   This parameter provides the `<nqn_id>` part of the string
  61 *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
  62 *   of subsystem controllers. Note that `<nqn_id>` should be unique per
  63 *   subsystem, but this is not enforced by QEMU. If not specified, it will
  64 *   default to the value of the `id` parameter (`<subsys_id>`).
  65 *
  66 * nvme device parameters
  67 * ~~~~~~~~~~~~~~~~~~~~~~
  68 * - `subsys`
  69 *   Specifying this parameter attaches the controller to the subsystem and
  70 *   the SUBNQN field in the controller will report the NQN of the subsystem
   71 *   device. This also enables the multi-controller capability, reported in
   72 *   the CMIC (Controller Multi-path I/O and Namespace Sharing Capabilities)
   73 *   field of the Identify Controller data structure.
  74 *
  75 * - `aerl`
  76 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
   77 *   of concurrently outstanding Asynchronous Event Request commands supported
  78 *   by the controller. This is a 0's based value.
  79 *
  80 * - `aer_max_queued`
  81 *   This is the maximum number of events that the device will enqueue for
  82 *   completion when there are no outstanding AERs. When the maximum number of
   83 *   enqueued events is reached, subsequent events will be dropped.
  84 *
  85 * - `mdts`
  86 *   Indicates the maximum data transfer size for a command that transfers data
  87 *   between host-accessible memory and the controller. The value is specified
  88 *   as a power of two (2^n) and is in units of the minimum memory page size
  89 *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
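 *   For example, with a 4 KiB CAP.MPSMIN, the default of 7 allows transfers
 *   of up to 4 KiB * 2^7 = 512 KiB.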
  90 *
  91 * - `vsl`
  92 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
  93 *   this value is specified as a power of two (2^n) and is in units of the
  94 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
  95 *   KiB).
  96 *
  97 * - `zoned.zasl`
  98 *   Indicates the maximum data transfer size for the Zone Append command. Like
  99 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
 100 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
 101 *   defaulting to the value of `mdts`).
 102 *
 103 * nvme namespace device parameters
 104 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 105 * - `shared`
 106 *   When the parent nvme device (as defined explicitly by the 'bus' parameter
 107 *   or implicitly by the most recently defined NvmeBus) is linked to an
 108 *   nvme-subsys device, the namespace will be attached to all controllers in
 109 *   the subsystem. If set to 'off' (the default), the namespace will remain a
 110 *   private namespace and may only be attached to a single controller at a
 111 *   time.
 112 *
 113 * - `detached`
 114 *   This parameter is only valid together with the `subsys` parameter. If left
 115 *   at the default value (`false/off`), the namespace will be attached to all
 116 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
  117 *   namespace will be available in the subsystem but not attached to any
 118 *   controllers.
 119 *
  120 * Setting `zoned` to true selects the Zoned Namespace Command Set for the
  121 * namespace. In this case, the following namespace properties are available
  122 * to configure zoned operation:
 123 *     zoned.zone_size=<zone size in bytes, default: 128MiB>
 124 *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
 125 *
 126 *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
 127 *         The value 0 (default) forces zone capacity to be the same as zone
 128 *         size. The value of this property may not exceed zone size.
 129 *
 130 *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
 131 *         This value needs to be specified in 64B units. If it is zero,
 132 *         namespace(s) will not support zone descriptor extensions.
 133 *
 134 *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
 135 *         The default value means there is no limit to the number of
 136 *         concurrently active zones.
 137 *
 138 *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
 139 *         The default value means there is no limit to the number of
 140 *         concurrently open zones.
 141 *
 142 *     zoned.cross_read=<enable RAZB, default: false>
 143 *         Setting this property to true enables Read Across Zone Boundaries.
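 *
 *     As an illustrative (non-default) zoned configuration, the properties
 *     above could be combined as:
 *     -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=1,zoned=true, \
 *             zoned.zone_size=64M,zoned.zone_capacity=48M, \
 *             zoned.max_open=8,zoned.max_active=16,zoned.cross_read=true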
 144 */
 145
 146#include "qemu/osdep.h"
 147#include "qemu/units.h"
 148#include "qemu/error-report.h"
 149#include "hw/block/block.h"
 150#include "hw/pci/msix.h"
 151#include "hw/pci/pci.h"
 152#include "hw/qdev-properties.h"
 153#include "migration/vmstate.h"
 154#include "sysemu/sysemu.h"
 155#include "qapi/error.h"
 156#include "qapi/visitor.h"
 157#include "sysemu/hostmem.h"
 158#include "sysemu/block-backend.h"
 159#include "exec/memory.h"
 160#include "qemu/log.h"
 161#include "qemu/module.h"
 162#include "qemu/cutils.h"
 163#include "trace.h"
 164#include "nvme.h"
 165#include "nvme-ns.h"
 166#include "nvme-dif.h"
 167
 168#define NVME_MAX_IOQPAIRS 0xffff
 169#define NVME_DB_SIZE  4
 170#define NVME_SPEC_VER 0x00010400
 171#define NVME_CMB_BIR 2
 172#define NVME_PMR_BIR 4
 173#define NVME_TEMPERATURE 0x143
 174#define NVME_TEMPERATURE_WARNING 0x157
 175#define NVME_TEMPERATURE_CRITICAL 0x175
 176#define NVME_NUM_FW_SLOTS 1
 177
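/*
 * Log a guest-triggered error: emit the given trace event and also report it
 * through qemu_log_mask(LOG_GUEST_ERROR, ...), prefixed with the trace name
 * and the calling function.
 */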
 178#define NVME_GUEST_ERR(trace, fmt, ...) \
 179    do { \
 180        (trace_##trace)(__VA_ARGS__); \
 181        qemu_log_mask(LOG_GUEST_ERROR, #trace \
 182            " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
 183    } while (0)
 184
 185static const bool nvme_feature_support[NVME_FID_MAX] = {
 186    [NVME_ARBITRATION]              = true,
 187    [NVME_POWER_MANAGEMENT]         = true,
 188    [NVME_TEMPERATURE_THRESHOLD]    = true,
 189    [NVME_ERROR_RECOVERY]           = true,
 190    [NVME_VOLATILE_WRITE_CACHE]     = true,
 191    [NVME_NUMBER_OF_QUEUES]         = true,
 192    [NVME_INTERRUPT_COALESCING]     = true,
 193    [NVME_INTERRUPT_VECTOR_CONF]    = true,
 194    [NVME_WRITE_ATOMICITY]          = true,
 195    [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
 196    [NVME_TIMESTAMP]                = true,
 197};
 198
 199static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 200    [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
 201    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
 202    [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
 203    [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
 204    [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
 205    [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
 206};
 207
 208static const uint32_t nvme_cse_acs[256] = {
 209    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
 210    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
 211    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
 212    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
 213    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
 214    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
 215    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
 216    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
 217    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
 218    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
 219    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
 220    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 221};
 222
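/* zero-initialized: no I/O commands are supported */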
 223static const uint32_t nvme_cse_iocs_none[256];
 224
 225static const uint32_t nvme_cse_iocs_nvm[256] = {
 226    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 227    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 228    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 229    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
 230    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 231    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
 232    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 233    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
 234};
 235
 236static const uint32_t nvme_cse_iocs_zoned[256] = {
 237    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 238    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 239    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 240    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
 241    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 242    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
 243    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 244    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
 245    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 246    [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 247    [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
 248};
 249
 250static void nvme_process_sq(void *opaque);
 251
 252static uint16_t nvme_sqid(NvmeRequest *req)
 253{
 254    return le16_to_cpu(req->sq->sqid);
 255}
 256
 257static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
 258                                   NvmeZoneState state)
 259{
 260    if (QTAILQ_IN_USE(zone, entry)) {
 261        switch (nvme_get_zone_state(zone)) {
 262        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
 263            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
 264            break;
 265        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
 266            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
 267            break;
 268        case NVME_ZONE_STATE_CLOSED:
 269            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
 270            break;
 271        case NVME_ZONE_STATE_FULL:
 272            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
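            /* fall through */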
 273        default:
 274            ;
 275        }
 276    }
 277
 278    nvme_set_zone_state(zone, state);
 279
 280    switch (state) {
 281    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
 282        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
 283        break;
 284    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
 285        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
 286        break;
 287    case NVME_ZONE_STATE_CLOSED:
 288        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
 289        break;
 290    case NVME_ZONE_STATE_FULL:
 291        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
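        /* fall through */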
 292    case NVME_ZONE_STATE_READ_ONLY:
 293        break;
 294    default:
 295        zone->d.za = 0;
 296    }
 297}
 298
 299/*
 300 * Check if we can open a zone without exceeding open/active limits.
 301 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
 302 */
 303static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
 304{
 305    if (ns->params.max_active_zones != 0 &&
 306        ns->nr_active_zones + act > ns->params.max_active_zones) {
 307        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
 308        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
 309    }
 310    if (ns->params.max_open_zones != 0 &&
 311        ns->nr_open_zones + opn > ns->params.max_open_zones) {
 312        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
 313        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
 314    }
 315
 316    return NVME_SUCCESS;
 317}
 318
 319static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 320{
 321    hwaddr hi, lo;
 322
 323    if (!n->cmb.cmse) {
 324        return false;
 325    }
 326
 327    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
 328    hi = lo + int128_get64(n->cmb.mem.size);
 329
 330    return addr >= lo && addr < hi;
 331}
 332
 333static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
 334{
 335    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
 336    return &n->cmb.buf[addr - base];
 337}
 338
 339static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
 340{
 341    hwaddr hi;
 342
 343    if (!n->pmr.cmse) {
 344        return false;
 345    }
 346
 347    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
 348
 349    return addr >= n->pmr.cba && addr < hi;
 350}
 351
 352static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
 353{
 354    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
 355}
 356
 357static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 358{
 359    hwaddr hi = addr + size - 1;
 360    if (hi < addr) {
 361        return 1;
 362    }
 363
 364    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
 365        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
 366        return 0;
 367    }
 368
 369    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
 370        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
 371        return 0;
 372    }
 373
 374    return pci_dma_read(&n->parent_obj, addr, buf, size);
 375}
 376
 377static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 378{
 379    hwaddr hi = addr + size - 1;
 380    if (hi < addr) {
 381        return 1;
 382    }
 383
 384    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
 385        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
 386        return 0;
 387    }
 388
 389    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
 390        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
 391        return 0;
 392    }
 393
 394    return pci_dma_write(&n->parent_obj, addr, buf, size);
 395}
 396
 397static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
 398{
 399    return nsid && (nsid == NVME_NSID_BROADCAST || nsid <= n->num_namespaces);
 400}
 401
 402static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
 403{
 404    return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
 405}
 406
 407static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
 408{
 409    return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
 410}
 411
 412static void nvme_inc_cq_tail(NvmeCQueue *cq)
 413{
 414    cq->tail++;
 415    if (cq->tail >= cq->size) {
 416        cq->tail = 0;
 417        cq->phase = !cq->phase;
 418    }
 419}
 420
 421static void nvme_inc_sq_head(NvmeSQueue *sq)
 422{
 423    sq->head = (sq->head + 1) % sq->size;
 424}
 425
 426static uint8_t nvme_cq_full(NvmeCQueue *cq)
 427{
 428    return (cq->tail + 1) % cq->size == cq->head;
 429}
 430
 431static uint8_t nvme_sq_empty(NvmeSQueue *sq)
 432{
 433    return sq->head == sq->tail;
 434}
 435
 436static void nvme_irq_check(NvmeCtrl *n)
 437{
 438    if (msix_enabled(&(n->parent_obj))) {
 439        return;
 440    }
 441    if (~n->bar.intms & n->irq_status) {
 442        pci_irq_assert(&n->parent_obj);
 443    } else {
 444        pci_irq_deassert(&n->parent_obj);
 445    }
 446}
 447
 448static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
 449{
 450    if (cq->irq_enabled) {
 451        if (msix_enabled(&(n->parent_obj))) {
 452            trace_pci_nvme_irq_msix(cq->vector);
 453            msix_notify(&(n->parent_obj), cq->vector);
 454        } else {
 455            trace_pci_nvme_irq_pin();
 456            assert(cq->vector < 32);
 457            n->irq_status |= 1 << cq->vector;
 458            nvme_irq_check(n);
 459        }
 460    } else {
 461        trace_pci_nvme_irq_masked();
 462    }
 463}
 464
 465static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
 466{
 467    if (cq->irq_enabled) {
 468        if (msix_enabled(&(n->parent_obj))) {
 469            return;
 470        } else {
 471            assert(cq->vector < 32);
 472            if (!n->cq_pending) {
 473                n->irq_status &= ~(1 << cq->vector);
 474            }
 475            nvme_irq_check(n);
 476        }
 477    }
 478}
 479
 480static void nvme_req_clear(NvmeRequest *req)
 481{
 482    req->ns = NULL;
 483    req->opaque = NULL;
 484    req->aiocb = NULL;
 485    memset(&req->cqe, 0x0, sizeof(req->cqe));
 486    req->status = NVME_SUCCESS;
 487}
 488
 489static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
 490{
 491    if (dma) {
 492        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
 493        sg->flags = NVME_SG_DMA;
 494    } else {
 495        qemu_iovec_init(&sg->iov, 0);
 496    }
 497
 498    sg->flags |= NVME_SG_ALLOC;
 499}
 500
 501static inline void nvme_sg_unmap(NvmeSg *sg)
 502{
 503    if (!(sg->flags & NVME_SG_ALLOC)) {
 504        return;
 505    }
 506
 507    if (sg->flags & NVME_SG_DMA) {
 508        qemu_sglist_destroy(&sg->qsg);
 509    } else {
 510        qemu_iovec_destroy(&sg->iov);
 511    }
 512
 513    memset(sg, 0x0, sizeof(*sg));
 514}
 515
 516/*
  517 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 518 * holds both data and metadata. This function splits the data and metadata
 519 * into two separate QSG/IOVs.
 520 */
 521static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
 522                          NvmeSg *mdata)
 523{
 524    NvmeSg *dst = data;
 525    size_t size = nvme_lsize(ns);
 526    size_t msize = nvme_msize(ns);
 527    uint32_t trans_len, count = size;
 528    uint64_t offset = 0;
 529    bool dma = sg->flags & NVME_SG_DMA;
 530    size_t sge_len;
 531    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
 532    int sg_idx = 0;
 533
 534    assert(sg->flags & NVME_SG_ALLOC);
 535
 536    while (sg_len) {
 537        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
 538
 539        trans_len = MIN(sg_len, count);
 540        trans_len = MIN(trans_len, sge_len - offset);
 541
 542        if (dst) {
 543            if (dma) {
 544                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
 545                                trans_len);
 546            } else {
 547                qemu_iovec_add(&dst->iov,
 548                               sg->iov.iov[sg_idx].iov_base + offset,
 549                               trans_len);
 550            }
 551        }
 552
 553        sg_len -= trans_len;
 554        count -= trans_len;
 555        offset += trans_len;
 556
 557        if (count == 0) {
 558            dst = (dst == data) ? mdata : data;
 559            count = (dst == data) ? size : msize;
 560        }
 561
 562        if (sge_len == offset) {
 563            offset = 0;
 564            sg_idx++;
 565        }
 566    }
 567}
 568
 569static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
 570                                  size_t len)
 571{
 572    if (!len) {
 573        return NVME_SUCCESS;
 574    }
 575
 576    trace_pci_nvme_map_addr_cmb(addr, len);
 577
 578    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
 579        return NVME_DATA_TRAS_ERROR;
 580    }
 581
 582    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
 583
 584    return NVME_SUCCESS;
 585}
 586
 587static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
 588    size_t len)
 589{
 590    if (!len) {
 591        return NVME_SUCCESS;
 592    }
 593
 594    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
 595        return NVME_DATA_TRAS_ERROR;
 596    }
 597
 598    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
 599
 600    return NVME_SUCCESS;
 601}
 602
 603static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
 604{
 605    bool cmb = false, pmr = false;
 606
 607    if (!len) {
 608        return NVME_SUCCESS;
 609    }
 610
 611    trace_pci_nvme_map_addr(addr, len);
 612
 613    if (nvme_addr_is_cmb(n, addr)) {
 614        cmb = true;
 615    } else if (nvme_addr_is_pmr(n, addr)) {
 616        pmr = true;
 617    }
 618
 619    if (cmb || pmr) {
 620        if (sg->flags & NVME_SG_DMA) {
 621            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
 622        }
 623
 624        if (cmb) {
 625            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
 626        } else {
 627            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
 628        }
 629    }
 630
 631    if (!(sg->flags & NVME_SG_DMA)) {
 632        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
 633    }
 634
 635    qemu_sglist_add(&sg->qsg, addr, len);
 636
 637    return NVME_SUCCESS;
 638}
 639
 640static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
 641{
 642    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
 643}
 644
 645static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
 646                             uint64_t prp2, uint32_t len)
 647{
 648    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
 649    trans_len = MIN(len, trans_len);
 650    int num_prps = (len >> n->page_bits) + 1;
 651    uint16_t status;
 652    int ret;
 653
 654    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
 655
 656    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
 657
 658    status = nvme_map_addr(n, sg, prp1, trans_len);
 659    if (status) {
 660        goto unmap;
 661    }
 662
 663    len -= trans_len;
 664    if (len) {
 665        if (len > n->page_size) {
 666            uint64_t prp_list[n->max_prp_ents];
 667            uint32_t nents, prp_trans;
 668            int i = 0;
 669
 670            /*
  671             * The first PRP list entry, pointed to by PRP2, may contain an
  672             * offset. Hence, we need to calculate the number of entries based
  673             * on that offset.
 674             */
 675            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
 676            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
 677            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
 678            if (ret) {
 679                trace_pci_nvme_err_addr_read(prp2);
 680                status = NVME_DATA_TRAS_ERROR;
 681                goto unmap;
 682            }
 683            while (len != 0) {
 684                uint64_t prp_ent = le64_to_cpu(prp_list[i]);
 685
 686                if (i == nents - 1 && len > n->page_size) {
 687                    if (unlikely(prp_ent & (n->page_size - 1))) {
 688                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
 689                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
 690                        goto unmap;
 691                    }
 692
 693                    i = 0;
 694                    nents = (len + n->page_size - 1) >> n->page_bits;
 695                    nents = MIN(nents, n->max_prp_ents);
 696                    prp_trans = nents * sizeof(uint64_t);
 697                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
 698                                         prp_trans);
 699                    if (ret) {
 700                        trace_pci_nvme_err_addr_read(prp_ent);
 701                        status = NVME_DATA_TRAS_ERROR;
 702                        goto unmap;
 703                    }
 704                    prp_ent = le64_to_cpu(prp_list[i]);
 705                }
 706
 707                if (unlikely(prp_ent & (n->page_size - 1))) {
 708                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
 709                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
 710                    goto unmap;
 711                }
 712
 713                trans_len = MIN(len, n->page_size);
 714                status = nvme_map_addr(n, sg, prp_ent, trans_len);
 715                if (status) {
 716                    goto unmap;
 717                }
 718
 719                len -= trans_len;
 720                i++;
 721            }
 722        } else {
 723            if (unlikely(prp2 & (n->page_size - 1))) {
 724                trace_pci_nvme_err_invalid_prp2_align(prp2);
 725                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
 726                goto unmap;
 727            }
 728            status = nvme_map_addr(n, sg, prp2, len);
 729            if (status) {
 730                goto unmap;
 731            }
 732        }
 733    }
 734
 735    return NVME_SUCCESS;
 736
 737unmap:
 738    nvme_sg_unmap(sg);
 739    return status;
 740}
 741
 742/*
  743 * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
  744 * number of bytes mapped from *len.
 745 */
 746static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
 747                                  NvmeSglDescriptor *segment, uint64_t nsgld,
 748                                  size_t *len, NvmeCmd *cmd)
 749{
 750    dma_addr_t addr, trans_len;
 751    uint32_t dlen;
 752    uint16_t status;
 753
 754    for (int i = 0; i < nsgld; i++) {
 755        uint8_t type = NVME_SGL_TYPE(segment[i].type);
 756
 757        switch (type) {
 758        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
 759            if (cmd->opcode == NVME_CMD_WRITE) {
 760                continue;
 761            }
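            /* fall through */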
 762        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
 763            break;
 764        case NVME_SGL_DESCR_TYPE_SEGMENT:
 765        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
 766            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
 767        default:
 768            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
 769        }
 770
 771        dlen = le32_to_cpu(segment[i].len);
 772
 773        if (!dlen) {
 774            continue;
 775        }
 776
 777        if (*len == 0) {
 778            /*
 779             * All data has been mapped, but the SGL contains additional
 780             * segments and/or descriptors. The controller might accept
 781             * ignoring the rest of the SGL.
 782             */
 783            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
 784            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
 785                break;
 786            }
 787
 788            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
 789            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
 790        }
 791
 792        trans_len = MIN(*len, dlen);
 793
 794        if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
 795            goto next;
 796        }
 797
 798        addr = le64_to_cpu(segment[i].addr);
 799
 800        if (UINT64_MAX - addr < dlen) {
 801            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
 802        }
 803
 804        status = nvme_map_addr(n, sg, addr, trans_len);
 805        if (status) {
 806            return status;
 807        }
 808
 809next:
 810        *len -= trans_len;
 811    }
 812
 813    return NVME_SUCCESS;
 814}
 815
 816static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
 817                             size_t len, NvmeCmd *cmd)
 818{
 819    /*
 820     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
 821     * dynamically allocating a potentially huge SGL. The spec allows the SGL
 822     * to be larger (as in number of bytes required to describe the SGL
 823     * descriptors and segment chain) than the command transfer size, so it is
 824     * not bounded by MDTS.
 825     */
 826    const int SEG_CHUNK_SIZE = 256;
 827
 828    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
 829    uint64_t nsgld;
 830    uint32_t seg_len;
 831    uint16_t status;
 832    hwaddr addr;
 833    int ret;
 834
 835    sgld = &sgl;
 836    addr = le64_to_cpu(sgl.addr);
 837
 838    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
 839
 840    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
 841
 842    /*
 843     * If the entire transfer can be described with a single data block it can
 844     * be mapped directly.
 845     */
 846    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
 847        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
 848        if (status) {
 849            goto unmap;
 850        }
 851
 852        goto out;
 853    }
 854
 855    for (;;) {
 856        switch (NVME_SGL_TYPE(sgld->type)) {
 857        case NVME_SGL_DESCR_TYPE_SEGMENT:
 858        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
 859            break;
 860        default:
 861            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
 862        }
 863
 864        seg_len = le32_to_cpu(sgld->len);
 865
 866        /* check the length of the (Last) Segment descriptor */
 867        if ((!seg_len || seg_len & 0xf) &&
 868            (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
 869            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
 870        }
 871
 872        if (UINT64_MAX - addr < seg_len) {
 873            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
 874        }
 875
 876        nsgld = seg_len / sizeof(NvmeSglDescriptor);
 877
 878        while (nsgld > SEG_CHUNK_SIZE) {
 879            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
 880                trace_pci_nvme_err_addr_read(addr);
 881                status = NVME_DATA_TRAS_ERROR;
 882                goto unmap;
 883            }
 884
 885            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
 886                                       &len, cmd);
 887            if (status) {
 888                goto unmap;
 889            }
 890
 891            nsgld -= SEG_CHUNK_SIZE;
 892            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
 893        }
 894
 895        ret = nvme_addr_read(n, addr, segment, nsgld *
 896                             sizeof(NvmeSglDescriptor));
 897        if (ret) {
 898            trace_pci_nvme_err_addr_read(addr);
 899            status = NVME_DATA_TRAS_ERROR;
 900            goto unmap;
 901        }
 902
 903        last_sgld = &segment[nsgld - 1];
 904
 905        /*
 906         * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
 907         * then we are done.
 908         */
 909        switch (NVME_SGL_TYPE(last_sgld->type)) {
 910        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
 911        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
 912            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
 913            if (status) {
 914                goto unmap;
 915            }
 916
 917            goto out;
 918
 919        default:
 920            break;
 921        }
 922
 923        /*
 924         * If the last descriptor was not a Data Block or Bit Bucket, then the
 925         * current segment must not be a Last Segment.
 926         */
 927        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
 928            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
 929            goto unmap;
 930        }
 931
 932        sgld = last_sgld;
 933        addr = le64_to_cpu(sgld->addr);
 934
 935        /*
 936         * Do not map the last descriptor; it will be a Segment or Last Segment
 937         * descriptor and is handled by the next iteration.
 938         */
 939        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
 940        if (status) {
 941            goto unmap;
 942        }
 943    }
 944
 945out:
 946    /* if there is any residual left in len, the SGL was too short */
 947    if (len) {
 948        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
 949        goto unmap;
 950    }
 951
 952    return NVME_SUCCESS;
 953
 954unmap:
 955    nvme_sg_unmap(sg);
 956    return status;
 957}
 958
 959uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
 960                       NvmeCmd *cmd)
 961{
 962    uint64_t prp1, prp2;
 963
 964    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
 965    case NVME_PSDT_PRP:
 966        prp1 = le64_to_cpu(cmd->dptr.prp1);
 967        prp2 = le64_to_cpu(cmd->dptr.prp2);
 968
 969        return nvme_map_prp(n, sg, prp1, prp2, len);
 970    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
 971    case NVME_PSDT_SGL_MPTR_SGL:
 972        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
 973    default:
 974        return NVME_INVALID_FIELD;
 975    }
 976}
 977
 978static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
 979                              NvmeCmd *cmd)
 980{
 981    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
 982    hwaddr mptr = le64_to_cpu(cmd->mptr);
 983    uint16_t status;
 984
 985    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
 986        NvmeSglDescriptor sgl;
 987
 988        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
 989            return NVME_DATA_TRAS_ERROR;
 990        }
 991
 992        status = nvme_map_sgl(n, sg, sgl, len, cmd);
 993        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
 994            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
 995        }
 996
 997        return status;
 998    }
 999
1000    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1001    status = nvme_map_addr(n, sg, mptr, len);
1002    if (status) {
1003        nvme_sg_unmap(sg);
1004    }
1005
1006    return status;
1007}
1008
1009static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1010{
1011    NvmeNamespace *ns = req->ns;
1012    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1013    uint16_t ctrl = le16_to_cpu(rw->control);
1014    size_t len = nvme_l2b(ns, nlb);
1015    uint16_t status;
1016
1017    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
1018        (ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
1019        goto out;
1020    }
1021
1022    if (nvme_ns_ext(ns)) {
1023        NvmeSg sg;
1024
1025        len += nvme_m2b(ns, nlb);
1026
1027        status = nvme_map_dptr(n, &sg, len, &req->cmd);
1028        if (status) {
1029            return status;
1030        }
1031
1032        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1033        nvme_sg_split(&sg, ns, &req->sg, NULL);
1034        nvme_sg_unmap(&sg);
1035
1036        return NVME_SUCCESS;
1037    }
1038
1039out:
1040    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1041}
1042
1043static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1044{
1045    NvmeNamespace *ns = req->ns;
1046    size_t len = nvme_m2b(ns, nlb);
1047    uint16_t status;
1048
1049    if (nvme_ns_ext(ns)) {
1050        NvmeSg sg;
1051
1052        len += nvme_l2b(ns, nlb);
1053
1054        status = nvme_map_dptr(n, &sg, len, &req->cmd);
1055        if (status) {
1056            return status;
1057        }
1058
1059        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1060        nvme_sg_split(&sg, ns, NULL, &req->sg);
1061        nvme_sg_unmap(&sg);
1062
1063        return NVME_SUCCESS;
1064    }
1065
1066    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1067}
1068
1069static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1070                                    uint32_t len, uint32_t bytes,
1071                                    int32_t skip_bytes, int64_t offset,
1072                                    NvmeTxDirection dir)
1073{
1074    hwaddr addr;
1075    uint32_t trans_len, count = bytes;
1076    bool dma = sg->flags & NVME_SG_DMA;
1077    int64_t sge_len;
1078    int sg_idx = 0;
1079    int ret;
1080
1081    assert(sg->flags & NVME_SG_ALLOC);
1082
1083    while (len) {
1084        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1085
1086        if (sge_len - offset < 0) {
1087            offset -= sge_len;
1088            sg_idx++;
1089            continue;
1090        }
1091
1092        if (sge_len == offset) {
1093            offset = 0;
1094            sg_idx++;
1095            continue;
1096        }
1097
1098        trans_len = MIN(len, count);
1099        trans_len = MIN(trans_len, sge_len - offset);
1100
1101        if (dma) {
1102            addr = sg->qsg.sg[sg_idx].base + offset;
1103        } else {
1104            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1105        }
1106
1107        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1108            ret = nvme_addr_read(n, addr, ptr, trans_len);
1109        } else {
1110            ret = nvme_addr_write(n, addr, ptr, trans_len);
1111        }
1112
1113        if (ret) {
1114            return NVME_DATA_TRAS_ERROR;
1115        }
1116
1117        ptr += trans_len;
1118        len -= trans_len;
1119        count -= trans_len;
1120        offset += trans_len;
1121
1122        if (count == 0) {
1123            count = bytes;
1124            offset += skip_bytes;
1125        }
1126    }
1127
1128    return NVME_SUCCESS;
1129}
1130
1131static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
1132                        NvmeTxDirection dir)
1133{
1134    assert(sg->flags & NVME_SG_ALLOC);
1135
1136    if (sg->flags & NVME_SG_DMA) {
1137        uint64_t residual;
1138
1139        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1140            residual = dma_buf_write(ptr, len, &sg->qsg);
1141        } else {
1142            residual = dma_buf_read(ptr, len, &sg->qsg);
1143        }
1144
1145        if (unlikely(residual)) {
1146            trace_pci_nvme_err_invalid_dma();
1147            return NVME_INVALID_FIELD | NVME_DNR;
1148        }
1149    } else {
1150        size_t bytes;
1151
1152        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1153            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1154        } else {
1155            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1156        }
1157
1158        if (unlikely(bytes != len)) {
1159            trace_pci_nvme_err_invalid_dma();
1160            return NVME_INVALID_FIELD | NVME_DNR;
1161        }
1162    }
1163
1164    return NVME_SUCCESS;
1165}
1166
1167static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1168                                NvmeRequest *req)
1169{
1170    uint16_t status;
1171
1172    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1173    if (status) {
1174        return status;
1175    }
1176
1177    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1178}
1179
1180static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1181                                NvmeRequest *req)
1182{
1183    uint16_t status;
1184
1185    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1186    if (status) {
1187        return status;
1188    }
1189
1190    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1191}
1192
1193uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1194                          NvmeTxDirection dir, NvmeRequest *req)
1195{
1196    NvmeNamespace *ns = req->ns;
1197    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1198    uint16_t ctrl = le16_to_cpu(rw->control);
1199
1200    if (nvme_ns_ext(ns) &&
1201        !(ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
1202        size_t lsize = nvme_lsize(ns);
1203        size_t msize = nvme_msize(ns);
1204
1205        return nvme_tx_interleaved(n, &req->sg, ptr, len, lsize, msize, 0,
1206                                   dir);
1207    }
1208
1209    return nvme_tx(n, &req->sg, ptr, len, dir);
1210}
1211
1212uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1213                           NvmeTxDirection dir, NvmeRequest *req)
1214{
1215    NvmeNamespace *ns = req->ns;
1216    uint16_t status;
1217
1218    if (nvme_ns_ext(ns)) {
1219        size_t lsize = nvme_lsize(ns);
1220        size_t msize = nvme_msize(ns);
1221
1222        return nvme_tx_interleaved(n, &req->sg, ptr, len, msize, lsize, lsize,
1223                                   dir);
1224    }
1225
1226    nvme_sg_unmap(&req->sg);
1227
1228    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1229    if (status) {
1230        return status;
1231    }
1232
1233    return nvme_tx(n, &req->sg, ptr, len, dir);
1234}
1235
1236static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1237                                 BlockCompletionFunc *cb, NvmeRequest *req)
1238{
1239    assert(req->sg.flags & NVME_SG_ALLOC);
1240
1241    if (req->sg.flags & NVME_SG_DMA) {
1242        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1243                                  cb, req);
1244    } else {
1245        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1246    }
1247}
1248
1249static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1250                                  BlockCompletionFunc *cb, NvmeRequest *req)
1251{
1252    assert(req->sg.flags & NVME_SG_ALLOC);
1253
1254    if (req->sg.flags & NVME_SG_DMA) {
1255        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1256                                   cb, req);
1257    } else {
1258        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1259    }
1260}
1261
1262static void nvme_post_cqes(void *opaque)
1263{
1264    NvmeCQueue *cq = opaque;
1265    NvmeCtrl *n = cq->ctrl;
1266    NvmeRequest *req, *next;
1267    bool pending = cq->head != cq->tail;
1268    int ret;
1269
1270    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1271        NvmeSQueue *sq;
1272        hwaddr addr;
1273
1274        if (nvme_cq_full(cq)) {
1275            break;
1276        }
1277
1278        sq = req->sq;
1279        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1280        req->cqe.sq_id = cpu_to_le16(sq->sqid);
1281        req->cqe.sq_head = cpu_to_le16(sq->head);
1282        addr = cq->dma_addr + cq->tail * n->cqe_size;
1283        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1284                            sizeof(req->cqe));
1285        if (ret) {
1286            trace_pci_nvme_err_addr_write(addr);
1287            trace_pci_nvme_err_cfs();
1288            n->bar.csts = NVME_CSTS_FAILED;
1289            break;
1290        }
1291        QTAILQ_REMOVE(&cq->req_list, req, entry);
1292        nvme_inc_cq_tail(cq);
1293        nvme_sg_unmap(&req->sg);
1294        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1295    }
1296    if (cq->tail != cq->head) {
1297        if (cq->irq_enabled && !pending) {
1298            n->cq_pending++;
1299        }
1300
1301        nvme_irq_assert(n, cq);
1302    }
1303}
1304
1305static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1306{
1307    assert(cq->cqid == req->sq->cqid);
1308    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1309                                          req->status);
1310
1311    if (req->status) {
1312        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1313                                      req->status, req->cmd.opcode);
1314    }
1315
1316    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1317    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
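    /* defer CQE posting; the cq timer fires nvme_post_cqes() in 500 ns */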
1318    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1319}
1320
1321static void nvme_process_aers(void *opaque)
1322{
1323    NvmeCtrl *n = opaque;
1324    NvmeAsyncEvent *event, *next;
1325
1326    trace_pci_nvme_process_aers(n->aer_queued);
1327
1328    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1329        NvmeRequest *req;
1330        NvmeAerResult *result;
1331
1332        /* can't post cqe if there is nothing to complete */
1333        if (!n->outstanding_aers) {
1334            trace_pci_nvme_no_outstanding_aers();
1335            break;
1336        }
1337
1338        /* ignore if masked (cqe posted, but event not cleared) */
1339        if (n->aer_mask & (1 << event->result.event_type)) {
1340            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1341            continue;
1342        }
1343
1344        QTAILQ_REMOVE(&n->aer_queue, event, entry);
1345        n->aer_queued--;
1346
1347        n->aer_mask |= 1 << event->result.event_type;
1348        n->outstanding_aers--;
1349
1350        req = n->aer_reqs[n->outstanding_aers];
1351
1352        result = (NvmeAerResult *) &req->cqe.result;
1353        result->event_type = event->result.event_type;
1354        result->event_info = event->result.event_info;
1355        result->log_page = event->result.log_page;
1356        g_free(event);
1357
1358        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1359                                    result->log_page);
1360
1361        nvme_enqueue_req_completion(&n->admin_cq, req);
1362    }
1363}
1364
1365static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1366                               uint8_t event_info, uint8_t log_page)
1367{
1368    NvmeAsyncEvent *event;
1369
1370    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1371
1372    if (n->aer_queued == n->params.aer_max_queued) {
1373        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1374        return;
1375    }
1376
1377    event = g_new(NvmeAsyncEvent, 1);
1378    event->result = (NvmeAerResult) {
1379        .event_type = event_type,
1380        .event_info = event_info,
1381        .log_page   = log_page,
1382    };
1383
1384    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1385    n->aer_queued++;
1386
1387    nvme_process_aers(n);
1388}
1389
1390static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1391{
1392    uint8_t aer_info;
1393
 1394    /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1395    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1396        return;
1397    }
1398
1399    switch (event) {
1400    case NVME_SMART_SPARE:
1401        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1402        break;
1403    case NVME_SMART_TEMPERATURE:
1404        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1405        break;
1406    case NVME_SMART_RELIABILITY:
1407    case NVME_SMART_MEDIA_READ_ONLY:
1408    case NVME_SMART_FAILED_VOLATILE_MEDIA:
1409    case NVME_SMART_PMR_UNRELIABLE:
1410        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1411        break;
1412    default:
1413        return;
1414    }
1415
1416    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1417}
1418
1419static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1420{
1421    n->aer_mask &= ~(1 << event_type);
1422    if (!QTAILQ_EMPTY(&n->aer_queue)) {
1423        nvme_process_aers(n);
1424    }
1425}
1426
1427static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1428{
1429    uint8_t mdts = n->params.mdts;
1430
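    /* MDTS is a power of two in units of the minimum memory page size */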
1431    if (mdts && len > n->page_size << mdts) {
1432        trace_pci_nvme_err_mdts(len);
1433        return NVME_INVALID_FIELD | NVME_DNR;
1434    }
1435
1436    return NVME_SUCCESS;
1437}
1438
1439static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1440                                         uint32_t nlb)
1441{
1442    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1443
1444    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1445        return NVME_LBA_RANGE | NVME_DNR;
1446    }
1447
1448    return NVME_SUCCESS;
1449}
1450
1451static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1452                                 uint32_t nlb)
1453{
1454    BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1455
1456    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1457    int64_t offset = nvme_l2b(ns, slba);
1458    bool zeroed;
1459    int ret;
1460
1461    Error *local_err = NULL;
1462
1463    /*
 1464     * `pnum` holds the number of bytes after offset that share the same
1465     * allocation status as the byte at offset. If `pnum` is different from
1466     * `bytes`, we should check the allocation status of the next range and
1467     * continue this until all bytes have been checked.
1468     */
1469    do {
1470        bytes -= pnum;
1471
1472        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1473        if (ret < 0) {
1474            error_setg_errno(&local_err, -ret, "unable to get block status");
1475            error_report_err(local_err);
1476
1477            return NVME_INTERNAL_DEV_ERROR;
1478        }
1479
1480        zeroed = !!(ret & BDRV_BLOCK_ZERO);
1481
1482        trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
1483
1484        if (zeroed) {
1485            return NVME_DULB;
1486        }
1487
1488        offset += pnum;
1489    } while (pnum != bytes);
1490
1491    return NVME_SUCCESS;
1492}
1493
1494static void nvme_aio_err(NvmeRequest *req, int ret)
1495{
1496    uint16_t status = NVME_SUCCESS;
1497    Error *local_err = NULL;
1498
1499    switch (req->cmd.opcode) {
1500    case NVME_CMD_READ:
1501        status = NVME_UNRECOVERED_READ;
1502        break;
1503    case NVME_CMD_FLUSH:
1504    case NVME_CMD_WRITE:
1505    case NVME_CMD_WRITE_ZEROES:
1506    case NVME_CMD_ZONE_APPEND:
1507        status = NVME_WRITE_FAULT;
1508        break;
1509    default:
1510        status = NVME_INTERNAL_DEV_ERROR;
1511        break;
1512    }
1513
1514    trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1515
1516    error_setg_errno(&local_err, -ret, "aio failed");
1517    error_report_err(local_err);
1518
1519    /*
1520     * Set the command status code to the first encountered error but allow a
1521     * subsequent Internal Device Error to trump it.
1522     */
1523    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1524        return;
1525    }
1526
1527    req->status = status;
1528}
1529
1530static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1531{
1532    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1533                                    slba / ns->zone_size;
1534}
1535
1536static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1537{
1538    uint32_t zone_idx = nvme_zone_idx(ns, slba);
1539
1540    assert(zone_idx < ns->num_zones);
1541    return &ns->zone_array[zone_idx];
1542}
1543
1544static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1545{
1546    uint64_t zslba = zone->d.zslba;
1547
1548    switch (nvme_get_zone_state(zone)) {
1549    case NVME_ZONE_STATE_EMPTY:
1550    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1551    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1552    case NVME_ZONE_STATE_CLOSED:
1553        return NVME_SUCCESS;
1554    case NVME_ZONE_STATE_FULL:
1555        trace_pci_nvme_err_zone_is_full(zslba);
1556        return NVME_ZONE_FULL;
1557    case NVME_ZONE_STATE_OFFLINE:
1558        trace_pci_nvme_err_zone_is_offline(zslba);
1559        return NVME_ZONE_OFFLINE;
1560    case NVME_ZONE_STATE_READ_ONLY:
1561        trace_pci_nvme_err_zone_is_read_only(zslba);
1562        return NVME_ZONE_READ_ONLY;
1563    default:
1564        assert(false);
1565    }
1566
1567    return NVME_INTERNAL_DEV_ERROR;
1568}
1569
1570static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1571                                      uint64_t slba, uint32_t nlb)
1572{
1573    uint64_t zcap = nvme_zone_wr_boundary(zone);
1574    uint16_t status;
1575
1576    status = nvme_check_zone_state_for_write(zone);
1577    if (status) {
1578        return status;
1579    }
1580
1581    if (unlikely(slba != zone->w_ptr)) {
1582        trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1583        return NVME_ZONE_INVALID_WRITE;
1584    }
1585
1586    if (unlikely((slba + nlb) > zcap)) {
1587        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1588        return NVME_ZONE_BOUNDARY_ERROR;
1589    }
1590
1591    return NVME_SUCCESS;
1592}
1593
1594static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1595{
1596    switch (nvme_get_zone_state(zone)) {
1597    case NVME_ZONE_STATE_EMPTY:
1598    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1599    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1600    case NVME_ZONE_STATE_FULL:
1601    case NVME_ZONE_STATE_CLOSED:
1602    case NVME_ZONE_STATE_READ_ONLY:
1603        return NVME_SUCCESS;
1604    case NVME_ZONE_STATE_OFFLINE:
1605        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1606        return NVME_ZONE_OFFLINE;
1607    default:
1608        assert(false);
1609    }
1610
1611    return NVME_INTERNAL_DEV_ERROR;
1612}
1613
1614static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1615                                     uint32_t nlb)
1616{
1617    NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
1618    uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
1619    uint64_t end = slba + nlb;
1620    uint16_t status;
1621
1622    status = nvme_check_zone_state_for_read(zone);
1623    if (status) {
1624        ;
1625    } else if (unlikely(end > bndry)) {
1626        if (!ns->params.cross_zone_read) {
1627            status = NVME_ZONE_BOUNDARY_ERROR;
1628        } else {
1629            /*
1630             * Read across zone boundary - check that all subsequent
1631             * zones that are being read have an appropriate state.
1632             */
1633            do {
1634                zone++;
1635                status = nvme_check_zone_state_for_read(zone);
1636                if (status) {
1637                    break;
1638                }
1639            } while (end > nvme_zone_rd_boundary(ns, zone));
1640        }
1641    }
1642
1643    return status;
1644}
1645
1646static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1647{
1648    switch (nvme_get_zone_state(zone)) {
1649    case NVME_ZONE_STATE_FULL:
1650        return NVME_SUCCESS;
1651
1652    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1653    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1654        nvme_aor_dec_open(ns);
1655        /* fallthrough */
1656    case NVME_ZONE_STATE_CLOSED:
1657        nvme_aor_dec_active(ns);
1658        /* fallthrough */
1659    case NVME_ZONE_STATE_EMPTY:
1660        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1661        return NVME_SUCCESS;
1662
1663    default:
1664        return NVME_ZONE_INVAL_TRANSITION;
1665    }
1666}
1667
1668static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1669{
1670    switch (nvme_get_zone_state(zone)) {
1671    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1672    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1673        nvme_aor_dec_open(ns);
1674        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1675        /* fall through */
1676    case NVME_ZONE_STATE_CLOSED:
1677        return NVME_SUCCESS;
1678
1679    default:
1680        return NVME_ZONE_INVAL_TRANSITION;
1681    }
1682}
1683
1684static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1685{
1686    NvmeZone *zone;
1687
1688    if (ns->params.max_open_zones &&
1689        ns->nr_open_zones == ns->params.max_open_zones) {
1690        zone = QTAILQ_FIRST(&ns->imp_open_zones);
1691        if (zone) {
1692            /*
1693             * Automatically close this implicitly open zone.
1694             */
1695            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1696            nvme_zrm_close(ns, zone);
1697        }
1698    }
1699}
1700
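    /*
     * Zone Resource Management open helper: transitions an Empty or Closed
     * zone to Implicitly or Explicitly Opened (and promotes an implicitly
     * open zone on an explicit open). Opening from Empty also activates the
     * zone, so the active/open resource limits are checked via
     * nvme_aor_check(); if the open limit is reached, an implicitly open
     * zone is auto-closed first by nvme_zrm_auto_transition_zone().
     */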
1701static uint16_t __nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone,
1702                                bool implicit)
1703{
1704    int act = 0;
1705    uint16_t status;
1706
1707    switch (nvme_get_zone_state(zone)) {
1708    case NVME_ZONE_STATE_EMPTY:
1709        act = 1;
1710
1711        /* fallthrough */
1712
1713    case NVME_ZONE_STATE_CLOSED:
1714        nvme_zrm_auto_transition_zone(ns);
1715        status = nvme_aor_check(ns, act, 1);
1716        if (status) {
1717            return status;
1718        }
1719
1720        if (act) {
1721            nvme_aor_inc_active(ns);
1722        }
1723
1724        nvme_aor_inc_open(ns);
1725
1726        if (implicit) {
1727            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1728            return NVME_SUCCESS;
1729        }
1730
1731        /* fallthrough */
1732
1733    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1734        if (implicit) {
1735            return NVME_SUCCESS;
1736        }
1737
1738        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1739
1740        /* fallthrough */
1741
1742    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1743        return NVME_SUCCESS;
1744
1745    default:
1746        return NVME_ZONE_INVAL_TRANSITION;
1747    }
1748}
1749
1750static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone)
1751{
1752    return __nvme_zrm_open(ns, zone, true);
1753}
1754
1755static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone)
1756{
1757    return __nvme_zrm_open(ns, zone, false);
1758}
1759
1760static void __nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1761                                   uint32_t nlb)
1762{
1763    zone->d.wp += nlb;
1764
1765    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1766        nvme_zrm_finish(ns, zone);
1767    }
1768}
1769
1770static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1771{
1772    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1773    NvmeZone *zone;
1774    uint64_t slba;
1775    uint32_t nlb;
1776
1777    slba = le64_to_cpu(rw->slba);
1778    nlb = le16_to_cpu(rw->nlb) + 1;
1779    zone = nvme_get_zone_by_slba(ns, slba);
1780
1781    __nvme_advance_zone_wp(ns, zone, nlb);
1782}
1783
1784static inline bool nvme_is_write(NvmeRequest *req)
1785{
1786    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1787
1788    return rw->opcode == NVME_CMD_WRITE ||
1789           rw->opcode == NVME_CMD_ZONE_APPEND ||
1790           rw->opcode == NVME_CMD_WRITE_ZEROES;
1791}
1792
1793static void nvme_misc_cb(void *opaque, int ret)
1794{
1795    NvmeRequest *req = opaque;
1796    NvmeNamespace *ns = req->ns;
1797
1798    BlockBackend *blk = ns->blkconf.blk;
1799    BlockAcctCookie *acct = &req->acct;
1800    BlockAcctStats *stats = blk_get_stats(blk);
1801
1802    trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk));
1803
1804    if (ret) {
1805        block_acct_failed(stats, acct);
1806        nvme_aio_err(req, ret);
1807    } else {
1808        block_acct_done(stats, acct);
1809    }
1810
1811    nvme_enqueue_req_completion(nvme_cq(req), req);
1812}
1813
1814void nvme_rw_complete_cb(void *opaque, int ret)
1815{
1816    NvmeRequest *req = opaque;
1817    NvmeNamespace *ns = req->ns;
1818    BlockBackend *blk = ns->blkconf.blk;
1819    BlockAcctCookie *acct = &req->acct;
1820    BlockAcctStats *stats = blk_get_stats(blk);
1821
1822    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1823
1824    if (ret) {
1825        block_acct_failed(stats, acct);
1826        nvme_aio_err(req, ret);
1827    } else {
1828        block_acct_done(stats, acct);
1829    }
1830
1831    if (ns->params.zoned && nvme_is_write(req)) {
1832        nvme_finalize_zoned_write(ns, req);
1833    }
1834
1835    nvme_enqueue_req_completion(nvme_cq(req), req);
1836}
1837
1838static void nvme_rw_cb(void *opaque, int ret)
1839{
1840    NvmeRequest *req = opaque;
1841    NvmeNamespace *ns = req->ns;
1842
1843    BlockBackend *blk = ns->blkconf.blk;
1844
1845    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1846
1847    if (ret) {
1848        goto out;
1849    }
1850
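        /*
         * When the namespace has a separate metadata area, the data I/O may
         * be chained with a second I/O targeting the metadata region at
         * ns->mdata_offset before the request is completed in
         * nvme_rw_complete_cb().
         */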
1851    if (nvme_msize(ns)) {
1852        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1853        uint64_t slba = le64_to_cpu(rw->slba);
1854        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
1855        uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
1856
1857        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
1858            size_t mlen = nvme_m2b(ns, nlb);
1859
1860            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
1861                                               BDRV_REQ_MAY_UNMAP,
1862                                               nvme_rw_complete_cb, req);
1863            return;
1864        }
1865
1866        if (nvme_ns_ext(ns) || req->cmd.mptr) {
1867            uint16_t status;
1868
1869            nvme_sg_unmap(&req->sg);
1870            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
1871            if (status) {
1872                ret = -EFAULT;
1873                goto out;
1874            }
1875
1876            if (req->cmd.opcode == NVME_CMD_READ) {
1877                return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
1878            }
1879
1880            return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
1881        }
1882    }
1883
1884out:
1885    nvme_rw_complete_cb(req, ret);
1886}
1887
1888struct nvme_aio_format_ctx {
1889    NvmeRequest   *req;
1890    NvmeNamespace *ns;
1891
1892    /* number of outstanding write zeroes for this namespace */
1893    int *count;
1894};
1895
1896static void nvme_aio_format_cb(void *opaque, int ret)
1897{
1898    struct nvme_aio_format_ctx *ctx = opaque;
1899    NvmeRequest *req = ctx->req;
1900    NvmeNamespace *ns = ctx->ns;
1901    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
1902    int *count = ctx->count;
1903
1904    g_free(ctx);
1905
1906    if (ret) {
1907        nvme_aio_err(req, ret);
1908    }
1909
1910    if (--(*count)) {
1911        return;
1912    }
1913
1914    g_free(count);
1915    ns->status = 0x0;
1916
1917    if (--(*num_formats)) {
1918        return;
1919    }
1920
1921    nvme_enqueue_req_completion(nvme_cq(req), req);
1922}
1923
1924struct nvme_aio_flush_ctx {
1925    NvmeRequest     *req;
1926    NvmeNamespace   *ns;
1927    BlockAcctCookie acct;
1928};
1929
1930static void nvme_aio_flush_cb(void *opaque, int ret)
1931{
1932    struct nvme_aio_flush_ctx *ctx = opaque;
1933    NvmeRequest *req = ctx->req;
1934    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
1935
1936    BlockBackend *blk = ctx->ns->blkconf.blk;
1937    BlockAcctCookie *acct = &ctx->acct;
1938    BlockAcctStats *stats = blk_get_stats(blk);
1939
1940    trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));
1941
1942    if (!ret) {
1943        block_acct_done(stats, acct);
1944    } else {
1945        block_acct_failed(stats, acct);
1946        nvme_aio_err(req, ret);
1947    }
1948
1949    (*num_flushes)--;
1950    g_free(ctx);
1951
1952    if (*num_flushes) {
1953        return;
1954    }
1955
1956    nvme_enqueue_req_completion(nvme_cq(req), req);
1957}
1958
1959static void nvme_verify_cb(void *opaque, int ret)
1960{
1961    NvmeBounceContext *ctx = opaque;
1962    NvmeRequest *req = ctx->req;
1963    NvmeNamespace *ns = req->ns;
1964    BlockBackend *blk = ns->blkconf.blk;
1965    BlockAcctCookie *acct = &req->acct;
1966    BlockAcctStats *stats = blk_get_stats(blk);
1967    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1968    uint64_t slba = le64_to_cpu(rw->slba);
1969    uint16_t ctrl = le16_to_cpu(rw->control);
1970    uint16_t apptag = le16_to_cpu(rw->apptag);
1971    uint16_t appmask = le16_to_cpu(rw->appmask);
1972    uint32_t reftag = le32_to_cpu(rw->reftag);
1973    uint16_t status;
1974
1975    trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
1976                             appmask, reftag);
1977
1978    if (ret) {
1979        block_acct_failed(stats, acct);
1980        nvme_aio_err(req, ret);
1981        goto out;
1982    }
1983
1984    block_acct_done(stats, acct);
1985
1986    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
1987        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
1988                                       ctx->mdata.iov.size, slba);
1989        if (status) {
1990            req->status = status;
1991            goto out;
1992        }
1993
1994        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
1995                                     ctx->mdata.bounce, ctx->mdata.iov.size,
1996                                     ctrl, slba, apptag, appmask, reftag);
1997    }
1998
1999out:
2000    qemu_iovec_destroy(&ctx->data.iov);
2001    g_free(ctx->data.bounce);
2002
2003    qemu_iovec_destroy(&ctx->mdata.iov);
2004    g_free(ctx->mdata.bounce);
2005
2006    g_free(ctx);
2007
2008    nvme_enqueue_req_completion(nvme_cq(req), req);
2009}
2010
2011
2012static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2013{
2014    NvmeBounceContext *ctx = opaque;
2015    NvmeRequest *req = ctx->req;
2016    NvmeNamespace *ns = req->ns;
2017    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2018    uint64_t slba = le64_to_cpu(rw->slba);
2019    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2020    size_t mlen = nvme_m2b(ns, nlb);
2021    uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
2022    BlockBackend *blk = ns->blkconf.blk;
2023
2024    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2025
2026    if (ret) {
2027        goto out;
2028    }
2029
2030    ctx->mdata.bounce = g_malloc(mlen);
2031
2032    qemu_iovec_reset(&ctx->mdata.iov);
2033    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2034
2035    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2036                                nvme_verify_cb, ctx);
2037    return;
2038
2039out:
2040    nvme_verify_cb(ctx, ret);
2041}
2042
2043static void nvme_aio_discard_cb(void *opaque, int ret)
2044{
2045    NvmeRequest *req = opaque;
2046    uintptr_t *discards = (uintptr_t *)&req->opaque;
2047
2048    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
2049
2050    if (ret) {
2051        nvme_aio_err(req, ret);
2052    }
2053
2054    (*discards)--;
2055
2056    if (*discards) {
2057        return;
2058    }
2059
2060    nvme_enqueue_req_completion(nvme_cq(req), req);
2061}
2062
2063struct nvme_zone_reset_ctx {
2064    NvmeRequest *req;
2065    NvmeZone    *zone;
2066};
2067
2068static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret)
2069{
2070    struct nvme_zone_reset_ctx *ctx = opaque;
2071    NvmeRequest *req = ctx->req;
2072    NvmeNamespace *ns = req->ns;
2073    NvmeZone *zone = ctx->zone;
2074    uintptr_t *resets = (uintptr_t *)&req->opaque;
2075
2076    if (ret) {
2077        nvme_aio_err(req, ret);
2078        goto out;
2079    }
2080
2081    switch (nvme_get_zone_state(zone)) {
2082    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2083    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2084        nvme_aor_dec_open(ns);
2085        /* fall through */
2086    case NVME_ZONE_STATE_CLOSED:
2087        nvme_aor_dec_active(ns);
2088        /* fall through */
2089    case NVME_ZONE_STATE_FULL:
2090        zone->w_ptr = zone->d.zslba;
2091        zone->d.wp = zone->w_ptr;
2092        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
2093        /* fall through */
2094    default:
2095        break;
2096    }
2097
2098out:
2099    g_free(ctx);
2100
2101    (*resets)--;
2102
2103    if (*resets) {
2104        return;
2105    }
2106
2107    nvme_enqueue_req_completion(nvme_cq(req), req);
2108}
2109
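    /*
     * First stage of a zone reset: called when the data portion of the zone
     * has been zeroed. If the namespace has a metadata area, another
     * write-zeroes is issued for the zone's metadata; the zone state is then
     * transitioned to Empty in nvme_aio_zone_reset_complete_cb().
     */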
2110static void nvme_aio_zone_reset_cb(void *opaque, int ret)
2111{
2112    struct nvme_zone_reset_ctx *ctx = opaque;
2113    NvmeRequest *req = ctx->req;
2114    NvmeNamespace *ns = req->ns;
2115    NvmeZone *zone = ctx->zone;
2116
2117    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
2118
2119    if (ret) {
2120        goto out;
2121    }
2122
2123    if (nvme_msize(ns)) {
2124        int64_t offset = ns->mdata_offset + nvme_m2b(ns, zone->d.zslba);
2125
2126        blk_aio_pwrite_zeroes(ns->blkconf.blk, offset,
2127                              nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
2128                              nvme_aio_zone_reset_complete_cb, ctx);
2129        return;
2130    }
2131
2132out:
2133    nvme_aio_zone_reset_complete_cb(opaque, ret);
2134}
2135
2136struct nvme_copy_ctx {
2137    int copies;
2138    uint8_t *bounce;
2139    uint8_t *mbounce;
2140    uint32_t nlb;
2141    NvmeCopySourceRange *ranges;
2142};
2143
2144struct nvme_copy_in_ctx {
2145    NvmeRequest *req;
2146    QEMUIOVector iov;
2147    NvmeCopySourceRange *range;
2148};
2149
2150static void nvme_copy_complete_cb(void *opaque, int ret)
2151{
2152    NvmeRequest *req = opaque;
2153    NvmeNamespace *ns = req->ns;
2154    struct nvme_copy_ctx *ctx = req->opaque;
2155
2156    if (ret) {
2157        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
2158        nvme_aio_err(req, ret);
2159        goto out;
2160    }
2161
2162    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
2163
2164out:
2165    if (ns->params.zoned) {
2166        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2167        uint64_t sdlba = le64_to_cpu(copy->sdlba);
2168        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
2169
2170        __nvme_advance_zone_wp(ns, zone, ctx->nlb);
2171    }
2172
2173    g_free(ctx->bounce);
2174    g_free(ctx->mbounce);
2175    g_free(ctx);
2176
2177    nvme_enqueue_req_completion(nvme_cq(req), req);
2178}
2179
2180static void nvme_copy_cb(void *opaque, int ret)
2181{
2182    NvmeRequest *req = opaque;
2183    NvmeNamespace *ns = req->ns;
2184    struct nvme_copy_ctx *ctx = req->opaque;
2185
2186    trace_pci_nvme_copy_cb(nvme_cid(req));
2187
2188    if (ret) {
2189        goto out;
2190    }
2191
2192    if (nvme_msize(ns)) {
2193        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2194        uint64_t sdlba = le64_to_cpu(copy->sdlba);
2195        int64_t offset = ns->mdata_offset + nvme_m2b(ns, sdlba);
2196
2197        qemu_iovec_reset(&req->sg.iov);
2198        qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb));
2199
2200        req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0,
2201                                     nvme_copy_complete_cb, req);
2202        return;
2203    }
2204
2205out:
2206    nvme_copy_complete_cb(opaque, ret);
2207}
2208
2209static void nvme_copy_in_complete(NvmeRequest *req)
2210{
2211    NvmeNamespace *ns = req->ns;
2212    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2213    struct nvme_copy_ctx *ctx = req->opaque;
2214    uint64_t sdlba = le64_to_cpu(copy->sdlba);
2215    uint16_t status;
2216
2217    trace_pci_nvme_copy_in_complete(nvme_cid(req));
2218
2219    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
2220
2221    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2222        uint16_t prinfor = (copy->control[0] >> 4) & 0xf;
2223        uint16_t prinfow = (copy->control[2] >> 2) & 0xf;
2224        uint16_t nr = copy->nr + 1;
2225        NvmeCopySourceRange *range;
2226        uint64_t slba;
2227        uint32_t nlb;
2228        uint16_t apptag, appmask;
2229        uint32_t reftag;
2230        uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce;
2231        size_t len, mlen;
2232        int i;
2233
2234        /*
2235         * The dif helpers expect prinfo to be similar to the control field of
2236         * the NvmeRwCmd, so shift by 10 to fake it.
2237         */
2238        prinfor = prinfor << 10;
2239        prinfow = prinfow << 10;
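            /*
             * prinfor/prinfow now line up with the NVME_RW_PRINFO_* masks
             * (e.g. NVME_RW_PRINFO_PRACT, tested below), which are defined
             * against bits 13:10 of the rw control field.
             */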
2240
2241        for (i = 0; i < nr; i++) {
2242            range = &ctx->ranges[i];
2243            slba = le64_to_cpu(range->slba);
2244            nlb = le16_to_cpu(range->nlb) + 1;
2245            len = nvme_l2b(ns, nlb);
2246            mlen = nvme_m2b(ns, nlb);
2247            apptag = le16_to_cpu(range->apptag);
2248            appmask = le16_to_cpu(range->appmask);
2249            reftag = le32_to_cpu(range->reftag);
2250
2251            status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba,
2252                                    apptag, appmask, reftag);
2253            if (status) {
2254                goto invalid;
2255            }
2256
2257            buf += len;
2258            mbuf += mlen;
2259        }
2260
2261        apptag = le16_to_cpu(copy->apptag);
2262        appmask = le16_to_cpu(copy->appmask);
2263        reftag = le32_to_cpu(copy->reftag);
2264
2265        if (prinfow & NVME_RW_PRINFO_PRACT) {
2266            size_t len = nvme_l2b(ns, ctx->nlb);
2267            size_t mlen = nvme_m2b(ns, ctx->nlb);
2268
2269            status = nvme_check_prinfo(ns, prinfow, sdlba, reftag);
2270            if (status) {
2271                goto invalid;
2272            }
2273
2274            nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce,
2275                                        mlen, apptag, reftag);
2276        } else {
2277            status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen,
2278                                    prinfow, sdlba, apptag, appmask, reftag);
2279            if (status) {
2280                goto invalid;
2281            }
2282        }
2283    }
2284
2285    status = nvme_check_bounds(ns, sdlba, ctx->nlb);
2286    if (status) {
2287        trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
2288        goto invalid;
2289    }
2290
2291    if (ns->params.zoned) {
2292        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
2293
2294        status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb);
2295        if (status) {
2296            goto invalid;
2297        }
2298
2299        status = nvme_zrm_auto(ns, zone);
2300        if (status) {
2301            goto invalid;
2302        }
2303
2304        zone->w_ptr += ctx->nlb;
2305    }
2306
2307    qemu_iovec_init(&req->sg.iov, 1);
2308    qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
2309
2310    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
2311                     BLOCK_ACCT_WRITE);
2312
2313    req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
2314                                 &req->sg.iov, 0, nvme_copy_cb, req);
2315
2316    return;
2317
2318invalid:
2319    req->status = status;
2320
2321    g_free(ctx->bounce);
2322    g_free(ctx);
2323
2324    nvme_enqueue_req_completion(nvme_cq(req), req);
2325}
2326
2327static void nvme_aio_copy_in_cb(void *opaque, int ret)
2328{
2329    struct nvme_copy_in_ctx *in_ctx = opaque;
2330    NvmeRequest *req = in_ctx->req;
2331    NvmeNamespace *ns = req->ns;
2332    struct nvme_copy_ctx *ctx = req->opaque;
2333
2334    qemu_iovec_destroy(&in_ctx->iov);
2335    g_free(in_ctx);
2336
2337    trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
2338
2339    if (ret) {
2340        nvme_aio_err(req, ret);
2341    }
2342
2343    ctx->copies--;
2344
2345    if (ctx->copies) {
2346        return;
2347    }
2348
2349    if (req->status) {
2350        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
2351
2352        g_free(ctx->bounce);
2353        g_free(ctx->mbounce);
2354        g_free(ctx);
2355
2356        nvme_enqueue_req_completion(nvme_cq(req), req);
2357
2358        return;
2359    }
2360
2361    nvme_copy_in_complete(req);
2362}
2363
2364struct nvme_compare_ctx {
2365    struct {
2366        QEMUIOVector iov;
2367        uint8_t *bounce;
2368    } data;
2369
2370    struct {
2371        QEMUIOVector iov;
2372        uint8_t *bounce;
2373    } mdata;
2374};
2375
2376static void nvme_compare_mdata_cb(void *opaque, int ret)
2377{
2378    NvmeRequest *req = opaque;
2379    NvmeNamespace *ns = req->ns;
2380    NvmeCtrl *n = nvme_ctrl(req);
2381    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2382    uint16_t ctrl = le16_to_cpu(rw->control);
2383    uint16_t apptag = le16_to_cpu(rw->apptag);
2384    uint16_t appmask = le16_to_cpu(rw->appmask);
2385    uint32_t reftag = le32_to_cpu(rw->reftag);
2386    struct nvme_compare_ctx *ctx = req->opaque;
2387    g_autofree uint8_t *buf = NULL;
2388    uint16_t status = NVME_SUCCESS;
2389
2390    trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2391
2392    buf = g_malloc(ctx->mdata.iov.size);
2393
2394    status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2395                               NVME_TX_DIRECTION_TO_DEVICE, req);
2396    if (status) {
2397        req->status = status;
2398        goto out;
2399    }
2400
2401    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2402        uint64_t slba = le64_to_cpu(rw->slba);
2403        uint8_t *bufp;
2404        uint8_t *mbufp = ctx->mdata.bounce;
2405        uint8_t *end = mbufp + ctx->mdata.iov.size;
2406        size_t msize = nvme_msize(ns);
2407        int16_t pil = 0;
2408
2409        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2410                                ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
2411                                slba, apptag, appmask, reftag);
2412        if (status) {
2413            req->status = status;
2414            goto out;
2415        }
2416
2417        /*
2418         * When formatted with protection information, do not compare the DIF
2419         * tuple.
2420         */
2421        if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2422            pil = nvme_msize(ns) - sizeof(NvmeDifTuple);
2423        }
2424
2425        for (bufp = buf; mbufp < end; bufp += msize, mbufp += msize) {
2426            if (memcmp(bufp + pil, mbufp + pil, msize - pil)) {
2427                req->status = NVME_CMP_FAILURE;
2428                goto out;
2429            }
2430        }
2431
2432        goto out;
2433    }
2434
2435    if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2436        req->status = NVME_CMP_FAILURE;
2437        goto out;
2438    }
2439
2440out:
2441    qemu_iovec_destroy(&ctx->data.iov);
2442    g_free(ctx->data.bounce);
2443
2444    qemu_iovec_destroy(&ctx->mdata.iov);
2445    g_free(ctx->mdata.bounce);
2446
2447    g_free(ctx);
2448
2449    nvme_enqueue_req_completion(nvme_cq(req), req);
2450}
2451
2452static void nvme_compare_data_cb(void *opaque, int ret)
2453{
2454    NvmeRequest *req = opaque;
2455    NvmeCtrl *n = nvme_ctrl(req);
2456    NvmeNamespace *ns = req->ns;
2457    BlockBackend *blk = ns->blkconf.blk;
2458    BlockAcctCookie *acct = &req->acct;
2459    BlockAcctStats *stats = blk_get_stats(blk);
2460
2461    struct nvme_compare_ctx *ctx = req->opaque;
2462    g_autofree uint8_t *buf = NULL;
2463    uint16_t status;
2464
2465    trace_pci_nvme_compare_data_cb(nvme_cid(req));
2466
2467    if (ret) {
2468        block_acct_failed(stats, acct);
2469        nvme_aio_err(req, ret);
2470        goto out;
2471    }
2472
2473    buf = g_malloc(ctx->data.iov.size);
2474
2475    status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2476                              NVME_TX_DIRECTION_TO_DEVICE, req);
2477    if (status) {
2478        req->status = status;
2479        goto out;
2480    }
2481
2482    if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2483        req->status = NVME_CMP_FAILURE;
2484        goto out;
2485    }
2486
2487    if (nvme_msize(ns)) {
2488        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2489        uint64_t slba = le64_to_cpu(rw->slba);
2490        uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2491        size_t mlen = nvme_m2b(ns, nlb);
2492        uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
2493
2494        ctx->mdata.bounce = g_malloc(mlen);
2495
2496        qemu_iovec_init(&ctx->mdata.iov, 1);
2497        qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2498
2499        req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2500                                    nvme_compare_mdata_cb, req);
2501        return;
2502    }
2503
2504    block_acct_done(stats, acct);
2505
2506out:
2507    qemu_iovec_destroy(&ctx->data.iov);
2508    g_free(ctx->data.bounce);
2509    g_free(ctx);
2510
2511    nvme_enqueue_req_completion(nvme_cq(req), req);
2512}
2513
2514static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2515{
2516    NvmeNamespace *ns = req->ns;
2517    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2518
2519    uint32_t attr = le32_to_cpu(dsm->attributes);
2520    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2521
2522    uint16_t status = NVME_SUCCESS;
2523
2524    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
2525
2526    if (attr & NVME_DSMGMT_AD) {
2527        int64_t offset;
2528        size_t len;
2529        NvmeDsmRange range[nr];
2530        uintptr_t *discards = (uintptr_t *)&req->opaque;
2531
2532        status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
2533        if (status) {
2534            return status;
2535        }
2536
2537        /*
2538         * AIO callbacks may be called immediately, so initialize discards to 1
2539         * to make sure the callback does not complete the request before
2540         * all discards have been issued.
2541         */
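            /*
             * For example, two ranges split into three discards in total take
             * *discards from 1 to 4 during submission; the final decrement
             * below ("account for the 1-initialization") leaves 3, and the
             * request completes when the last nvme_aio_discard_cb() drops it
             * to zero.
             */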
2542        *discards = 1;
2543
2544        for (int i = 0; i < nr; i++) {
2545            uint64_t slba = le64_to_cpu(range[i].slba);
2546            uint32_t nlb = le32_to_cpu(range[i].nlb);
2547
2548            if (nvme_check_bounds(ns, slba, nlb)) {
2549                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2550                                                     ns->id_ns.nsze);
2551                continue;
2552            }
2553
2554            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
2555                                          nlb);
2556
2557            if (nlb > n->dmrsl) {
2558                trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2559            }
2560
2561            offset = nvme_l2b(ns, slba);
2562            len = nvme_l2b(ns, nlb);
2563
2564            while (len) {
2565                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
2566
2567                (*discards)++;
2568
2569                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
2570                                 nvme_aio_discard_cb, req);
2571
2572                offset += bytes;
2573                len -= bytes;
2574            }
2575        }
2576
2577        /* account for the 1-initialization */
2578        (*discards)--;
2579
2580        if (*discards) {
2581            status = NVME_NO_COMPLETE;
2582        } else {
2583            status = req->status;
2584        }
2585    }
2586
2587    return status;
2588}
2589
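    /*
     * Verify: the data (and any separate metadata, read in
     * nvme_verify_mdata_in_cb()) is bounced into controller memory and, when
     * the namespace is formatted with protection information, checked in
     * nvme_verify_cb(); no data is transferred to or from the host.
     */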
2590static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2591{
2592    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2593    NvmeNamespace *ns = req->ns;
2594    BlockBackend *blk = ns->blkconf.blk;
2595    uint64_t slba = le64_to_cpu(rw->slba);
2596    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2597    size_t len = nvme_l2b(ns, nlb);
2598    int64_t offset = nvme_l2b(ns, slba);
2599    uint16_t ctrl = le16_to_cpu(rw->control);
2600    uint32_t reftag = le32_to_cpu(rw->reftag);
2601    NvmeBounceContext *ctx = NULL;
2602    uint16_t status;
2603
2604    trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2605
2606    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2607        status = nvme_check_prinfo(ns, ctrl, slba, reftag);
2608        if (status) {
2609            return status;
2610        }
2611
2612        if (ctrl & NVME_RW_PRINFO_PRACT) {
2613            return NVME_INVALID_PROT_INFO | NVME_DNR;
2614        }
2615    }
2616
2617    if (len > n->page_size << n->params.vsl) {
2618        return NVME_INVALID_FIELD | NVME_DNR;
2619    }
2620
2621    status = nvme_check_bounds(ns, slba, nlb);
2622    if (status) {
2623        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2624        return status;
2625    }
2626
2627    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2628        status = nvme_check_dulbe(ns, slba, nlb);
2629        if (status) {
2630            return status;
2631        }
2632    }
2633
2634    ctx = g_new0(NvmeBounceContext, 1);
2635    ctx->req = req;
2636
2637    ctx->data.bounce = g_malloc(len);
2638
2639    qemu_iovec_init(&ctx->data.iov, 1);
2640    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2641
2642    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2643                     BLOCK_ACCT_READ);
2644
2645    req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2646                                nvme_verify_mdata_in_cb, ctx);
2647    return NVME_NO_COMPLETE;
2648}
2649
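    /*
     * Simple Copy: each source range is validated and read into a single
     * bounce buffer (plus a metadata bounce buffer if the namespace has
     * metadata), with ctx->copies counting the outstanding reads. When the
     * last read finishes, nvme_copy_in_complete() re-checks protection
     * information and destination bounds and issues the write at SDLBA.
     */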
2650static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2651{
2652    NvmeNamespace *ns = req->ns;
2653    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2654
2655    uint16_t nr = copy->nr + 1;
2656    uint8_t format = copy->control[0] & 0xf;
2657
2658    /*
2659     * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the
2660     * NVME_RW_PRINFO constants.
2661     */
2662    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10;
2663    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10;
2664
2665    uint32_t nlb = 0;
2666    uint8_t *bounce = NULL, *bouncep = NULL;
2667    uint8_t *mbounce = NULL, *mbouncep = NULL;
2668    struct nvme_copy_ctx *ctx;
2669    uint16_t status;
2670    int i;
2671
2672    trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2673
2674    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2675        ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) {
2676        return NVME_INVALID_FIELD | NVME_DNR;
2677    }
2678
2679    if (!(n->id_ctrl.ocfs & (1 << format))) {
2680        trace_pci_nvme_err_copy_invalid_format(format);
2681        return NVME_INVALID_FIELD | NVME_DNR;
2682    }
2683
2684    if (nr > ns->id_ns.msrc + 1) {
2685        return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2686    }
2687
2688    ctx = g_new(struct nvme_copy_ctx, 1);
2689    ctx->ranges = g_new(NvmeCopySourceRange, nr);
2690
2691    status = nvme_h2c(n, (uint8_t *)ctx->ranges,
2692                      nr * sizeof(NvmeCopySourceRange), req);
2693    if (status) {
2694        goto out;
2695    }
2696
2697    for (i = 0; i < nr; i++) {
2698        uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
2699        uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
2700
2701        if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2702            status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2703            goto out;
2704        }
2705
2706        status = nvme_check_bounds(ns, slba, _nlb);
2707        if (status) {
2708            trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze);
2709            goto out;
2710        }
2711
2712        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2713            status = nvme_check_dulbe(ns, slba, _nlb);
2714            if (status) {
2715                goto out;
2716            }
2717        }
2718
2719        if (ns->params.zoned) {
2720            status = nvme_check_zone_read(ns, slba, _nlb);
2721            if (status) {
2722                goto out;
2723            }
2724        }
2725
2726        nlb += _nlb;
2727    }
2728
2729    if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
2730        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2731        goto out;
2732    }
2733
2734    bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
2735    if (nvme_msize(ns)) {
2736        mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb));
2737    }
2738
2739    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
2740                     BLOCK_ACCT_READ);
2741
2742    ctx->bounce = bounce;
2743    ctx->mbounce = mbounce;
2744    ctx->nlb = nlb;
2745    ctx->copies = 1;
2746
2747    req->opaque = ctx;
2748
2749    for (i = 0; i < nr; i++) {
2750        uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
2751        uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
2752
2753        size_t len = nvme_l2b(ns, nlb);
2754        int64_t offset = nvme_l2b(ns, slba);
2755
2756        trace_pci_nvme_copy_source_range(slba, nlb);
2757
2758        struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1);
2759        in_ctx->req = req;
2760
2761        qemu_iovec_init(&in_ctx->iov, 1);
2762        qemu_iovec_add(&in_ctx->iov, bouncep, len);
2763
2764        ctx->copies++;
2765
2766        blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
2767                       nvme_aio_copy_in_cb, in_ctx);
2768
2769        bouncep += len;
2770
2771        if (nvme_msize(ns)) {
2772            len = nvme_m2b(ns, nlb);
2773            offset = ns->mdata_offset + nvme_m2b(ns, slba);
2774
2775            in_ctx = g_new(struct nvme_copy_in_ctx, 1);
2776            in_ctx->req = req;
2777
2778            qemu_iovec_init(&in_ctx->iov, 1);
2779            qemu_iovec_add(&in_ctx->iov, mbouncep, len);
2780
2781            ctx->copies++;
2782
2783            blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
2784                           nvme_aio_copy_in_cb, in_ctx);
2785
2786            mbouncep += len;
2787        }
2788    }
2789
2790    /* account for the 1-initialization */
2791    ctx->copies--;
2792
2793    if (!ctx->copies) {
2794        nvme_copy_in_complete(req);
2795    }
2796
2797    return NVME_NO_COMPLETE;
2798
2799out:
2800    g_free(ctx->ranges);
2801    g_free(ctx);
2802
2803    return status;
2804}
2805
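    /*
     * Compare: the LBA range is read into a bounce buffer and compared
     * against the host data in nvme_compare_data_cb(); if the namespace has
     * metadata, it is read and compared in nvme_compare_mdata_cb(). Any
     * mismatch completes the command with NVME_CMP_FAILURE.
     */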
2806static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2807{
2808    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2809    NvmeNamespace *ns = req->ns;
2810    BlockBackend *blk = ns->blkconf.blk;
2811    uint64_t slba = le64_to_cpu(rw->slba);
2812    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2813    uint16_t ctrl = le16_to_cpu(rw->control);
2814    size_t data_len = nvme_l2b(ns, nlb);
2815    size_t len = data_len;
2816    int64_t offset = nvme_l2b(ns, slba);
2817    struct nvme_compare_ctx *ctx = NULL;
2818    uint16_t status;
2819
2820    trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2821
2822    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) {
2823        return NVME_INVALID_PROT_INFO | NVME_DNR;
2824    }
2825
2826    if (nvme_ns_ext(ns)) {
2827        len += nvme_m2b(ns, nlb);
2828    }
2829
2830    status = nvme_check_mdts(n, len);
2831    if (status) {
2832        return status;
2833    }
2834
2835    status = nvme_check_bounds(ns, slba, nlb);
2836    if (status) {
2837        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2838        return status;
2839    }
2840
2841    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2842        status = nvme_check_dulbe(ns, slba, nlb);
2843        if (status) {
2844            return status;
2845        }
2846    }
2847
2848    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2849    if (status) {
2850        return status;
2851    }
2852
2853    ctx = g_new(struct nvme_compare_ctx, 1);
2854    ctx->data.bounce = g_malloc(data_len);
2855
2856    req->opaque = ctx;
2857
2858    qemu_iovec_init(&ctx->data.iov, 1);
2859    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2860
2861    block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2862                     BLOCK_ACCT_READ);
2863    req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
2864                                nvme_compare_data_cb, req);
2865
2866    return NVME_NO_COMPLETE;
2867}
2868
2869static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
2870{
2871    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
2872    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
2873    uint16_t status;
2874    struct nvme_aio_flush_ctx *ctx;
2875    NvmeNamespace *ns;
2876
2877    trace_pci_nvme_flush(nvme_cid(req), nsid);
2878
2879    if (nsid != NVME_NSID_BROADCAST) {
2880        req->ns = nvme_ns(n, nsid);
2881        if (unlikely(!req->ns)) {
2882            return NVME_INVALID_FIELD | NVME_DNR;
2883        }
2884
2885        block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
2886                         BLOCK_ACCT_FLUSH);
2887        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
2888        return NVME_NO_COMPLETE;
2889    }
2890
2891    /* 1-initialize; see comment in nvme_dsm */
2892    *num_flushes = 1;
2893
2894    for (int i = 1; i <= n->num_namespaces; i++) {
2895        ns = nvme_ns(n, i);
2896        if (!ns) {
2897            continue;
2898        }
2899
2900        ctx = g_new(struct nvme_aio_flush_ctx, 1);
2901        ctx->req = req;
2902        ctx->ns = ns;
2903
2904        (*num_flushes)++;
2905
2906        block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
2907                         BLOCK_ACCT_FLUSH);
2908        blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
2909    }
2910
2911    /* account for the 1-initialization */
2912    (*num_flushes)--;
2913
2914    if (*num_flushes) {
2915        status = NVME_NO_COMPLETE;
2916    } else {
2917        status = req->status;
2918    }
2919
2920    return status;
2921}
2922
2923static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
2924{
2925    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2926    NvmeNamespace *ns = req->ns;
2927    uint64_t slba = le64_to_cpu(rw->slba);
2928    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2929    uint16_t ctrl = le16_to_cpu(rw->control);
2930    uint64_t data_size = nvme_l2b(ns, nlb);
2931    uint64_t mapped_size = data_size;
2932    uint64_t data_offset;
2933    BlockBackend *blk = ns->blkconf.blk;
2934    uint16_t status;
2935
2936    if (nvme_ns_ext(ns)) {
2937        mapped_size += nvme_m2b(ns, nlb);
2938
2939        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2940            bool pract = ctrl & NVME_RW_PRINFO_PRACT;
2941
2942            if (pract && nvme_msize(ns) == 8) {
2943                mapped_size = data_size;
2944            }
2945        }
2946    }
2947
2948    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
2949
2950    status = nvme_check_mdts(n, mapped_size);
2951    if (status) {
2952        goto invalid;
2953    }
2954
2955    status = nvme_check_bounds(ns, slba, nlb);
2956    if (status) {
2957        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2958        goto invalid;
2959    }
2960
2961    if (ns->params.zoned) {
2962        status = nvme_check_zone_read(ns, slba, nlb);
2963        if (status) {
2964            trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
2965            goto invalid;
2966        }
2967    }
2968
2969    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2970        status = nvme_check_dulbe(ns, slba, nlb);
2971        if (status) {
2972            goto invalid;
2973        }
2974    }
2975
2976    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2977        return nvme_dif_rw(n, req);
2978    }
2979
2980    status = nvme_map_data(n, nlb, req);
2981    if (status) {
2982        goto invalid;
2983    }
2984
2985    data_offset = nvme_l2b(ns, slba);
2986
2987    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
2988                     BLOCK_ACCT_READ);
2989    nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
2990    return NVME_NO_COMPLETE;
2991
2992invalid:
2993    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
2994    return status | NVME_DNR;
2995}
2996
2997static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
2998                              bool wrz)
2999{
3000    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3001    NvmeNamespace *ns = req->ns;
3002    uint64_t slba = le64_to_cpu(rw->slba);
3003    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3004    uint16_t ctrl = le16_to_cpu(rw->control);
3005    uint64_t data_size = nvme_l2b(ns, nlb);
3006    uint64_t mapped_size = data_size;
3007    uint64_t data_offset;
3008    NvmeZone *zone;
3009    NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3010    BlockBackend *blk = ns->blkconf.blk;
3011    uint16_t status;
3012
3013    if (nvme_ns_ext(ns)) {
3014        mapped_size += nvme_m2b(ns, nlb);
3015
3016        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3017            bool pract = ctrl & NVME_RW_PRINFO_PRACT;
3018
3019            if (pract && nvme_msize(ns) == 8) {
3020                mapped_size -= nvme_m2b(ns, nlb);
3021            }
3022        }
3023    }
3024
3025    trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3026                         nvme_nsid(ns), nlb, mapped_size, slba);
3027
3028    if (!wrz) {
3029        status = nvme_check_mdts(n, mapped_size);
3030        if (status) {
3031            goto invalid;
3032        }
3033    }
3034
3035    status = nvme_check_bounds(ns, slba, nlb);
3036    if (status) {
3037        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
3038        goto invalid;
3039    }
3040
3041    if (ns->params.zoned) {
3042        zone = nvme_get_zone_by_slba(ns, slba);
3043
3044        if (append) {
3045            bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3046
3047            if (unlikely(slba != zone->d.zslba)) {
3048                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3049                status = NVME_INVALID_FIELD;
3050                goto invalid;
3051            }
3052
3053            if (n->params.zasl &&
3054                data_size > (uint64_t)n->page_size << n->params.zasl) {
3055                trace_pci_nvme_err_zasl(data_size);
3056                return NVME_INVALID_FIELD | NVME_DNR;
3057            }
3058
3059            slba = zone->w_ptr;
3060            rw->slba = cpu_to_le64(slba);
3061            res->slba = cpu_to_le64(slba);
3062
3063            switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3064            case NVME_ID_NS_DPS_TYPE_1:
3065                if (!piremap) {
3066                    return NVME_INVALID_PROT_INFO | NVME_DNR;
3067                }
3068
3069                /* fallthrough */
3070
3071            case NVME_ID_NS_DPS_TYPE_2:
3072                if (piremap) {
3073                    uint32_t reftag = le32_to_cpu(rw->reftag);
3074                    rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3075                }
3076
3077                break;
3078
3079            case NVME_ID_NS_DPS_TYPE_3:
3080                if (piremap) {
3081                    return NVME_INVALID_PROT_INFO | NVME_DNR;
3082                }
3083
3084                break;
3085            }
3086        }
3087
3088        status = nvme_check_zone_write(ns, zone, slba, nlb);
3089        if (status) {
3090            goto invalid;
3091        }
3092
3093        status = nvme_zrm_auto(ns, zone);
3094        if (status) {
3095            goto invalid;
3096        }
3097
3098        zone->w_ptr += nlb;
3099    }
3100
3101    data_offset = nvme_l2b(ns, slba);
3102
3103    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3104        return nvme_dif_rw(n, req);
3105    }
3106
3107    if (!wrz) {
3108        status = nvme_map_data(n, nlb, req);
3109        if (status) {
3110            goto invalid;
3111        }
3112
3113        block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3114                         BLOCK_ACCT_WRITE);
3115        nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3116    } else {
3117        req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3118                                           BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3119                                           req);
3120    }
3121
3122    return NVME_NO_COMPLETE;
3123
3124invalid:
3125    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3126    return status | NVME_DNR;
3127}
3128
3129static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3130{
3131    return nvme_do_write(n, req, false, false);
3132}
3133
3134static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3135{
3136    return nvme_do_write(n, req, false, true);
3137}
3138
3139static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3140{
3141    return nvme_do_write(n, req, true, false);
3142}
3143
3144static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3145                                            uint64_t *slba, uint32_t *zone_idx)
3146{
3147    uint32_t dw10 = le32_to_cpu(c->cdw10);
3148    uint32_t dw11 = le32_to_cpu(c->cdw11);
3149
3150    if (!ns->params.zoned) {
3151        trace_pci_nvme_err_invalid_opc(c->opcode);
3152        return NVME_INVALID_OPCODE | NVME_DNR;
3153    }
3154
3155    *slba = ((uint64_t)dw11) << 32 | dw10;
3156    if (unlikely(*slba >= ns->id_ns.nsze)) {
3157        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3158        *slba = 0;
3159        return NVME_LBA_RANGE | NVME_DNR;
3160    }
3161
3162    *zone_idx = nvme_zone_idx(ns, *slba);
3163    assert(*zone_idx < ns->num_zones);
3164
3165    return NVME_SUCCESS;
3166}
3167
3168typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3169                                 NvmeRequest *);
3170
3171enum NvmeZoneProcessingMask {
3172    NVME_PROC_CURRENT_ZONE    = 0,
3173    NVME_PROC_OPENED_ZONES    = 1 << 0,
3174    NVME_PROC_CLOSED_ZONES    = 1 << 1,
3175    NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3176    NVME_PROC_FULL_ZONES      = 1 << 3,
3177};
3178
3179static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3180                               NvmeZoneState state, NvmeRequest *req)
3181{
3182    return nvme_zrm_open(ns, zone);
3183}
3184
3185static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3186                                NvmeZoneState state, NvmeRequest *req)
3187{
3188    return nvme_zrm_close(ns, zone);
3189}
3190
3191static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3192                                 NvmeZoneState state, NvmeRequest *req)
3193{
3194    return nvme_zrm_finish(ns, zone);
3195}
3196
3197static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
3198                                NvmeZoneState state, NvmeRequest *req)
3199{
3200    uintptr_t *resets = (uintptr_t *)&req->opaque;
3201    struct nvme_zone_reset_ctx *ctx;
3202
3203    switch (state) {
3204    case NVME_ZONE_STATE_EMPTY:
3205        return NVME_SUCCESS;
3206    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3207    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3208    case NVME_ZONE_STATE_CLOSED:
3209    case NVME_ZONE_STATE_FULL:
3210        break;
3211    default:
3212        return NVME_ZONE_INVAL_TRANSITION;
3213    }
3214
3215    /*
3216     * The zone reset aio callback needs to know the zone that is being reset
3217     * in order to transition the zone on completion.
3218     */
3219    ctx = g_new(struct nvme_zone_reset_ctx, 1);
3220    ctx->req = req;
3221    ctx->zone = zone;
3222
3223    (*resets)++;
3224
3225    blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
3226                          nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
3227                          nvme_aio_zone_reset_cb, ctx);
3228
3229    return NVME_NO_COMPLETE;
3230}
3231
3232static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3233                                  NvmeZoneState state, NvmeRequest *req)
3234{
3235    switch (state) {
3236    case NVME_ZONE_STATE_READ_ONLY:
3237        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3238        /* fall through */
3239    case NVME_ZONE_STATE_OFFLINE:
3240        return NVME_SUCCESS;
3241    default:
3242        return NVME_ZONE_INVAL_TRANSITION;
3243    }
3244}
3245
3246static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3247{
3248    uint16_t status;
3249    uint8_t state = nvme_get_zone_state(zone);
3250
3251    if (state == NVME_ZONE_STATE_EMPTY) {
3252        status = nvme_aor_check(ns, 1, 0);
3253        if (status) {
3254            return status;
3255        }
3256        nvme_aor_inc_active(ns);
3257        zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3258        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3259        return NVME_SUCCESS;
3260    }
3261
3262    return NVME_ZONE_INVAL_TRANSITION;
3263}
3264
3265static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3266                                    enum NvmeZoneProcessingMask proc_mask,
3267                                    op_handler_t op_hndlr, NvmeRequest *req)
3268{
3269    uint16_t status = NVME_SUCCESS;
3270    NvmeZoneState zs = nvme_get_zone_state(zone);
3271    bool proc_zone;
3272
3273    switch (zs) {
3274    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3275    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3276        proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3277        break;
3278    case NVME_ZONE_STATE_CLOSED:
3279        proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3280        break;
3281    case NVME_ZONE_STATE_READ_ONLY:
3282        proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3283        break;
3284    case NVME_ZONE_STATE_FULL:
3285        proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3286        break;
3287    default:
3288        proc_zone = false;
3289    }
3290
3291    if (proc_zone) {
3292        status = op_hndlr(ns, zone, zs, req);
3293    }
3294
3295    return status;
3296}
3297
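    /*
     * Apply op_hndlr either to the zone addressed by the command (empty
     * proc_mask, i.e. NVME_PROC_CURRENT_ZONE) or, for a Select All
     * operation, to every zone whose current state is selected by proc_mask.
     * Processing stops at the first status other than success or
     * NVME_NO_COMPLETE.
     */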
3298static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3299                                enum NvmeZoneProcessingMask proc_mask,
3300                                op_handler_t op_hndlr, NvmeRequest *req)
3301{
3302    NvmeZone *next;
3303    uint16_t status = NVME_SUCCESS;
3304    int i;
3305
3306    if (!proc_mask) {
3307        status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3308    } else {
3309        if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3310            QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3311                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3312                                             req);
3313                if (status && status != NVME_NO_COMPLETE) {
3314                    goto out;
3315                }
3316            }
3317        }
3318        if (proc_mask & NVME_PROC_OPENED_ZONES) {
3319            QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3320                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3321                                             req);
3322                if (status && status != NVME_NO_COMPLETE) {
3323                    goto out;
3324                }
3325            }
3326
3327            QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3328                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3329                                             req);
3330                if (status && status != NVME_NO_COMPLETE) {
3331                    goto out;
3332                }
3333            }
3334        }
3335        if (proc_mask & NVME_PROC_FULL_ZONES) {
3336            QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3337                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3338                                             req);
3339                if (status && status != NVME_NO_COMPLETE) {
3340                    goto out;
3341                }
3342            }
3343        }
3344
3345        if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3346            for (i = 0; i < ns->num_zones; i++, zone++) {
3347                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3348                                             req);
3349                if (status && status != NVME_NO_COMPLETE) {
3350                    goto out;
3351                }
3352            }
3353        }
3354    }
3355
3356out:
3357    return status;
3358}
3359
3360static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3361{
3362    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3363    NvmeNamespace *ns = req->ns;
3364    NvmeZone *zone;
3365    uintptr_t *resets;
3366    uint8_t *zd_ext;
3367    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3368    uint64_t slba = 0;
3369    uint32_t zone_idx = 0;
3370    uint16_t status;
3371    uint8_t action;
3372    bool all;
3373    enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3374
3375    action = dw13 & 0xff;
3376    all = dw13 & 0x100;
3377
3378    req->status = NVME_SUCCESS;
3379
3380    if (!all) {
3381        status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3382        if (status) {
3383            return status;
3384        }
3385    }
3386
3387    zone = &ns->zone_array[zone_idx];
3388    if (slba != zone->d.zslba) {
3389        trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3390        return NVME_INVALID_FIELD | NVME_DNR;
3391    }
3392
3393    switch (action) {
3394
3395    case NVME_ZONE_ACTION_OPEN:
3396        if (all) {
3397            proc_mask = NVME_PROC_CLOSED_ZONES;
3398        }
3399        trace_pci_nvme_open_zone(slba, zone_idx, all);
3400        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3401        break;
3402
3403    case NVME_ZONE_ACTION_CLOSE:
3404        if (all) {
3405            proc_mask = NVME_PROC_OPENED_ZONES;
3406        }
3407        trace_pci_nvme_close_zone(slba, zone_idx, all);
3408        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3409        break;
3410
3411    case NVME_ZONE_ACTION_FINISH:
3412        if (all) {
3413            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3414        }
3415        trace_pci_nvme_finish_zone(slba, zone_idx, all);
3416        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3417        break;
3418
3419    case NVME_ZONE_ACTION_RESET:
3420        resets = (uintptr_t *)&req->opaque;
3421
3422        if (all) {
3423            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
3424                NVME_PROC_FULL_ZONES;
3425        }
3426        trace_pci_nvme_reset_zone(slba, zone_idx, all);
3427
3428        *resets = 1;
3429
3430        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
3431
3432        (*resets)--;
3433
3434        return *resets ? NVME_NO_COMPLETE : req->status;
3435
3436    case NVME_ZONE_ACTION_OFFLINE:
3437        if (all) {
3438            proc_mask = NVME_PROC_READ_ONLY_ZONES;
3439        }
3440        trace_pci_nvme_offline_zone(slba, zone_idx, all);
3441        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3442        break;
3443
3444    case NVME_ZONE_ACTION_SET_ZD_EXT:
3445        trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3446        if (all || !ns->params.zd_extension_size) {
3447            return NVME_INVALID_FIELD | NVME_DNR;
3448        }
3449        zd_ext = nvme_get_zd_extension(ns, zone_idx);
3450        status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3451        if (status) {
3452            trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3453            return status;
3454        }
3455
3456        status = nvme_set_zd_ext(ns, zone);
3457        if (status == NVME_SUCCESS) {
3458            trace_pci_nvme_zd_extension_set(zone_idx);
3459            return status;
3460        }
3461        break;
3462
3463    default:
3464        trace_pci_nvme_err_invalid_mgmt_action(action);
3465        status = NVME_INVALID_FIELD;
3466    }
3467
3468    if (status == NVME_ZONE_INVAL_TRANSITION) {
3469        trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3470                                                         zone->d.za);
3471    }
3472    if (status) {
3473        status |= NVME_DNR;
3474    }
3475
3476    return status;
3477}
3478
3479static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3480{
3481    NvmeZoneState zs = nvme_get_zone_state(zl);
3482
3483    switch (zafs) {
3484    case NVME_ZONE_REPORT_ALL:
3485        return true;
3486    case NVME_ZONE_REPORT_EMPTY:
3487        return zs == NVME_ZONE_STATE_EMPTY;
3488    case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3489        return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3490    case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3491        return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3492    case NVME_ZONE_REPORT_CLOSED:
3493        return zs == NVME_ZONE_STATE_CLOSED;
3494    case NVME_ZONE_REPORT_FULL:
3495        return zs == NVME_ZONE_STATE_FULL;
3496    case NVME_ZONE_REPORT_READ_ONLY:
3497        return zs == NVME_ZONE_STATE_READ_ONLY;
3498    case NVME_ZONE_REPORT_OFFLINE:
3499        return zs == NVME_ZONE_STATE_OFFLINE;
3500    default:
3501        return false;
3502    }
3503}
3504
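    /*
     * Zone Management Receive (Report Zones / Extended Report Zones): the
     * report is built in a local buffer and copied to the host with
     * nvme_c2h(). With the Partial Report bit set, the nr_zones field in the
     * header counts only the descriptors that fit in the buffer; otherwise
     * it counts every matching zone from the starting zone onwards.
     */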
3505static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3506{
3507    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3508    NvmeNamespace *ns = req->ns;
3509    /* cdw12 is the zero-based number of dwords to return. Convert to bytes */
3510    uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
3511    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3512    uint32_t zone_idx, zra, zrasf, partial;
3513    uint64_t max_zones, nr_zones = 0;
3514    uint16_t status;
3515    uint64_t slba;
3516    NvmeZoneDescr *z;
3517    NvmeZone *zone;
3518    NvmeZoneReportHeader *header;
3519    void *buf, *buf_p;
3520    size_t zone_entry_sz;
3521    int i;
3522
3523    req->status = NVME_SUCCESS;
3524
3525    status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3526    if (status) {
3527        return status;
3528    }
3529
3530    zra = dw13 & 0xff;
3531    if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3532        return NVME_INVALID_FIELD | NVME_DNR;
3533    }
3534    if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3535        return NVME_INVALID_FIELD | NVME_DNR;
3536    }
3537
3538    zrasf = (dw13 >> 8) & 0xff;
3539    if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3540        return NVME_INVALID_FIELD | NVME_DNR;
3541    }
3542
3543    if (data_size < sizeof(NvmeZoneReportHeader)) {
3544        return NVME_INVALID_FIELD | NVME_DNR;
3545    }
3546
3547    status = nvme_check_mdts(n, data_size);
3548    if (status) {
3549        return status;
3550    }
3551
3552    partial = (dw13 >> 16) & 0x01;
3553
3554    zone_entry_sz = sizeof(NvmeZoneDescr);
3555    if (zra == NVME_ZONE_REPORT_EXTENDED) {
3556        zone_entry_sz += ns->params.zd_extension_size;
3557    }
3558
3559    max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3560    buf = g_malloc0(data_size);
3561
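        /*
         * First pass: count the zones (from the starting zone) that match the
         * filter so that the report header carries the total; if a partial
         * report was requested, the count is capped at the number of
         * descriptors that fit in the buffer.
         */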
3562    zone = &ns->zone_array[zone_idx];
3563    for (i = zone_idx; i < ns->num_zones; i++) {
3564        if (partial && nr_zones >= max_zones) {
3565            break;
3566        }
3567        if (nvme_zone_matches_filter(zrasf, zone++)) {
3568            nr_zones++;
3569        }
3570    }
3571    header = (NvmeZoneReportHeader *)buf;
3572    header->nr_zones = cpu_to_le64(nr_zones);
3573
3574    buf_p = buf + sizeof(NvmeZoneReportHeader);
3575    for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3576        zone = &ns->zone_array[zone_idx];
3577        if (nvme_zone_matches_filter(zrasf, zone)) {
3578            z = (NvmeZoneDescr *)buf_p;
3579            buf_p += sizeof(NvmeZoneDescr);
3580
3581            z->zt = zone->d.zt;
3582            z->zs = zone->d.zs;
3583            z->zcap = cpu_to_le64(zone->d.zcap);
3584            z->zslba = cpu_to_le64(zone->d.zslba);
3585            z->za = zone->d.za;
3586
3587            if (nvme_wp_is_valid(zone)) {
3588                z->wp = cpu_to_le64(zone->d.wp);
3589            } else {
3590                z->wp = cpu_to_le64(~0ULL);
3591            }
3592
3593            if (zra == NVME_ZONE_REPORT_EXTENDED) {
3594                if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3595                    memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3596                           ns->params.zd_extension_size);
3597                }
3598                buf_p += ns->params.zd_extension_size;
3599            }
3600
3601            max_zones--;
3602        }
3603    }
3604
3605    status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3606
3607    g_free(buf);
3608
3609    return status;
3610}
3611
3612static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
3613{
3614    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3615    uint16_t status;
3616
3617    trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
3618                          req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
3619
3620    if (!nvme_nsid_valid(n, nsid)) {
3621        return NVME_INVALID_NSID | NVME_DNR;
3622    }
3623
3624    /*
3625     * In the base NVM command set, Flush may apply to all namespaces
3626     * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
3627     * along with TP 4056 (Namespace Types), the semantics become ambiguous.
3628     *
3629     * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
3630     * opcode with a specific command since we cannot determine a unique I/O
3631     * command set. Opcode 0h is not guaranteed to mean "flush"; suppose it
3632     * DOES have completely different semantics in some other command set -
3633     * does an NSID of FFFFFFFFh then
3634     * mean "for all namespaces, apply whatever command set specific command
3635     * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
3636     * whatever command that uses the 0h opcode if, and only if, it allows NSID
3637     * to be FFFFFFFFh"?
3638     *
3639     * Anyway (and luckily), for now, we do not care about this since the
3640     * device only supports namespace types that include the NVM Flush command
3641     * (NVM and Zoned), so always do an NVM Flush.
3642     */
3643    if (req->cmd.opcode == NVME_CMD_FLUSH) {
3644        return nvme_flush(n, req);
3645    }
3646
3647    req->ns = nvme_ns(n, nsid);
3648    if (unlikely(!req->ns)) {
3649        return NVME_INVALID_FIELD | NVME_DNR;
3650    }
3651
3652    if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
3653        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
3654        return NVME_INVALID_OPCODE | NVME_DNR;
3655    }
3656
3657    status = nvme_ns_status(req->ns);
3658    if (unlikely(status)) {
3659        return status;
3660    }
3661
3662    switch (req->cmd.opcode) {
3663    case NVME_CMD_WRITE_ZEROES:
3664        return nvme_write_zeroes(n, req);
3665    case NVME_CMD_ZONE_APPEND:
3666        return nvme_zone_append(n, req);
3667    case NVME_CMD_WRITE:
3668        return nvme_write(n, req);
3669    case NVME_CMD_READ:
3670        return nvme_read(n, req);
3671    case NVME_CMD_COMPARE:
3672        return nvme_compare(n, req);
3673    case NVME_CMD_DSM:
3674        return nvme_dsm(n, req);
3675    case NVME_CMD_VERIFY:
3676        return nvme_verify(n, req);
3677    case NVME_CMD_COPY:
3678        return nvme_copy(n, req);
3679    case NVME_CMD_ZONE_MGMT_SEND:
3680        return nvme_zone_mgmt_send(n, req);
3681    case NVME_CMD_ZONE_MGMT_RECV:
3682        return nvme_zone_mgmt_recv(n, req);
3683    default:
3684        assert(false);
3685    }
3686
3687    return NVME_INVALID_OPCODE | NVME_DNR;
3688}
3689
3690static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
3691{
3692    n->sq[sq->sqid] = NULL;
3693    timer_free(sq->timer);
3694    g_free(sq->io_req);
3695    if (sq->sqid) {
3696        g_free(sq);
3697    }
3698}
3699
3700static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
3701{
3702    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3703    NvmeRequest *r, *next;
3704    NvmeSQueue *sq;
3705    NvmeCQueue *cq;
3706    uint16_t qid = le16_to_cpu(c->qid);
3707    uint32_t nsid;
3708
3709    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
3710        trace_pci_nvme_err_invalid_del_sq(qid);
3711        return NVME_INVALID_QID | NVME_DNR;
3712    }
3713
3714    trace_pci_nvme_del_sq(qid);
3715
3716    sq = n->sq[qid];
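        /*
         * Cancel outstanding AIOs; blk_aio_cancel() waits for each AIO to
         * complete, and the completion callback removes the request from
         * out_req_list.
         */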
3717    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
3718        r = QTAILQ_FIRST(&sq->out_req_list);
3719        if (r->aiocb) {
3720            blk_aio_cancel(r->aiocb);
3721        }
3722    }
3723
3724    /*
3725     * Drain all namespaces if there are still outstanding requests that we
3726     * could not cancel explicitly.
3727     */
3728    if (!QTAILQ_EMPTY(&sq->out_req_list)) {
3729        for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
3730            NvmeNamespace *ns = nvme_ns(n, nsid);
3731            if (ns) {
3732                nvme_ns_drain(ns);
3733            }
3734        }
3735    }
3736
3737    assert(QTAILQ_EMPTY(&sq->out_req_list));
3738
3739    if (!nvme_check_cqid(n, sq->cqid)) {
3740        cq = n->cq[sq->cqid];
3741        QTAILQ_REMOVE(&cq->sq_list, sq, entry);
3742
3743        nvme_post_cqes(cq);
3744        QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
3745            if (r->sq == sq) {
3746                QTAILQ_REMOVE(&cq->req_list, r, entry);
3747                QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
3748            }
3749        }
3750    }
3751
3752    nvme_free_sq(sq, n);
3753    return NVME_SUCCESS;
3754}
3755
3756static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
3757                         uint16_t sqid, uint16_t cqid, uint16_t size)
3758{
3759    int i;
3760    NvmeCQueue *cq;
3761
3762    sq->ctrl = n;
3763    sq->dma_addr = dma_addr;
3764    sq->sqid = sqid;
3765    sq->size = size;
3766    sq->cqid = cqid;
3767    sq->head = sq->tail = 0;
3768    sq->io_req = g_new0(NvmeRequest, sq->size);
3769
3770    QTAILQ_INIT(&sq->req_list);
3771    QTAILQ_INIT(&sq->out_req_list);
3772    for (i = 0; i < sq->size; i++) {
3773        sq->io_req[i].sq = sq;
3774        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
3775    }
3776    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
3777
3778    assert(n->cq[cqid]);
3779    cq = n->cq[cqid];
3780    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
3781    n->sq[sqid] = sq;
3782}
3783
3784static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
3785{
3786    NvmeSQueue *sq;
3787    NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
3788
3789    uint16_t cqid = le16_to_cpu(c->cqid);
3790    uint16_t sqid = le16_to_cpu(c->sqid);
3791    uint16_t qsize = le16_to_cpu(c->qsize);
3792    uint16_t qflags = le16_to_cpu(c->sq_flags);
3793    uint64_t prp1 = le64_to_cpu(c->prp1);
3794
3795    trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
3796
3797    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
3798        trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
3799        return NVME_INVALID_CQID | NVME_DNR;
3800    }
3801    if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
3802        n->sq[sqid] != NULL)) {
3803        trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
3804        return NVME_INVALID_QID | NVME_DNR;
3805    }
3806    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
3807        trace_pci_nvme_err_invalid_create_sq_size(qsize);
3808        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
3809    }
3810    if (unlikely(prp1 & (n->page_size - 1))) {
3811        trace_pci_nvme_err_invalid_create_sq_addr(prp1);
3812        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
3813    }
3814    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
3815        trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
3816        return NVME_INVALID_FIELD | NVME_DNR;
3817    }
3818    sq = g_malloc0(sizeof(*sq));
3819    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
3820    return NVME_SUCCESS;
3821}
3822
3823struct nvme_stats {
3824    uint64_t units_read;
3825    uint64_t units_written;
3826    uint64_t read_commands;
3827    uint64_t write_commands;
3828};
3829
3830static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
3831{
3832    BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
3833
3834    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
3835    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
3836    stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
3837    stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
3838}
3839
3840static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3841                                uint64_t off, NvmeRequest *req)
3842{
3843    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3844    struct nvme_stats stats = { 0 };
3845    NvmeSmartLog smart = { 0 };
3846    uint32_t trans_len;
3847    NvmeNamespace *ns;
3848    time_t current_ms;
3849
3850    if (off >= sizeof(smart)) {
3851        return NVME_INVALID_FIELD | NVME_DNR;
3852    }
3853
3854    if (nsid != 0xffffffff) {
3855        ns = nvme_ns(n, nsid);
3856        if (!ns) {
3857            return NVME_INVALID_NSID | NVME_DNR;
3858        }
3859        nvme_set_blk_stats(ns, &stats);
3860    } else {
3861        int i;
3862
3863        for (i = 1; i <= n->num_namespaces; i++) {
3864            ns = nvme_ns(n, i);
3865            if (!ns) {
3866                continue;
3867            }
3868            nvme_set_blk_stats(ns, &stats);
3869        }
3870    }
3871
3872    trans_len = MIN(sizeof(smart) - off, buf_len);
3873    smart.critical_warning = n->smart_critical_warning;
3874
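        /*
         * Data Units Read/Written are reported in thousands of 512-byte units
         * (rounded up), per the SMART / Health Information log page format.
         */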
3875    smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
3876                                                        1000));
3877    smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
3878                                                           1000));
3879    smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
3880    smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
3881
3882    smart.temperature = cpu_to_le16(n->temperature);
3883
3884    if ((n->temperature >= n->features.temp_thresh_hi) ||
3885        (n->temperature <= n->features.temp_thresh_low)) {
3886        smart.critical_warning |= NVME_SMART_TEMPERATURE;
3887    }
3888
3889    current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
3890    smart.power_on_hours[0] =
3891        cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
3892
3893    if (!rae) {
3894        nvme_clear_events(n, NVME_AER_TYPE_SMART);
3895    }
3896
3897    return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
3898}
3899
3900static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
3901                                 NvmeRequest *req)
3902{
3903    uint32_t trans_len;
3904    NvmeFwSlotInfoLog fw_log = {
3905        .afi = 0x1,
3906    };
3907
3908    if (off >= sizeof(fw_log)) {
3909        return NVME_INVALID_FIELD | NVME_DNR;
3910    }
3911
3912    strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
3913    trans_len = MIN(sizeof(fw_log) - off, buf_len);
3914
3915    return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
3916}
3917
3918static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3919                                uint64_t off, NvmeRequest *req)
3920{
3921    uint32_t trans_len;
3922    NvmeErrorLog errlog;
3923
3924    if (off >= sizeof(errlog)) {
3925        return NVME_INVALID_FIELD | NVME_DNR;
3926    }
3927
3928    if (!rae) {
3929        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
3930    }
3931
3932    memset(&errlog, 0x0, sizeof(errlog));
3933    trans_len = MIN(sizeof(errlog) - off, buf_len);
3934
3935    return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
3936}
3937
3938static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3939                                    uint64_t off, NvmeRequest *req)
3940{
3941    uint32_t nslist[1024];
3942    uint32_t trans_len;
3943    int i = 0;
3944    uint32_t nsid;
3945
    if (off >= sizeof(nslist)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

3946    memset(nslist, 0x0, sizeof(nslist));
3947    trans_len = MIN(sizeof(nslist) - off, buf_len);
3948
3949    while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
3950            NVME_CHANGED_NSID_SIZE) {
3951        /*
3952          * If more than 1024 namespaces have changed, the spec requires the
3953          * first entry in the log page to be FFFFFFFFh and the rest zeroes.
3954         */
3955        if (i == ARRAY_SIZE(nslist)) {
3956            memset(nslist, 0x0, sizeof(nslist));
3957            nslist[0] = 0xffffffff;
3958            break;
3959        }
3960
3961        nslist[i++] = nsid;
3962        clear_bit(nsid, n->changed_nsids);
3963    }
3964
3965    /*
3966     * If the loop above exited early because more than 1024 namespaces
3967     * changed, clear the remaining changed-namespace bits as well.
3968     */
3969    if (nslist[0] == 0xffffffff) {
3970        bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
3971    }
3972
3973    if (!rae) {
3974        nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
3975    }
3976
3977    return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
3978}
3979
3980static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
3981                                 uint64_t off, NvmeRequest *req)
3982{
3983    NvmeEffectsLog log = {};
3984    const uint32_t *src_iocs = NULL;
3985    uint32_t trans_len;
3986
3987    if (off >= sizeof(log)) {
3988        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
3989        return NVME_INVALID_FIELD | NVME_DNR;
3990    }
3991
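        /*
         * The I/O command set effects reported depend on the command set
         * selected in CC.CSS: NVM-only reports the NVM command set, admin-only
         * reports no I/O commands, and with all command sets enabled the CSI
         * value from CDW14 selects the set to report.
         */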
3992    switch (NVME_CC_CSS(n->bar.cc)) {
3993    case NVME_CC_CSS_NVM:
3994        src_iocs = nvme_cse_iocs_nvm;
3995        /* fall through */
3996    case NVME_CC_CSS_ADMIN_ONLY:
3997        break;
3998    case NVME_CC_CSS_CSI:
3999        switch (csi) {
4000        case NVME_CSI_NVM:
4001            src_iocs = nvme_cse_iocs_nvm;
4002            break;
4003        case NVME_CSI_ZONED:
4004            src_iocs = nvme_cse_iocs_zoned;
4005            break;
4006        }
4007    }
4008
4009    memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4010
4011    if (src_iocs) {
4012        memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4013    }
4014
4015    trans_len = MIN(sizeof(log) - off, buf_len);
4016
4017    return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4018}
4019
4020static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4021{
4022    NvmeCmd *cmd = &req->cmd;
4023
4024    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4025    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4026    uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4027    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4028    uint8_t  lid = dw10 & 0xff;
4029    uint8_t  lsp = (dw10 >> 8) & 0xf;
4030    uint8_t  rae = (dw10 >> 15) & 0x1;
4031    uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
4032    uint32_t numdl, numdu;
4033    uint64_t off, lpol, lpou;
4034    size_t   len;
4035    uint16_t status;
4036
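        /*
         * NUMD is a zero-based dword count split across NUMDL (CDW10) and
         * NUMDU (CDW11); the log page offset is split across LPOL (CDW12) and
         * LPOU (CDW13).
         */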
4037    numdl = (dw10 >> 16);
4038    numdu = (dw11 & 0xffff);
4039    lpol = dw12;
4040    lpou = dw13;
4041
4042    len = (((numdu << 16) | numdl) + 1) << 2;
4043    off = (lpou << 32ULL) | lpol;
4044
4045    if (off & 0x3) {
4046        return NVME_INVALID_FIELD | NVME_DNR;
4047    }
4048
4049    trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4050
4051    status = nvme_check_mdts(n, len);
4052    if (status) {
4053        return status;
4054    }
4055
4056    switch (lid) {
4057    case NVME_LOG_ERROR_INFO:
4058        return nvme_error_info(n, rae, len, off, req);
4059    case NVME_LOG_SMART_INFO:
4060        return nvme_smart_info(n, rae, len, off, req);
4061    case NVME_LOG_FW_SLOT_INFO:
4062        return nvme_fw_log_info(n, len, off, req);
4063    case NVME_LOG_CHANGED_NSLIST:
4064        return nvme_changed_nslist(n, rae, len, off, req);
4065    case NVME_LOG_CMD_EFFECTS:
4066        return nvme_cmd_effects(n, csi, len, off, req);
4067    default:
4068        trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4069        return NVME_INVALID_FIELD | NVME_DNR;
4070    }
4071}
4072
4073static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4074{
4075    n->cq[cq->cqid] = NULL;
4076    timer_free(cq->timer);
4077    if (msix_enabled(&n->parent_obj)) {
4078        msix_vector_unuse(&n->parent_obj, cq->vector);
4079    }
4080    if (cq->cqid) {
4081        g_free(cq);
4082    }
4083}
4084
4085static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4086{
4087    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4088    NvmeCQueue *cq;
4089    uint16_t qid = le16_to_cpu(c->qid);
4090
4091    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4092        trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4093        return NVME_INVALID_CQID | NVME_DNR;
4094    }
4095
4096    cq = n->cq[qid];
4097    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4098        trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4099        return NVME_INVALID_QUEUE_DEL;
4100    }
4101
4102    if (cq->irq_enabled && cq->tail != cq->head) {
4103        n->cq_pending--;
4104    }
4105
4106    nvme_irq_deassert(n, cq);
4107    trace_pci_nvme_del_cq(qid);
4108    nvme_free_cq(cq, n);
4109    return NVME_SUCCESS;
4110}
4111
4112static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4113                         uint16_t cqid, uint16_t vector, uint16_t size,
4114                         uint16_t irq_enabled)
4115{
4116    int ret;
4117
4118    if (msix_enabled(&n->parent_obj)) {
4119        ret = msix_vector_use(&n->parent_obj, vector);
4120        assert(ret == 0);
4121    }
4122    cq->ctrl = n;
4123    cq->cqid = cqid;
4124    cq->size = size;
4125    cq->dma_addr = dma_addr;
4126    cq->phase = 1;
4127    cq->irq_enabled = irq_enabled;
4128    cq->vector = vector;
4129    cq->head = cq->tail = 0;
4130    QTAILQ_INIT(&cq->req_list);
4131    QTAILQ_INIT(&cq->sq_list);
4132    n->cq[cqid] = cq;
4133    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4134}
4135
4136static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4137{
4138    NvmeCQueue *cq;
4139    NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4140    uint16_t cqid = le16_to_cpu(c->cqid);
4141    uint16_t vector = le16_to_cpu(c->irq_vector);
4142    uint16_t qsize = le16_to_cpu(c->qsize);
4143    uint16_t qflags = le16_to_cpu(c->cq_flags);
4144    uint64_t prp1 = le64_to_cpu(c->prp1);
4145
4146    trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4147                             NVME_CQ_FLAGS_IEN(qflags) != 0);
4148
4149    if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4150        n->cq[cqid] != NULL)) {
4151        trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4152        return NVME_INVALID_QID | NVME_DNR;
4153    }
4154    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
4155        trace_pci_nvme_err_invalid_create_cq_size(qsize);
4156        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4157    }
4158    if (unlikely(prp1 & (n->page_size - 1))) {
4159        trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4160        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4161    }
4162    if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4163        trace_pci_nvme_err_invalid_create_cq_vector(vector);
4164        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4165    }
4166    if (unlikely(vector >= n->params.msix_qsize)) {
4167        trace_pci_nvme_err_invalid_create_cq_vector(vector);
4168        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4169    }
4170    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4171        trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4172        return NVME_INVALID_FIELD | NVME_DNR;
4173    }
4174
4175    cq = g_malloc0(sizeof(*cq));
4176    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4177                 NVME_CQ_FLAGS_IEN(qflags));
4178
4179    /*
4180     * It is only required to set qs_created when creating a completion queue;
4181     * creating a submission queue without a matching completion queue will
4182     * fail.
4183     */
4184    n->qs_created = true;
4185    return NVME_SUCCESS;
4186}
4187
4188static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4189{
4190    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4191
4192    return nvme_c2h(n, id, sizeof(id), req);
4193}
4194
4195static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
4196{
4197    switch (ns->csi) {
4198    case NVME_CSI_NVM:
4199    case NVME_CSI_ZONED:
4200        return true;
4201    }
4202    return false;
4203}
4204
4205static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4206{
4207    trace_pci_nvme_identify_ctrl();
4208
4209    return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4210}
4211
4212static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4213{
4214    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4215    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4216    NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4217
4218    trace_pci_nvme_identify_ctrl_csi(c->csi);
4219
4220    switch (c->csi) {
4221    case NVME_CSI_NVM:
4222        id_nvm->vsl = n->params.vsl;
4223        id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4224        break;
4225
4226    case NVME_CSI_ZONED:
4227        ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4228        break;
4229
4230    default:
4231        return NVME_INVALID_FIELD | NVME_DNR;
4232    }
4233
4234    return nvme_c2h(n, id, sizeof(id), req);
4235}
4236
4237static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4238{
4239    NvmeNamespace *ns;
4240    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4241    uint32_t nsid = le32_to_cpu(c->nsid);
4242
4243    trace_pci_nvme_identify_ns(nsid);
4244
4245    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4246        return NVME_INVALID_NSID | NVME_DNR;
4247    }
4248
4249    ns = nvme_ns(n, nsid);
4250    if (unlikely(!ns)) {
4251        if (!active) {
4252            ns = nvme_subsys_ns(n->subsys, nsid);
4253            if (!ns) {
4254                return nvme_rpt_empty_id_struct(n, req);
4255            }
4256        } else {
4257            return nvme_rpt_empty_id_struct(n, req);
4258        }
4259    }
4260
4261    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
4262        return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4263    }
4264
4265    return NVME_INVALID_CMD_SET | NVME_DNR;
4266}
4267
4268static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
4269{
4270    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4271    uint16_t min_id = le16_to_cpu(c->ctrlid);
4272    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4273    uint16_t *ids = &list[1];
4274    NvmeNamespace *ns;
4275    NvmeCtrl *ctrl;
4276    int cntlid, nr_ids = 0;
4277
4278    trace_pci_nvme_identify_ns_attached_list(min_id);
4279
4280    if (c->nsid == NVME_NSID_BROADCAST) {
4281        return NVME_INVALID_FIELD | NVME_DNR;
4282    }
4283
4284    ns = nvme_subsys_ns(n->subsys, c->nsid);
4285    if (!ns) {
4286        return NVME_INVALID_FIELD | NVME_DNR;
4287    }
4288
4289    for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4290        ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4291        if (!ctrl) {
4292            continue;
4293        }
4294
4295        if (!nvme_ns(ctrl, c->nsid)) {
4296            continue;
4297        }
4298
4299        ids[nr_ids++] = cntlid;
4300    }
4301
4302    list[0] = nr_ids;
4303
4304    return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4305}
4306
4307static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4308        bool active)
4309{
4310    NvmeNamespace *ns;
4311    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4312    uint32_t nsid = le32_to_cpu(c->nsid);
4313
4314    trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4315
4316    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4317        return NVME_INVALID_NSID | NVME_DNR;
4318    }
4319
4320    ns = nvme_ns(n, nsid);
4321    if (unlikely(!ns)) {
4322        if (!active) {
4323            ns = nvme_subsys_ns(n->subsys, nsid);
4324            if (!ns) {
4325                return nvme_rpt_empty_id_struct(n, req);
4326            }
4327        } else {
4328            return nvme_rpt_empty_id_struct(n, req);
4329        }
4330    }
4331
4332    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
4333        return nvme_rpt_empty_id_struct(n, req);
4334    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4335        return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4336                        req);
4337    }
4338
4339    return NVME_INVALID_FIELD | NVME_DNR;
4340}
4341
4342static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4343        bool active)
4344{
4345    NvmeNamespace *ns;
4346    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4347    uint32_t min_nsid = le32_to_cpu(c->nsid);
4348    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4349    static const int data_len = sizeof(list);
4350    uint32_t *list_ptr = (uint32_t *)list;
4351    int i, j = 0;
4352
4353    trace_pci_nvme_identify_nslist(min_nsid);
4354
4355    /*
4356     * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
4357     * since the Active Namespace ID List should return namespaces with ids
4358     * *higher* than the NSID specified in the command. This is also specified
4359     * in the spec (NVM Express v1.3d, Section 5.15.4).
4360     */
4361    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4362        return NVME_INVALID_NSID | NVME_DNR;
4363    }
4364
4365    for (i = 1; i <= n->num_namespaces; i++) {
4366        ns = nvme_ns(n, i);
4367        if (!ns) {
4368            if (!active) {
4369                ns = nvme_subsys_ns(n->subsys, i);
4370                if (!ns) {
4371                    continue;
4372                }
4373            } else {
4374                continue;
4375            }
4376        }
4377        if (ns->params.nsid <= min_nsid) {
4378            continue;
4379        }
4380        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4381        if (j == data_len / sizeof(uint32_t)) {
4382            break;
4383        }
4384    }
4385
4386    return nvme_c2h(n, list, data_len, req);
4387}
4388
4389static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4390        bool active)
4391{
4392    NvmeNamespace *ns;
4393    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4394    uint32_t min_nsid = le32_to_cpu(c->nsid);
4395    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4396    static const int data_len = sizeof(list);
4397    uint32_t *list_ptr = (uint32_t *)list;
4398    int i, j = 0;
4399
4400    trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4401
4402    /*
4403     * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
4404     */
4405    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4406        return NVME_INVALID_NSID | NVME_DNR;
4407    }
4408
4409    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4410        return NVME_INVALID_FIELD | NVME_DNR;
4411    }
4412
4413    for (i = 1; i <= n->num_namespaces; i++) {
4414        ns = nvme_ns(n, i);
4415        if (!ns) {
4416            if (!active) {
4417                ns = nvme_subsys_ns(n->subsys, i);
4418                if (!ns) {
4419                    continue;
4420                }
4421            } else {
4422                continue;
4423            }
4424        }
4425        if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4426            continue;
4427        }
4428        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4429        if (j == data_len / sizeof(uint32_t)) {
4430            break;
4431        }
4432    }
4433
4434    return nvme_c2h(n, list, data_len, req);
4435}
4436
4437static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4438{
4439    NvmeNamespace *ns;
4440    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4441    uint32_t nsid = le32_to_cpu(c->nsid);
4442    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4443
4444    struct data {
4445        struct {
4446            NvmeIdNsDescr hdr;
4447            uint8_t v[NVME_NIDL_UUID];
4448        } uuid;
4449        struct {
4450            NvmeIdNsDescr hdr;
4451            uint8_t v;
4452        } csi;
4453    };
4454
4455    struct data *ns_descrs = (struct data *)list;
4456
4457    trace_pci_nvme_identify_ns_descr_list(nsid);
4458
4459    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4460        return NVME_INVALID_NSID | NVME_DNR;
4461    }
4462
4463    ns = nvme_ns(n, nsid);
4464    if (unlikely(!ns)) {
4465        return NVME_INVALID_FIELD | NVME_DNR;
4466    }
4467
4468    /*
4469     * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
4470     * structure, a Namespace UUID (nidt = 3h) must be reported in the
4471     * Namespace Identification Descriptor. Add the namespace UUID here.
4472     */
4473    ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
4474    ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
4475    memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4476
4477    ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI;
4478    ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI;
4479    ns_descrs->csi.v = ns->csi;
4480
4481    return nvme_c2h(n, list, sizeof(list), req);
4482}
4483
4484static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4485{
4486    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4487    static const int data_len = sizeof(list);
4488
4489    trace_pci_nvme_identify_cmd_set();
4490
4491    NVME_SET_CSI(*list, NVME_CSI_NVM);
4492    NVME_SET_CSI(*list, NVME_CSI_ZONED);
4493
4494    return nvme_c2h(n, list, data_len, req);
4495}
4496
4497static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4498{
4499    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4500
4501    trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4502                            c->csi);
4503
4504    switch (c->cns) {
4505    case NVME_ID_CNS_NS:
4506        return nvme_identify_ns(n, req, true);
4507    case NVME_ID_CNS_NS_PRESENT:
4508        return nvme_identify_ns(n, req, false);
4509    case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4510        return nvme_identify_ns_attached_list(n, req);
4511    case NVME_ID_CNS_CS_NS:
4512        return nvme_identify_ns_csi(n, req, true);
4513    case NVME_ID_CNS_CS_NS_PRESENT:
4514        return nvme_identify_ns_csi(n, req, false);
4515    case NVME_ID_CNS_CTRL:
4516        return nvme_identify_ctrl(n, req);
4517    case NVME_ID_CNS_CS_CTRL:
4518        return nvme_identify_ctrl_csi(n, req);
4519    case NVME_ID_CNS_NS_ACTIVE_LIST:
4520        return nvme_identify_nslist(n, req, true);
4521    case NVME_ID_CNS_NS_PRESENT_LIST:
4522        return nvme_identify_nslist(n, req, false);
4523    case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4524        return nvme_identify_nslist_csi(n, req, true);
4525    case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4526        return nvme_identify_nslist_csi(n, req, false);
4527    case NVME_ID_CNS_NS_DESCR_LIST:
4528        return nvme_identify_ns_descr_list(n, req);
4529    case NVME_ID_CNS_IO_COMMAND_SET:
4530        return nvme_identify_cmd_set(n, req);
4531    default:
4532        trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4533        return NVME_INVALID_FIELD | NVME_DNR;
4534    }
4535}
4536
4537static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4538{
4539    uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4540
4541    req->cqe.result = 1;
4542    if (nvme_check_sqid(n, sqid)) {
4543        return NVME_INVALID_FIELD | NVME_DNR;
4544    }
4545
4546    return NVME_SUCCESS;
4547}
4548
4549static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4550{
4551    trace_pci_nvme_setfeat_timestamp(ts);
4552
4553    n->host_timestamp = le64_to_cpu(ts);
4554    n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4555}
4556
4557static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4558{
4559    uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4560    uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4561
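        /*
         * Timestamp feature data structure: a 48-bit timestamp in
         * milliseconds, a Synch bit, a 3-bit Timestamp Origin field and 12
         * reserved bits.
         */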
4562    union nvme_timestamp {
4563        struct {
4564            uint64_t timestamp:48;
4565            uint64_t sync:1;
4566            uint64_t origin:3;
4567            uint64_t rsvd1:12;
4568        };
4569        uint64_t all;
4570    };
4571
4572    union nvme_timestamp ts;
4573    ts.all = 0;
4574    ts.timestamp = n->host_timestamp + elapsed_time;
4575
4576    /* If the host timestamp is non-zero, set the timestamp origin */
4577    ts.origin = n->host_timestamp ? 0x01 : 0x00;
4578
4579    trace_pci_nvme_getfeat_timestamp(ts.all);
4580
4581    return cpu_to_le64(ts.all);
4582}
4583
4584static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4585{
4586    uint64_t timestamp = nvme_get_timestamp(n);
4587
4588    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4589}
4590
4591static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4592{
4593    NvmeCmd *cmd = &req->cmd;
4594    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4595    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4596    uint32_t nsid = le32_to_cpu(cmd->nsid);
4597    uint32_t result;
4598    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4599    NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4600    uint16_t iv;
4601    NvmeNamespace *ns;
4602    int i;
4603
4604    static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4605        [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4606    };
4607
4608    trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4609
4610    if (!nvme_feature_support[fid]) {
4611        return NVME_INVALID_FIELD | NVME_DNR;
4612    }
4613
4614    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4615        if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4616            /*
4617             * The Reservation Notification Mask and Reservation Persistence
4618             * features require a status code of Invalid Field in Command when
4619             * NSID is FFFFFFFFh. Since the device does not support those
4620             * features we can always return Invalid Namespace or Format as we
4621             * should do for all other features.
4622             */
4623            return NVME_INVALID_NSID | NVME_DNR;
4624        }
4625
4626        if (!nvme_ns(n, nsid)) {
4627            return NVME_INVALID_FIELD | NVME_DNR;
4628        }
4629    }
4630
4631    switch (sel) {
4632    case NVME_GETFEAT_SELECT_CURRENT:
4633        break;
4634    case NVME_GETFEAT_SELECT_SAVED:
4635        /* no features are saveable by the controller; fallthrough */
4636    case NVME_GETFEAT_SELECT_DEFAULT:
4637        goto defaults;
4638    case NVME_GETFEAT_SELECT_CAP:
4639        result = nvme_feature_cap[fid];
4640        goto out;
4641    }
4642
4643    switch (fid) {
4644    case NVME_TEMPERATURE_THRESHOLD:
4645        result = 0;
4646
4647        /*
4648         * The controller only implements the Composite Temperature sensor, so
4649         * return 0 for all other sensors.
4650         */
4651        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4652            goto out;
4653        }
4654
4655        switch (NVME_TEMP_THSEL(dw11)) {
4656        case NVME_TEMP_THSEL_OVER:
4657            result = n->features.temp_thresh_hi;
4658            goto out;
4659        case NVME_TEMP_THSEL_UNDER:
4660            result = n->features.temp_thresh_low;
4661            goto out;
4662        }
4663
4664        return NVME_INVALID_FIELD | NVME_DNR;
4665    case NVME_ERROR_RECOVERY:
4666        if (!nvme_nsid_valid(n, nsid)) {
4667            return NVME_INVALID_NSID | NVME_DNR;
4668        }
4669
4670        ns = nvme_ns(n, nsid);
4671        if (unlikely(!ns)) {
4672            return NVME_INVALID_FIELD | NVME_DNR;
4673        }
4674
4675        result = ns->features.err_rec;
4676        goto out;
4677    case NVME_VOLATILE_WRITE_CACHE:
4678        result = 0;
4679        for (i = 1; i <= n->num_namespaces; i++) {
4680            ns = nvme_ns(n, i);
4681            if (!ns) {
4682                continue;
4683            }
4684
4685            result = blk_enable_write_cache(ns->blkconf.blk);
4686            if (result) {
4687                break;
4688            }
4689        }
4690        trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4691        goto out;
4692    case NVME_ASYNCHRONOUS_EVENT_CONF:
4693        result = n->features.async_config;
4694        goto out;
4695    case NVME_TIMESTAMP:
4696        return nvme_get_feature_timestamp(n, req);
4697    default:
4698        break;
4699    }
4700
4701defaults:
4702    switch (fid) {
4703    case NVME_TEMPERATURE_THRESHOLD:
4704        result = 0;
4705
4706        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4707            break;
4708        }
4709
4710        if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4711            result = NVME_TEMPERATURE_WARNING;
4712        }
4713
4714        break;
4715    case NVME_NUMBER_OF_QUEUES:
4716        result = (n->params.max_ioqpairs - 1) |
4717            ((n->params.max_ioqpairs - 1) << 16);
4718        trace_pci_nvme_getfeat_numq(result);
4719        break;
4720    case NVME_INTERRUPT_VECTOR_CONF:
4721        iv = dw11 & 0xffff;
4722        if (iv >= n->params.max_ioqpairs + 1) {
4723            return NVME_INVALID_FIELD | NVME_DNR;
4724        }
4725
4726        result = iv;
4727        if (iv == n->admin_cq.vector) {
4728            result |= NVME_INTVC_NOCOALESCING;
4729        }
4730        break;
4731    case NVME_COMMAND_SET_PROFILE:
4732        result = 0;
4733        break;
4734    default:
4735        result = nvme_feature_default[fid];
4736        break;
4737    }
4738
4739out:
4740    req->cqe.result = cpu_to_le32(result);
4741    return NVME_SUCCESS;
4742}
4743
4744static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4745{
4746    uint16_t ret;
4747    uint64_t timestamp;
4748
4749    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4750    if (ret) {
4751        return ret;
4752    }
4753
4754    nvme_set_timestamp(n, timestamp);
4755
4756    return NVME_SUCCESS;
4757}
4758
4759static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4760{
4761    NvmeNamespace *ns = NULL;
4762
4763    NvmeCmd *cmd = &req->cmd;
4764    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4765    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4766    uint32_t nsid = le32_to_cpu(cmd->nsid);
4767    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4768    uint8_t save = NVME_SETFEAT_SAVE(dw10);
4769    int i;
4770
4771    trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
4772
4773    if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
4774        return NVME_FID_NOT_SAVEABLE | NVME_DNR;
4775    }
4776
4777    if (!nvme_feature_support[fid]) {
4778        return NVME_INVALID_FIELD | NVME_DNR;
4779    }
4780
4781    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4782        if (nsid != NVME_NSID_BROADCAST) {
4783            if (!nvme_nsid_valid(n, nsid)) {
4784                return NVME_INVALID_NSID | NVME_DNR;
4785            }
4786
4787            ns = nvme_ns(n, nsid);
4788            if (unlikely(!ns)) {
4789                return NVME_INVALID_FIELD | NVME_DNR;
4790            }
4791        }
4792    } else if (nsid && nsid != NVME_NSID_BROADCAST) {
4793        if (!nvme_nsid_valid(n, nsid)) {
4794            return NVME_INVALID_NSID | NVME_DNR;
4795        }
4796
4797        return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
4798    }
4799
4800    if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
4801        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
4802    }
4803
4804    switch (fid) {
4805    case NVME_TEMPERATURE_THRESHOLD:
4806        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4807            break;
4808        }
4809
4810        switch (NVME_TEMP_THSEL(dw11)) {
4811        case NVME_TEMP_THSEL_OVER:
4812            n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
4813            break;
4814        case NVME_TEMP_THSEL_UNDER:
4815            n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
4816            break;
4817        default:
4818            return NVME_INVALID_FIELD | NVME_DNR;
4819        }
4820
4821        if ((n->temperature >= n->features.temp_thresh_hi) ||
4822            (n->temperature <= n->features.temp_thresh_low)) {
4823            nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
4824        }
4825
4826        break;
4827    case NVME_ERROR_RECOVERY:
4828        if (nsid == NVME_NSID_BROADCAST) {
4829            for (i = 1; i <= n->num_namespaces; i++) {
4830                ns = nvme_ns(n, i);
4831
4832                if (!ns) {
4833                    continue;
4834                }
4835
4836                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
4837                    ns->features.err_rec = dw11;
4838                }
4839            }
4840
4841            break;
4842        }
4843
4844        assert(ns);
4845        if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
4846            ns->features.err_rec = dw11;
4847        }
4848        break;
4849    case NVME_VOLATILE_WRITE_CACHE:
4850        for (i = 1; i <= n->num_namespaces; i++) {
4851            ns = nvme_ns(n, i);
4852            if (!ns) {
4853                continue;
4854            }
4855
4856            if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
4857                blk_flush(ns->blkconf.blk);
4858            }
4859
4860            blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
4861        }
4862
4863        break;
4864
4865    case NVME_NUMBER_OF_QUEUES:
4866        if (n->qs_created) {
4867            return NVME_CMD_SEQ_ERROR | NVME_DNR;
4868        }
4869
4870        /*
4871         * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
4872         * and NSQR.
4873         */
4874        if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
4875            return NVME_INVALID_FIELD | NVME_DNR;
4876        }
4877
4878        trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
4879                                    ((dw11 >> 16) & 0xffff) + 1,
4880                                    n->params.max_ioqpairs,
4881                                    n->params.max_ioqpairs);
4882        req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
4883                                      ((n->params.max_ioqpairs - 1) << 16));
4884        break;
4885    case NVME_ASYNCHRONOUS_EVENT_CONF:
4886        n->features.async_config = dw11;
4887        break;
4888    case NVME_TIMESTAMP:
4889        return nvme_set_feature_timestamp(n, req);
4890    case NVME_COMMAND_SET_PROFILE:
4891        if (dw11 & 0x1ff) {
4892            trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
4893            return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
4894        }
4895        break;
4896    default:
4897        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
4898    }
4899    return NVME_SUCCESS;
4900}
4901
4902static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
4903{
4904    trace_pci_nvme_aer(nvme_cid(req));
4905
4906    if (n->outstanding_aers > n->params.aerl) {
4907        trace_pci_nvme_aer_aerl_exceeded();
4908        return NVME_AER_LIMIT_EXCEEDED;
4909    }
4910
4911    n->aer_reqs[n->outstanding_aers] = req;
4912    n->outstanding_aers++;
4913
4914    if (!QTAILQ_EMPTY(&n->aer_queue)) {
4915        nvme_process_aers(n);
4916    }
4917
4918    return NVME_NO_COMPLETE;
4919}
4920
4921static void nvme_update_dmrsl(NvmeCtrl *n)
4922{
4923    int nsid;
4924
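        /*
         * Update DMRSL (the Dataset Management range size limit, in logical
         * blocks) from the limits of the attached namespaces.
         */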
4925    for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
4926        NvmeNamespace *ns = nvme_ns(n, nsid);
4927        if (!ns) {
4928            continue;
4929        }
4930
4931        n->dmrsl = MIN_NON_ZERO(n->dmrsl,
4932                                BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
4933    }
4934}
4935
4936static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns);
4937static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
4938{
4939    NvmeNamespace *ns;
4940    NvmeCtrl *ctrl;
4941    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4942    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4943    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
4944    bool attach = !(dw10 & 0xf);
4945    uint16_t *nr_ids = &list[0];
4946    uint16_t *ids = &list[1];
4947    uint16_t ret;
4948    int i;
4949
4950    trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
4951
4952    if (!nvme_nsid_valid(n, nsid)) {
4953        return NVME_INVALID_NSID | NVME_DNR;
4954    }
4955
4956    ns = nvme_subsys_ns(n->subsys, nsid);
4957    if (!ns) {
4958        return NVME_INVALID_FIELD | NVME_DNR;
4959    }
4960
4961    ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
4962    if (ret) {
4963        return ret;
4964    }
4965
4966    if (!*nr_ids) {
4967        return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4968    }
4969
4970    *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
4971    for (i = 0; i < *nr_ids; i++) {
4972        ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
4973        if (!ctrl) {
4974            return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4975        }
4976
4977        if (attach) {
4978            if (nvme_ns(ctrl, nsid)) {
4979                return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
4980            }
4981
4982            if (ns->attached && !ns->params.shared) {
4983                return NVME_NS_PRIVATE | NVME_DNR;
4984            }
4985
4986            nvme_attach_ns(ctrl, ns);
4987            __nvme_select_ns_iocs(ctrl, ns);
4988        } else {
4989            if (!nvme_ns(ctrl, nsid)) {
4990                return NVME_NS_NOT_ATTACHED | NVME_DNR;
4991            }
4992
4993            ctrl->namespaces[nsid - 1] = NULL;
4994            ns->attached--;
4995
4996            nvme_update_dmrsl(ctrl);
4997        }
4998
4999        /*
5000         * Add the namespace ID to the changed namespace list and notify the
5001         * host; the list is cleared when read via the Get Log Page command.
5002         */
5003        if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5004            nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5005                               NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5006                               NVME_LOG_CHANGED_NSLIST);
5007        }
5008    }
5009
5010    return NVME_SUCCESS;
5011}
5012
5013static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf,
5014                               uint8_t mset, uint8_t pi, uint8_t pil,
5015                               NvmeRequest *req)
5016{
5017    int64_t len, offset;
5018    struct nvme_aio_format_ctx *ctx;
5019    BlockBackend *blk = ns->blkconf.blk;
5020    uint16_t ms;
5021    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
5022    int *count;
5023
5024    if (ns->params.zoned) {
5025        return NVME_INVALID_FORMAT | NVME_DNR;
5026    }
5027
5028    trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil);
5029
5030    if (lbaf > ns->id_ns.nlbaf) {
5031        return NVME_INVALID_FORMAT | NVME_DNR;
5032    }
5033
5034    ms = ns->id_ns.lbaf[lbaf].ms;
5035
5036    if (pi && (ms < sizeof(NvmeDifTuple))) {
5037        return NVME_INVALID_FORMAT | NVME_DNR;
5038    }
5039
5040    if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5041        return NVME_INVALID_FIELD | NVME_DNR;
5042    }
5043
5044    nvme_ns_drain(ns);
5045    nvme_ns_shutdown(ns);
5046    nvme_ns_cleanup(ns);
5047
5048    ns->id_ns.dps = (pil << 3) | pi;
5049    ns->id_ns.flbas = lbaf | (mset << 4);
5050
5051    nvme_ns_init_format(ns);
5052
5053    ns->status = NVME_FORMAT_IN_PROGRESS;
5054
5055    len = ns->size;
5056    offset = 0;
5057
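        /*
         * Two reference counters track completion: "count" is 1-initialized
         * and incremented for every pwrite_zeroes AIO issued below, and
         * "num_formats" (req->opaque, 1-initialized in nvme_format()) counts
         * the namespaces with a format still in progress.
         */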
5058    count = g_new(int, 1);
5059    *count = 1;
5060
5061    (*num_formats)++;
5062
5063    while (len) {
5064        ctx = g_new(struct nvme_aio_format_ctx, 1);
5065        ctx->req = req;
5066        ctx->ns = ns;
5067        ctx->count = count;
5068
5069        size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
5070
5071        (*count)++;
5072
5073        blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP,
5074                              nvme_aio_format_cb, ctx);
5075
5076        offset += bytes;
5077        len -= bytes;
5078
5079    }
5080
5081    if (--(*count)) {
5082        return NVME_NO_COMPLETE;
5083    }
5084
5085    g_free(count);
5086    ns->status = 0x0;
5087    (*num_formats)--;
5088
5089    return NVME_SUCCESS;
5090}
5091
5092static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5093{
5094    NvmeNamespace *ns;
5095    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5096    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5097    uint8_t lbaf = dw10 & 0xf;
5098    uint8_t mset = (dw10 >> 4) & 0x1;
5099    uint8_t pi = (dw10 >> 5) & 0x7;
5100    uint8_t pil = (dw10 >> 8) & 0x1;
5101    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
5102    uint16_t status;
5103    int i;
5104
5105    trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil);
5106
5107    /* 1-initialize; see the comment in nvme_dsm */
5108    *num_formats = 1;
5109
5110    if (nsid != NVME_NSID_BROADCAST) {
5111        if (!nvme_nsid_valid(n, nsid)) {
5112            return NVME_INVALID_NSID | NVME_DNR;
5113        }
5114
5115        ns = nvme_ns(n, nsid);
5116        if (!ns) {
5117            return NVME_INVALID_FIELD | NVME_DNR;
5118        }
5119
5120        status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
5121        if (status && status != NVME_NO_COMPLETE) {
5122            req->status = status;
5123        }
5124    } else {
5125        for (i = 1; i <= n->num_namespaces; i++) {
5126            ns = nvme_ns(n, i);
5127            if (!ns) {
5128                continue;
5129            }
5130
5131            status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
5132            if (status && status != NVME_NO_COMPLETE) {
5133                req->status = status;
5134                break;
5135            }
5136        }
5137    }
5138
5139    /* account for the 1-initialization */
5140    if (--(*num_formats)) {
5141        return NVME_NO_COMPLETE;
5142    }
5143
5144    return req->status;
5145}
5146
5147static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5148{
5149    trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5150                             nvme_adm_opc_str(req->cmd.opcode));
5151
5152    if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5153        trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5154        return NVME_INVALID_OPCODE | NVME_DNR;
5155    }
5156
5157    /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5158    if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5159        return NVME_INVALID_FIELD | NVME_DNR;
5160    }
5161
5162    switch (req->cmd.opcode) {
5163    case NVME_ADM_CMD_DELETE_SQ:
5164        return nvme_del_sq(n, req);
5165    case NVME_ADM_CMD_CREATE_SQ:
5166        return nvme_create_sq(n, req);
5167    case NVME_ADM_CMD_GET_LOG_PAGE:
5168        return nvme_get_log(n, req);
5169    case NVME_ADM_CMD_DELETE_CQ:
5170        return nvme_del_cq(n, req);
5171    case NVME_ADM_CMD_CREATE_CQ:
5172        return nvme_create_cq(n, req);
5173    case NVME_ADM_CMD_IDENTIFY:
5174        return nvme_identify(n, req);
5175    case NVME_ADM_CMD_ABORT:
5176        return nvme_abort(n, req);
5177    case NVME_ADM_CMD_SET_FEATURES:
5178        return nvme_set_feature(n, req);
5179    case NVME_ADM_CMD_GET_FEATURES:
5180        return nvme_get_feature(n, req);
5181    case NVME_ADM_CMD_ASYNC_EV_REQ:
5182        return nvme_aer(n, req);
5183    case NVME_ADM_CMD_NS_ATTACHMENT:
5184        return nvme_ns_attachment(n, req);
5185    case NVME_ADM_CMD_FORMAT_NVM:
5186        return nvme_format(n, req);
5187    default:
5188        assert(false);
5189    }
5190
5191    return NVME_INVALID_OPCODE | NVME_DNR;
5192}
5193
5194static void nvme_process_sq(void *opaque)
5195{
5196    NvmeSQueue *sq = opaque;
5197    NvmeCtrl *n = sq->ctrl;
5198    NvmeCQueue *cq = n->cq[sq->cqid];
5199
5200    uint16_t status;
5201    hwaddr addr;
5202    NvmeCmd cmd;
5203    NvmeRequest *req;
5204
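        /*
         * Consume submission queue entries while the queue is non-empty and
         * free request slots are available: read the command at the current
         * head, move a request from the free list to the outstanding list,
         * dispatch it to the admin or I/O handler and, unless the handler
         * deferred completion with NVME_NO_COMPLETE, enqueue the completion.
         */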
5205    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5206        addr = sq->dma_addr + sq->head * n->sqe_size;
5207        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5208            trace_pci_nvme_err_addr_read(addr);
5209            trace_pci_nvme_err_cfs();
5210            n->bar.csts = NVME_CSTS_FAILED;
5211            break;
5212        }
5213        nvme_inc_sq_head(sq);
5214
5215        req = QTAILQ_FIRST(&sq->req_list);
5216        QTAILQ_REMOVE(&sq->req_list, req, entry);
5217        QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5218        nvme_req_clear(req);
5219        req->cqe.cid = cmd.cid;
5220        memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5221
5222        status = sq->sqid ? nvme_io_cmd(n, req) :
5223            nvme_admin_cmd(n, req);
5224        if (status != NVME_NO_COMPLETE) {
5225            req->status = status;
5226            nvme_enqueue_req_completion(cq, req);
5227        }
5228    }
5229}
5230
5231static void nvme_ctrl_reset(NvmeCtrl *n)
5232{
5233    NvmeNamespace *ns;
5234    int i;
5235
5236    for (i = 1; i <= n->num_namespaces; i++) {
5237        ns = nvme_ns(n, i);
5238        if (!ns) {
5239            continue;
5240        }
5241
5242        nvme_ns_drain(ns);
5243    }
5244
5245    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5246        if (n->sq[i] != NULL) {
5247            nvme_free_sq(n->sq[i], n);
5248        }
5249    }
5250    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5251        if (n->cq[i] != NULL) {
5252            nvme_free_cq(n->cq[i], n);
5253        }
5254    }
5255
5256    while (!QTAILQ_EMPTY(&n->aer_queue)) {
5257        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5258        QTAILQ_REMOVE(&n->aer_queue, event, entry);
5259        g_free(event);
5260    }
5261
5262    n->aer_queued = 0;
5263    n->outstanding_aers = 0;
5264    n->qs_created = false;
5265
5266    n->bar.cc = 0;
5267}
5268
5269static void nvme_ctrl_shutdown(NvmeCtrl *n)
5270{
5271    NvmeNamespace *ns;
5272    int i;
5273
5274    if (n->pmr.dev) {
5275        memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5276    }
5277
5278    for (i = 1; i <= n->num_namespaces; i++) {
5279        ns = nvme_ns(n, i);
5280        if (!ns) {
5281            continue;
5282        }
5283
5284        nvme_ns_shutdown(ns);
5285    }
5286}
5287
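/*
 * Select the I/O commands a namespace exposes based on CC.CSS: with Admin
 * Only selected no I/O commands are exposed; a zoned namespace falls back to
 * the plain NVM command set when CSS selects NVM, and only exposes the zoned
 * command set when "all supported command sets" (CSI) is selected.
 */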
5288static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns)
5289{
5290    ns->iocs = nvme_cse_iocs_none;
5291    switch (ns->csi) {
5292    case NVME_CSI_NVM:
5293        if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
5294            ns->iocs = nvme_cse_iocs_nvm;
5295        }
5296        break;
5297    case NVME_CSI_ZONED:
5298        if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
5299            ns->iocs = nvme_cse_iocs_zoned;
5300        } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
5301            ns->iocs = nvme_cse_iocs_nvm;
5302        }
5303        break;
5304    }
5305}
5306
5307static void nvme_select_ns_iocs(NvmeCtrl *n)
5308{
5309    NvmeNamespace *ns;
5310    int i;
5311
5312    for (i = 1; i <= n->num_namespaces; i++) {
5313        ns = nvme_ns(n, i);
5314        if (!ns) {
5315            continue;
5316        }
5317
5318        __nvme_select_ns_iocs(n, ns);
5319    }
5320}
5321
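/*
 * Controller enable (CC.EN transitions from 0 to 1). Validate the admin
 * queue addresses and the CC fields against CAP and the Identify Controller
 * limits before setting up the admin queue pair. The memory page size is
 * derived from CC.MPS as 2^(12 + MPS), e.g. MPS = 0 yields the minimum page
 * size of 4 KiB.
 */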
5322static int nvme_start_ctrl(NvmeCtrl *n)
5323{
5324    uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
5325    uint32_t page_size = 1 << page_bits;
5326
5327    if (unlikely(n->cq[0])) {
5328        trace_pci_nvme_err_startfail_cq();
5329        return -1;
5330    }
5331    if (unlikely(n->sq[0])) {
5332        trace_pci_nvme_err_startfail_sq();
5333        return -1;
5334    }
5335    if (unlikely(!n->bar.asq)) {
5336        trace_pci_nvme_err_startfail_nbarasq();
5337        return -1;
5338    }
5339    if (unlikely(!n->bar.acq)) {
5340        trace_pci_nvme_err_startfail_nbaracq();
5341        return -1;
5342    }
5343    if (unlikely(n->bar.asq & (page_size - 1))) {
5344        trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq);
5345        return -1;
5346    }
5347    if (unlikely(n->bar.acq & (page_size - 1))) {
5348        trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq);
5349        return -1;
5350    }
5351    if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) {
5352        trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc));
5353        return -1;
5354    }
5355    if (unlikely(NVME_CC_MPS(n->bar.cc) <
5356                 NVME_CAP_MPSMIN(n->bar.cap))) {
5357        trace_pci_nvme_err_startfail_page_too_small(
5358                    NVME_CC_MPS(n->bar.cc),
5359                    NVME_CAP_MPSMIN(n->bar.cap));
5360        return -1;
5361    }
5362    if (unlikely(NVME_CC_MPS(n->bar.cc) >
5363                 NVME_CAP_MPSMAX(n->bar.cap))) {
5364        trace_pci_nvme_err_startfail_page_too_large(
5365                    NVME_CC_MPS(n->bar.cc),
5366                    NVME_CAP_MPSMAX(n->bar.cap));
5367        return -1;
5368    }
5369    if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
5370                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5371        trace_pci_nvme_err_startfail_cqent_too_small(
5372                    NVME_CC_IOCQES(n->bar.cc),
5373                    NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
5374        return -1;
5375    }
5376    if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
5377                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5378        trace_pci_nvme_err_startfail_cqent_too_large(
5379                    NVME_CC_IOCQES(n->bar.cc),
5380                    NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
5381        return -1;
5382    }
5383    if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
5384                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5385        trace_pci_nvme_err_startfail_sqent_too_small(
5386                    NVME_CC_IOSQES(n->bar.cc),
5387                    NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
5388        return -1;
5389    }
5390    if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
5391                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5392        trace_pci_nvme_err_startfail_sqent_too_large(
5393                    NVME_CC_IOSQES(n->bar.cc),
5394                    NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
5395        return -1;
5396    }
5397    if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
5398        trace_pci_nvme_err_startfail_asqent_sz_zero();
5399        return -1;
5400    }
5401    if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
5402        trace_pci_nvme_err_startfail_acqent_sz_zero();
5403        return -1;
5404    }
5405
5406    n->page_bits = page_bits;
5407    n->page_size = page_size;
5408    n->max_prp_ents = n->page_size / sizeof(uint64_t);
5409    n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
5410    n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
5411    nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
5412                 NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
5413    nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
5414                 NVME_AQA_ASQS(n->bar.aqa) + 1);
5415
5416    nvme_set_timestamp(n, 0ULL);
5417
5418    QTAILQ_INIT(&n->aer_queue);
5419
5420    nvme_select_ns_iocs(n);
5421
5422    return 0;
5423}
5424
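/*
 * Program CMBLOC and CMBSZ. The size unit (SZU) is set to 2, i.e. the SZ
 * field is expressed in MiB, matching the cmb_size_mb device parameter.
 */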
5425static void nvme_cmb_enable_regs(NvmeCtrl *n)
5426{
5427    NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
5428    NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
5429    NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
5430
5431    NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
5432    NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
5433    NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
5434    NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
5435    NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
5436    NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
5437    NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
5438}
5439
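/*
 * Handle writes to the controller register file (the part of BAR0 below the
 * doorbell registers). Most registers are 32 bits wide; ASQ, ACQ, CMBMSC and
 * PMRMSC are 64 bits and may be written either as a single 8-byte access or
 * as two 4-byte accesses (low dword at the base offset, high dword at
 * offset + 4).
 */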
5440static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5441                           unsigned size)
5442{
5443    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5444        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5445                       "MMIO write not 32-bit aligned,"
5446                       " offset=0x%"PRIx64"", offset);
5447        /* should be ignored, fall through for now */
5448    }
5449
5450    if (unlikely(size < sizeof(uint32_t))) {
5451        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5452                       "MMIO write smaller than 32-bits,"
5453                       " offset=0x%"PRIx64", size=%u",
5454                       offset, size);
5455        /* should be ignored, fall through for now */
5456    }
5457
5458    switch (offset) {
5459    case 0xc:   /* INTMS */
5460        if (unlikely(msix_enabled(&(n->parent_obj)))) {
5461            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5462                           "undefined access to interrupt mask set"
5463                           " when MSI-X is enabled");
5464            /* should be ignored, fall through for now */
5465        }
5466        n->bar.intms |= data & 0xffffffff;
5467        n->bar.intmc = n->bar.intms;
5468        trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc);
5469        nvme_irq_check(n);
5470        break;
5471    case 0x10:  /* INTMC */
5472        if (unlikely(msix_enabled(&(n->parent_obj)))) {
5473            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5474                           "undefined access to interrupt mask clr"
5475                           " when MSI-X is enabled");
5476            /* should be ignored, fall through for now */
5477        }
5478        n->bar.intms &= ~(data & 0xffffffff);
5479        n->bar.intmc = n->bar.intms;
5480        trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc);
5481        nvme_irq_check(n);
5482        break;
5483    case 0x14:  /* CC */
5484        trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5485        /* Windows first sends data, then sends enable bit */
5486        if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
5487            !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
5488        {
5489            n->bar.cc = data;
5490        }
5491
5492        if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
5493            n->bar.cc = data;
5494            if (unlikely(nvme_start_ctrl(n))) {
5495                trace_pci_nvme_err_startfail();
5496                n->bar.csts = NVME_CSTS_FAILED;
5497            } else {
5498                trace_pci_nvme_mmio_start_success();
5499                n->bar.csts = NVME_CSTS_READY;
5500            }
5501        } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
5502            trace_pci_nvme_mmio_stopped();
5503            nvme_ctrl_reset(n);
5504            n->bar.csts &= ~NVME_CSTS_READY;
5505        }
5506        if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
5507            trace_pci_nvme_mmio_shutdown_set();
5508            nvme_ctrl_shutdown(n);
5509            n->bar.cc = data;
5510            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
5511        } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
5512            trace_pci_nvme_mmio_shutdown_cleared();
5513            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
5514            n->bar.cc = data;
5515        }
5516        break;
5517    case 0x1c:  /* CSTS */
5518        if (data & (1 << 4)) {
5519            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5520                           "attempted to W1C CSTS.NSSRO"
5521                           " but CAP.NSSRS is zero (not supported)");
5522        } else if (data != 0) {
5523            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5524                           "attempted to set a read only bit"
5525                           " of controller status");
5526        }
5527        break;
5528    case 0x20:  /* NSSR */
5529        if (data == 0x4e564d65) {
5530            trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5531        } else {
5532            /* The spec says that writes of other values have no effect */
5533            return;
5534        }
5535        break;
5536    case 0x24:  /* AQA */
5537        n->bar.aqa = data & 0xffffffff;
5538        trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5539        break;
5540    case 0x28:  /* ASQ */
5541        n->bar.asq = size == 8 ? data :
5542            (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff);
5543        trace_pci_nvme_mmio_asqaddr(data);
5544        break;
5545    case 0x2c:  /* ASQ hi */
5546        n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
5547        trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
5548        break;
5549    case 0x30:  /* ACQ */
5550        trace_pci_nvme_mmio_acqaddr(data);
5551        n->bar.acq = size == 8 ? data :
5552            (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff);
5553        break;
5554    case 0x34:  /* ACQ hi */
5555        n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
5556        trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
5557        break;
5558    case 0x38:  /* CMBLOC */
5559        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5560                       "invalid write to reserved CMBLOC"
5561                       " when CMBSZ is zero, ignored");
5562        return;
5563    case 0x3c:  /* CMBSZ */
5564        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5565                       "invalid write to read only CMBSZ, ignored");
5566        return;
5567    case 0x50:  /* CMBMSC */
5568        if (!NVME_CAP_CMBS(n->bar.cap)) {
5569            return;
5570        }
5571
5572        n->bar.cmbmsc = size == 8 ? data :
5573            (n->bar.cmbmsc & ~0xffffffffULL) | (data & 0xffffffff);
5574        n->cmb.cmse = false;
5575
5576        if (NVME_CMBMSC_CRE(data)) {
5577            nvme_cmb_enable_regs(n);
5578
5579            if (NVME_CMBMSC_CMSE(data)) {
5580                hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT;
5581                if (cba + int128_get64(n->cmb.mem.size) < cba) {
5582                    NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1);
5583                    return;
5584                }
5585
5586                n->cmb.cba = cba;
5587                n->cmb.cmse = true;
5588            }
5589        } else {
5590            n->bar.cmbsz = 0;
5591            n->bar.cmbloc = 0;
5592        }
5593
5594        return;
5595    case 0x54:  /* CMBMSC hi */
5596        n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32);
5597        return;
5598
5599    case 0xe00: /* PMRCAP */
5600        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5601                       "invalid write to PMRCAP register, ignored");
5602        return;
5603    case 0xe04: /* PMRCTL */
5604        if (!NVME_CAP_PMRS(n->bar.cap)) {
5605            return;
5606        }
5607
5608        n->bar.pmrctl = data;
5609        if (NVME_PMRCTL_EN(data)) {
5610            memory_region_set_enabled(&n->pmr.dev->mr, true);
5611            n->bar.pmrsts = 0;
5612        } else {
5613            memory_region_set_enabled(&n->pmr.dev->mr, false);
5614            NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
5615            n->pmr.cmse = false;
5616        }
5617        return;
5618    case 0xe08: /* PMRSTS */
5619        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5620                       "invalid write to PMRSTS register, ignored");
5621        return;
5622    case 0xe0c: /* PMREBS */
5623        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5624                       "invalid write to PMREBS register, ignored");
5625        return;
5626    case 0xe10: /* PMRSWTP */
5627        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5628                       "invalid write to PMRSWTP register, ignored");
5629        return;
5630    case 0xe14: /* PMRMSCL */
5631        if (!NVME_CAP_PMRS(n->bar.cap)) {
5632            return;
5633        }
5634
5635        n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffffULL) | (data & 0xffffffff);
5636        n->pmr.cmse = false;
5637
5638        if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) {
5639            hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT;
5640            if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5641                NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1);
5642                return;
5643            }
5644
5645            n->pmr.cmse = true;
5646            n->pmr.cba = cba;
5647        }
5648
5649        return;
5650    case 0xe18: /* PMRMSCU */
5651        if (!NVME_CAP_PMRS(n->bar.cap)) {
5652            return;
5653        }
5654
5655        n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32);
5656        return;
5657    default:
5658        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5659                       "invalid MMIO write,"
5660                       " offset=0x%"PRIx64", data=%"PRIx64"",
5661                       offset, data);
5662        break;
5663    }
5664}
5665
5666static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5667{
5668    NvmeCtrl *n = (NvmeCtrl *)opaque;
5669    uint8_t *ptr = (uint8_t *)&n->bar;
5670    uint64_t val = 0;
5671
5672    trace_pci_nvme_mmio_read(addr, size);
5673
5674    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5675        NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5676                       "MMIO read not 32-bit aligned,"
5677                       " offset=0x%"PRIx64"", addr);
5678        /* should RAZ, fall through for now */
5679    } else if (unlikely(size < sizeof(uint32_t))) {
5680        NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
5681                       "MMIO read smaller than 32-bits,"
5682                       " offset=0x%"PRIx64"", addr);
5683        /* should RAZ, fall through for now */
5684    }
5685
5686    if (addr < sizeof(n->bar)) {
5687        /*
5688          * When PMRWBM bit 1 is set, a read from PMRSTS should ensure
5689          * that prior writes have reached persistent media before the
5690          * register value is returned
5691         */
5692        if (addr == 0xe08 &&
5693            (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
5694            memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5695        }
5696        memcpy(&val, ptr + addr, size);
5697    } else {
5698        NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
5699                       "MMIO read beyond last register,"
5700                       " offset=0x%"PRIx64", returning 0", addr);
5701    }
5702
5703    return val;
5704}
5705
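/*
 * Doorbell writes. With the minimum doorbell stride (CAP.DSTRD = 0), the
 * submission queue tail doorbell for queue `qid` is at offset
 * 0x1000 + qid * 8 and the completion queue head doorbell at
 * 0x1000 + qid * 8 + 4, so an odd 32-bit doorbell index selects a completion
 * queue. For example, for queue pair 1:
 *
 *     SQ 1 tail doorbell: 0x1008
 *     CQ 1 head doorbell: 0x100c
 */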
5706static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
5707{
5708    uint32_t qid;
5709
5710    if (unlikely(addr & ((1 << 2) - 1))) {
5711        NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
5712                       "doorbell write not 32-bit aligned,"
5713                       " offset=0x%"PRIx64", ignoring", addr);
5714        return;
5715    }
5716
5717    if (((addr - 0x1000) >> 2) & 1) {
5718        /* Completion queue doorbell write */
5719
5720        uint16_t new_head = val & 0xffff;
5721        int start_sqs;
5722        NvmeCQueue *cq;
5723
5724        qid = (addr - (0x1000 + (1 << 2))) >> 3;
5725        if (unlikely(nvme_check_cqid(n, qid))) {
5726            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
5727                           "completion queue doorbell write"
5728                           " for nonexistent queue,"
5729                           " cqid=%"PRIu32", ignoring", qid);
5730
5731            /*
5732             * NVM Express v1.3d, Section 4.1 states: "If host software writes
5733             * an invalid value to the Submission Queue Tail Doorbell or
5734             * Completion Queue Head Doorbell register and an Asynchronous Event
5735             * Request command is outstanding, then an asynchronous event is
5736             * posted to the Admin Completion Queue with a status code of
5737             * Invalid Doorbell Write Value."
5738             *
5739             * Also note that the spec includes the "Invalid Doorbell Register"
5740             * status code, but nowhere does it specify when to use it.
5741             * However, it seems reasonable to use it here in a similar
5742             * fashion.
5743             */
5744            if (n->outstanding_aers) {
5745                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5746                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
5747                                   NVME_LOG_ERROR_INFO);
5748            }
5749
5750            return;
5751        }
5752
5753        cq = n->cq[qid];
5754        if (unlikely(new_head >= cq->size)) {
5755            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
5756                           "completion queue doorbell write value"
5757                           " beyond queue size, cqid=%"PRIu32","
5758                           " new_head=%"PRIu16", ignoring",
5759                           qid, new_head);
5760
5761            if (n->outstanding_aers) {
5762                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5763                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
5764                                   NVME_LOG_ERROR_INFO);
5765            }
5766
5767            return;
5768        }
5769
5770        trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
5771
5772        start_sqs = nvme_cq_full(cq) ? 1 : 0;
5773        cq->head = new_head;
5774        if (start_sqs) {
5775            NvmeSQueue *sq;
5776            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
5777                timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5778            }
5779            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5780        }
5781
5782        if (cq->tail == cq->head) {
5783            if (cq->irq_enabled) {
5784                n->cq_pending--;
5785            }
5786
5787            nvme_irq_deassert(n, cq);
5788        }
5789    } else {
5790        /* Submission queue doorbell write */
5791
5792        uint16_t new_tail = val & 0xffff;
5793        NvmeSQueue *sq;
5794
5795        qid = (addr - 0x1000) >> 3;
5796        if (unlikely(nvme_check_sqid(n, qid))) {
5797            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
5798                           "submission queue doorbell write"
5799                           " for nonexistent queue,"
5800                           " sqid=%"PRIu32", ignoring", qid);
5801
5802            if (n->outstanding_aers) {
5803                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5804                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
5805                                   NVME_LOG_ERROR_INFO);
5806            }
5807
5808            return;
5809        }
5810
5811        sq = n->sq[qid];
5812        if (unlikely(new_tail >= sq->size)) {
5813            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
5814                           "submission queue doorbell write value"
5815                           " beyond queue size, sqid=%"PRIu32","
5816                           " new_tail=%"PRIu16", ignoring",
5817                           qid, new_tail);
5818
5819            if (n->outstanding_aers) {
5820                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5821                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
5822                                   NVME_LOG_ERROR_INFO);
5823            }
5824
5825            return;
5826        }
5827
5828        trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
5829
5830        sq->tail = new_tail;
5831        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5832    }
5833}
5834
5835static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
5836                            unsigned size)
5837{
5838    NvmeCtrl *n = (NvmeCtrl *)opaque;
5839
5840    trace_pci_nvme_mmio_write(addr, data, size);
5841
5842    if (addr < sizeof(n->bar)) {
5843        nvme_write_bar(n, addr, data, size);
5844    } else {
5845        nvme_process_db(n, addr, data);
5846    }
5847}
5848
5849static const MemoryRegionOps nvme_mmio_ops = {
5850    .read = nvme_mmio_read,
5851    .write = nvme_mmio_write,
5852    .endianness = DEVICE_LITTLE_ENDIAN,
5853    .impl = {
5854        .min_access_size = 2,
5855        .max_access_size = 8,
5856    },
5857};
5858
5859static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
5860                           unsigned size)
5861{
5862    NvmeCtrl *n = (NvmeCtrl *)opaque;
5863    stn_le_p(&n->cmb.buf[addr], size, data);
5864}
5865
5866static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
5867{
5868    NvmeCtrl *n = (NvmeCtrl *)opaque;
5869    return ldn_le_p(&n->cmb.buf[addr], size);
5870}
5871
5872static const MemoryRegionOps nvme_cmb_ops = {
5873    .read = nvme_cmb_read,
5874    .write = nvme_cmb_write,
5875    .endianness = DEVICE_LITTLE_ENDIAN,
5876    .impl = {
5877        .min_access_size = 1,
5878        .max_access_size = 8,
5879    },
5880};
5881
5882static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
5883{
5884    NvmeParams *params = &n->params;
5885
5886    if (params->num_queues) {
5887        warn_report("num_queues is deprecated; please use max_ioqpairs "
5888                    "instead");
5889
5890        params->max_ioqpairs = params->num_queues - 1;
5891    }
5892
5893    if (n->namespace.blkconf.blk && n->subsys) {
5894        error_setg(errp, "subsystem support is unavailable with legacy "
5895                   "namespace ('drive' property)");
5896        return;
5897    }
5898
5899    if (params->max_ioqpairs < 1 ||
5900        params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
5901        error_setg(errp, "max_ioqpairs must be between 1 and %d",
5902                   NVME_MAX_IOQPAIRS);
5903        return;
5904    }
5905
5906    if (params->msix_qsize < 1 ||
5907        params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
5908        error_setg(errp, "msix_qsize must be between 1 and %d",
5909                   PCI_MSIX_FLAGS_QSIZE + 1);
5910        return;
5911    }
5912
5913    if (!params->serial) {
5914        error_setg(errp, "serial property not set");
5915        return;
5916    }
5917
5918    if (n->pmr.dev) {
5919        if (host_memory_backend_is_mapped(n->pmr.dev)) {
5920            error_setg(errp, "can't use already busy memdev: %s",
5921                       object_get_canonical_path_component(OBJECT(n->pmr.dev)));
5922            return;
5923        }
5924
5925        if (!is_power_of_2(n->pmr.dev->size)) {
5926            error_setg(errp, "pmr backend size needs to be a power of 2");
5927            return;
5928        }
5929
5930        host_memory_backend_set_mapped(n->pmr.dev, true);
5931    }
5932
5933    if (n->params.zasl > n->params.mdts) {
5934        error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
5935                   "than or equal to mdts (Maximum Data Transfer Size)");
5936        return;
5937    }
5938
5939    if (!n->params.vsl) {
5940        error_setg(errp, "vsl must be non-zero");
5941        return;
5942    }
5943}
5944
5945static void nvme_init_state(NvmeCtrl *n)
5946{
5947    n->num_namespaces = NVME_MAX_NAMESPACES;
5948    /* add one to max_ioqpairs to account for the admin queue pair */
5949    n->reg_size = pow2ceil(sizeof(NvmeBar) +
5950                           2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
5951    n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
5952    n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
5953    n->temperature = NVME_TEMPERATURE;
5954    n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
5955    n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5956    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
5957}
5958
5959static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
5960{
5961    uint64_t cmb_size = n->params.cmb_size_mb * MiB;
5962
5963    n->cmb.buf = g_malloc0(cmb_size);
5964    memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
5965                          "nvme-cmb", cmb_size);
5966    pci_register_bar(pci_dev, NVME_CMB_BIR,
5967                     PCI_BASE_ADDRESS_SPACE_MEMORY |
5968                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
5969                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
5970
5971    NVME_CAP_SET_CMBS(n->bar.cap, 1);
5972
5973    if (n->params.legacy_cmb) {
5974        nvme_cmb_enable_regs(n);
5975        n->cmb.cmse = true;
5976    }
5977}
5978
5979static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
5980{
5981    NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1);
5982    NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1);
5983    NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
5984    /* Turn on PMRWBM bit 1 support: a PMRSTS read ensures prior writes are persistent */
5985    NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
5986    NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1);
5987
5988    pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
5989                     PCI_BASE_ADDRESS_SPACE_MEMORY |
5990                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
5991                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
5992
5993    memory_region_set_enabled(&n->pmr.dev->mr, false);
5994}
5995
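/*
 * BAR0 layout: the register file and doorbells (n->reg_size, aligned up to
 * 4 KiB) are followed by the MSI-X table and, after another 4 KiB alignment,
 * the MSI-X PBA. The final size is rounded up to a power of two, as required
 * for a PCI BAR.
 */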
5996static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
5997{
5998    uint8_t *pci_conf = pci_dev->config;
5999    uint64_t bar_size, msix_table_size, msix_pba_size;
6000    unsigned msix_table_offset, msix_pba_offset;
6001    int ret;
6002
6003    Error *err = NULL;
6004
6005    pci_conf[PCI_INTERRUPT_PIN] = 1;
6006    pci_config_set_prog_interface(pci_conf, 0x2);
6007
6008    if (n->params.use_intel_id) {
6009        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6010        pci_config_set_device_id(pci_conf, 0x5845);
6011    } else {
6012        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6013        pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6014    }
6015
6016    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6017    pcie_endpoint_cap_init(pci_dev, 0x80);
6018
6019    bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6020    msix_table_offset = bar_size;
6021    msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6022
6023    bar_size += msix_table_size;
6024    bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6025    msix_pba_offset = bar_size;
6026    msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6027
6028    bar_size += msix_pba_size;
6029    bar_size = pow2ceil(bar_size);
6030
6031    memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6032    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6033                          n->reg_size);
6034    memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6035
6036    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6037                     PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6038    ret = msix_init(pci_dev, n->params.msix_qsize,
6039                    &n->bar0, 0, msix_table_offset,
6040                    &n->bar0, 0, msix_pba_offset, 0, &err);
6041    if (ret < 0) {
6042        if (ret == -ENOTSUP) {
6043            warn_report_err(err);
6044        } else {
6045            error_propagate(errp, err);
6046            return ret;
6047        }
6048    }
6049
6050    if (n->params.cmb_size_mb) {
6051        nvme_init_cmb(n, pci_dev);
6052    }
6053
6054    if (n->pmr.dev) {
6055        nvme_init_pmr(n, pci_dev);
6056    }
6057
6058    return 0;
6059}
6060
6061static void nvme_init_subnqn(NvmeCtrl *n)
6062{
6063    NvmeSubsystem *subsys = n->subsys;
6064    NvmeIdCtrl *id = &n->id_ctrl;
6065
6066    if (!subsys) {
6067        snprintf((char *)id->subnqn, sizeof(id->subnqn),
6068                 "nqn.2019-08.org.qemu:%s", n->params.serial);
6069    } else {
6070        pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char *)subsys->subnqn);
6071    }
6072}
6073
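/*
 * Populate the Identify Controller data structure and the read-only parts of
 * the register file. Note that CAP.MQES is a 0's based value, so the 0x7ff
 * programmed below advertises a maximum I/O queue size of 2048 entries.
 */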
6074static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6075{
6076    NvmeIdCtrl *id = &n->id_ctrl;
6077    uint8_t *pci_conf = pci_dev->config;
6078
6079    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6080    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6081    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6082    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6083    strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6084
6085    id->cntlid = cpu_to_le16(n->cntlid);
6086
6087    id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6088
6089    id->rab = 6;
6090
6091    if (n->params.use_intel_id) {
6092        id->ieee[0] = 0xb3;
6093        id->ieee[1] = 0x02;
6094        id->ieee[2] = 0x00;
6095    } else {
6096        id->ieee[0] = 0x00;
6097        id->ieee[1] = 0x54;
6098        id->ieee[2] = 0x52;
6099    }
6100
6101    id->mdts = n->params.mdts;
6102    id->ver = cpu_to_le32(NVME_SPEC_VER);
6103    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6104    id->cntrltype = 0x1;
6105
6106    /*
6107     * Because the controller always completes the Abort command immediately,
6108     * there can never be more than one concurrently executing Abort command,
6109     * so this value is never used for anything. Note that there can easily be
6110     * many Abort commands in the queues, but they are not considered
6111     * "executing" until processed by nvme_abort.
6112     *
6113     * The specification recommends a value of 3 for Abort Command Limit (four
6114     * concurrently outstanding Abort commands), so let's use that, though it is
6115     * inconsequential.
6116     */
6117    id->acl = 3;
6118    id->aerl = n->params.aerl;
6119    id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6120    id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6121
6122    /* recommended default value (~70 C) */
6123    id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6124    id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6125
6126    id->sqes = (0x6 << 4) | 0x6;
6127    id->cqes = (0x4 << 4) | 0x4;
6128    id->nn = cpu_to_le32(n->num_namespaces);
6129    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6130                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6131                           NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6132
6133    /*
6134     * NOTE: If this device ever supports a command set that does NOT use 0x0
6135     * as a Flush-equivalent operation, support for the broadcast NSID in Flush
6136     * should probably be removed.
6137     *
6138     * See comment in nvme_io_cmd.
6139     */
6140    id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6141
6142    id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6143    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6144                           NVME_CTRL_SGLS_BITBUCKET);
6145
6146    nvme_init_subnqn(n);
6147
6148    id->psd[0].mp = cpu_to_le16(0x9c4);
6149    id->psd[0].enlat = cpu_to_le32(0x10);
6150    id->psd[0].exlat = cpu_to_le32(0x4);
6151
6152    if (n->subsys) {
6153        id->cmic |= NVME_CMIC_MULTI_CTRL;
6154    }
6155
6156    NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
6157    NVME_CAP_SET_CQR(n->bar.cap, 1);
6158    NVME_CAP_SET_TO(n->bar.cap, 0xf);
6159    NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM);
6160    NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
6161    NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
6162    NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
6163    NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
6164    NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0);
6165
6166    n->bar.vs = NVME_SPEC_VER;
6167    n->bar.intmc = n->bar.intms = 0;
6168}
6169
6170static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6171{
6172    int cntlid;
6173
6174    if (!n->subsys) {
6175        return 0;
6176    }
6177
6178    cntlid = nvme_subsys_register_ctrl(n, errp);
6179    if (cntlid < 0) {
6180        return -1;
6181    }
6182
6183    n->cntlid = cntlid;
6184
6185    return 0;
6186}
6187
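/*
 * Attach a namespace to the controller and update DMRSL (Dataset Management
 * Range Size Limit) to the smallest per-namespace value, i.e. the block
 * layer's maximum request size expressed in logical blocks of that
 * namespace.
 */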
6188void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6189{
6190    uint32_t nsid = ns->params.nsid;
6191    assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6192
6193    n->namespaces[nsid - 1] = ns;
6194    ns->attached++;
6195
6196    n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6197                            BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6198}
6199
6200static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6201{
6202    NvmeCtrl *n = NVME(pci_dev);
6203    NvmeNamespace *ns;
6204    Error *local_err = NULL;
6205
6206    nvme_check_constraints(n, &local_err);
6207    if (local_err) {
6208        error_propagate(errp, local_err);
6209        return;
6210    }
6211
6212    qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6213                        &pci_dev->qdev, n->parent_obj.qdev.id);
6214
6215    nvme_init_state(n);
6216    if (nvme_init_pci(n, pci_dev, errp)) {
6217        return;
6218    }
6219
6220    if (nvme_init_subsys(n, errp)) {
6222        return;
6223    }
6224    nvme_init_ctrl(n, pci_dev);
6225
6226    /* setup a namespace if the controller drive property was given */
6227    if (n->namespace.blkconf.blk) {
6228        ns = &n->namespace;
6229        ns->params.nsid = 1;
6230
6231        if (nvme_ns_setup(n, ns, errp)) {
6232            return;
6233        }
6234
6235        nvme_attach_ns(n, ns);
6236    }
6237}
6238
6239static void nvme_exit(PCIDevice *pci_dev)
6240{
6241    NvmeCtrl *n = NVME(pci_dev);
6242    NvmeNamespace *ns;
6243    int i;
6244
6245    nvme_ctrl_reset(n);
6246
6247    for (i = 1; i <= n->num_namespaces; i++) {
6248        ns = nvme_ns(n, i);
6249        if (!ns) {
6250            continue;
6251        }
6252
6253        nvme_ns_cleanup(ns);
6254    }
6255
6256    g_free(n->cq);
6257    g_free(n->sq);
6258    g_free(n->aer_reqs);
6259
6260    if (n->params.cmb_size_mb) {
6261        g_free(n->cmb.buf);
6262    }
6263
6264    if (n->pmr.dev) {
6265        host_memory_backend_set_mapped(n->pmr.dev, false);
6266    }
6267    msix_uninit(pci_dev, &n->bar0, &n->bar0);
6268    memory_region_del_subregion(&n->bar0, &n->iomem);
6269}
6270
6271static Property nvme_props[] = {
6272    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6273    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6274                     HostMemoryBackend *),
6275    DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6276                     NvmeSubsystem *),
6277    DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6278    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6279    DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6280    DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6281    DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6282    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6283    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6284    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6285    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6286    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6287    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6288    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6289    DEFINE_PROP_END_OF_LIST(),
6290};
6291
6292static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6293                                   void *opaque, Error **errp)
6294{
6295    NvmeCtrl *n = NVME(obj);
6296    uint8_t value = n->smart_critical_warning;
6297
6298    visit_type_uint8(v, name, &value, errp);
6299}
6300
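/*
 * QOM property setter for "smart_critical_warning". Only bits the device can
 * report may be set, and newly raised bits trigger a SMART asynchronous
 * event (see nvme_smart_event()). As an illustration (assuming a device
 * created with id=nvme0), the temperature warning bit (value 2) could be
 * injected from the monitor with something like:
 *
 *     (qemu) qom-set /machine/peripheral/nvme0 smart_critical_warning 2
 */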
6301static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6302                                   void *opaque, Error **errp)
6303{
6304    NvmeCtrl *n = NVME(obj);
6305    uint8_t value, old_value, cap = 0, index, event;
6306
6307    if (!visit_type_uint8(v, name, &value, errp)) {
6308        return;
6309    }
6310
6311    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6312          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6313    if (NVME_CAP_PMRS(n->bar.cap)) {
6314        cap |= NVME_SMART_PMR_UNRELIABLE;
6315    }
6316
6317    if ((value & cap) != value) {
6318        error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6319                   value & ~cap);
6320        return;
6321    }
6322
6323    old_value = n->smart_critical_warning;
6324    n->smart_critical_warning = value;
6325
6326    /* only inject new bits of smart critical warning */
6327    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6328        event = 1 << index;
6329        if (value & ~old_value & event) {
6330            nvme_smart_event(n, event);
        }
6331    }
6332}
6333
6334static const VMStateDescription nvme_vmstate = {
6335    .name = "nvme",
6336    .unmigratable = 1,
6337};
6338
6339static void nvme_class_init(ObjectClass *oc, void *data)
6340{
6341    DeviceClass *dc = DEVICE_CLASS(oc);
6342    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6343
6344    pc->realize = nvme_realize;
6345    pc->exit = nvme_exit;
6346    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6347    pc->revision = 2;
6348
6349    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6350    dc->desc = "Non-Volatile Memory Express";
6351    device_class_set_props(dc, nvme_props);
6352    dc->vmsd = &nvme_vmstate;
6353}
6354
6355static void nvme_instance_init(Object *obj)
6356{
6357    NvmeCtrl *n = NVME(obj);
6358
6359    device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6360                                  "bootindex", "/namespace@1,0",
6361                                  DEVICE(obj));
6362
6363    object_property_add(obj, "smart_critical_warning", "uint8",
6364                        nvme_get_smart_warning,
6365                        nvme_set_smart_warning, NULL, NULL);
6366}
6367
6368static const TypeInfo nvme_info = {
6369    .name          = TYPE_NVME,
6370    .parent        = TYPE_PCI_DEVICE,
6371    .instance_size = sizeof(NvmeCtrl),
6372    .instance_init = nvme_instance_init,
6373    .class_init    = nvme_class_init,
6374    .interfaces = (InterfaceInfo[]) {
6375        { INTERFACE_PCIE_DEVICE },
6376        { }
6377    },
6378};
6379
6380static const TypeInfo nvme_bus_info = {
6381    .name = TYPE_NVME_BUS,
6382    .parent = TYPE_BUS,
6383    .instance_size = sizeof(NvmeBus),
6384};
6385
6386static void nvme_register_types(void)
6387{
6388    type_register_static(&nvme_info);
6389    type_register_static(&nvme_bus_info);
6390}
6391
6392type_init(nvme_register_types)
6393