1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18#ifndef HW_NVME_NVME_H
19#define HW_NVME_NVME_H
20
21#include "qemu/uuid.h"
22#include "hw/pci/pci_device.h"
23#include "hw/block/block.h"
24
25#include "block/nvme.h"
26
27#define NVME_MAX_CONTROLLERS 256
28#define NVME_MAX_NAMESPACES 256
29#define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
30#define NVME_FDP_MAX_EVENTS 63
31#define NVME_FDP_MAXPIDS 128
32
33
34
35
36
37#define NVME_SQES 6
38#define NVME_CQES 4
39
40QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);
41
42typedef struct NvmeCtrl NvmeCtrl;
43typedef struct NvmeNamespace NvmeNamespace;
44
45#define TYPE_NVME_BUS "nvme-bus"
46OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)
47
48typedef struct NvmeBus {
49 BusState parent_bus;
50} NvmeBus;
51
52#define TYPE_NVME_SUBSYS "nvme-subsys"
53#define NVME_SUBSYS(obj) \
54 OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
55#define SUBSYS_SLOT_RSVD (void *)0xFFFF
56
/* Per-reclaim-unit state tracked for Flexible Data Placement (FDP). */
typedef struct NvmeReclaimUnit {
    uint64_t ruamw; /* Reclaim Unit Available Media Writes (FDP "RUAMW") */
} NvmeReclaimUnit;
60
/* A Reclaim Unit Handle (RUH); part of the endurance group FDP state. */
typedef struct NvmeRuHandle {
    uint8_t ruht;          /* reclaim unit handle type */
    uint8_t ruha;          /* reclaim unit handle attributes */
    uint64_t event_filter; /* bitmask of FDP events recorded for this RUH */
    uint8_t lbafi;         /* LBA format index associated with this RUH */
    uint64_t ruamw;        /* initial RUAMW value for newly referenced RUs */

    /* backing reclaim units; NOTE(review): presumably one per reclaim
     * group -- confirm against the allocation site */
    NvmeReclaimUnit *rus;
} NvmeRuHandle;
71
/*
 * Fixed-capacity buffer of FDP events.  The start/next/nelems triple
 * suggests ring-buffer semantics; the managing code defines the exact
 * insertion/eviction policy.
 */
typedef struct NvmeFdpEventBuffer {
    NvmeFdpEvent events[NVME_FDP_MAX_EVENTS];
    unsigned int nelems; /* number of valid entries */
    unsigned int start;  /* index of the oldest entry */
    unsigned int next;   /* index for the next insertion */
} NvmeFdpEventBuffer;
78
/* An NVM endurance group; only the FDP substate is modelled here. */
typedef struct NvmeEnduranceGroup {
    uint8_t event_conf; /* FDP event notification configuration */

    struct {
        NvmeFdpEventBuffer host_events, ctrl_events; /* host/controller FDP event logs */

        uint16_t nruh; /* number of reclaim unit handles */
        uint16_t nrg;  /* number of reclaim groups */
        uint8_t rgif;  /* reclaim group identifier format (PID bits used for RG) */
        uint64_t runs; /* reclaim unit nominal size, in bytes */

        uint64_t hbmw; /* host bytes with metadata written (FDP statistic) */
        uint64_t mbmw; /* media bytes with metadata written (FDP statistic) */
        uint64_t mbe;  /* media bytes erased (FDP statistic) */

        bool enabled;  /* FDP is configured for this endurance group */

        NvmeRuHandle *ruhs; /* array of nruh reclaim unit handles */
    } fdp;
} NvmeEnduranceGroup;
99
/* An NVM subsystem: ties together controllers and (shared) namespaces. */
typedef struct NvmeSubsystem {
    DeviceState parent_obj;
    NvmeBus bus;
    uint8_t subnqn[256]; /* subsystem NVMe Qualified Name */
    char *serial;

    /*
     * Controller slots, indexed by cntlid.  A slot may hold
     * SUBSYS_SLOT_RSVD to reserve a controller id without a live
     * controller (see nvme_subsys_ctrl()).
     */
    NvmeCtrl *ctrls[NVME_MAX_CONTROLLERS];
    /* indexed by NSID (1-based); slot 0 is never used */
    NvmeNamespace *namespaces[NVME_MAX_NAMESPACES + 1];
    NvmeEnduranceGroup endgrp;

    struct {
        char *nqn; /* user-supplied NQN override, if any */

        struct { /* FDP configuration parameters */
            bool enabled;
            uint64_t runs;
            uint16_t nruh;
            uint32_t nrg;
        } fdp;
    } params;
} NvmeSubsystem;
121
122int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
123void nvme_subsys_unregister_ctrl(NvmeSubsystem *subsys, NvmeCtrl *n);
124
125static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys,
126 uint32_t cntlid)
127{
128 if (!subsys || cntlid >= NVME_MAX_CONTROLLERS) {
129 return NULL;
130 }
131
132 if (subsys->ctrls[cntlid] == SUBSYS_SLOT_RSVD) {
133 return NULL;
134 }
135
136 return subsys->ctrls[cntlid];
137}
138
139static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
140 uint32_t nsid)
141{
142 if (!subsys || !nsid || nsid > NVME_MAX_NAMESPACES) {
143 return NULL;
144 }
145
146 return subsys->namespaces[nsid];
147}
148
149#define TYPE_NVME_NS "nvme-ns"
150#define NVME_NS(obj) \
151 OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
152
/* Runtime state of a single zone (Zoned Namespace command set). */
typedef struct NvmeZone {
    NvmeZoneDescr d;             /* descriptor as reported to the host */
    uint64_t w_ptr;              /* working write pointer */
    QTAILQ_ENTRY(NvmeZone) entry; /* linkage on a per-state zone list */
} NvmeZone;
158
159#define FDP_EVT_MAX 0xff
160#define NVME_FDP_MAX_NS_RUHS 32u
161#define FDPVSS 0
162
/*
 * Bit position of each supported FDP event type within an RUH event
 * filter bitmask.  Unlisted event types default to 0; note that 0 is
 * also the legitimate shift for FDP_EVT_RU_NOT_FULLY_WRITTEN, so
 * supported-ness must be checked separately.
 */
static const uint8_t nvme_fdp_evf_shifts[FDP_EVT_MAX] = {
    /* host events */
    [FDP_EVT_RU_NOT_FULLY_WRITTEN] = 0,
    [FDP_EVT_RU_ATL_EXCEEDED] = 1,
    [FDP_EVT_CTRL_RESET_RUH] = 2,
    [FDP_EVT_INVALID_PID] = 3,
    /* controller events */
    [FDP_EVT_MEDIA_REALLOC] = 32,
    [FDP_EVT_RUH_IMPLICIT_RU_CHANGE] = 33,
};
173
/* User-configurable parameters of an "nvme-ns" device. */
typedef struct NvmeNamespaceParams {
    bool detached; /* start without being attached to any controller */
    bool shared;   /* namespace may be attached to multiple controllers */
    uint32_t nsid;
    QemuUUID uuid;
    uint64_t eui64;
    bool eui64_default; /* eui64 was defaulted rather than user supplied */

    /* metadata and end-to-end protection settings */
    uint16_t ms;  /* metadata bytes per logical block */
    uint8_t mset; /* metadata transfer mode (NVMe MSET) */
    uint8_t pi;   /* protection information type */
    uint8_t pil;  /* protection information location (NVMe PIL) */
    uint8_t pif;  /* protection information format */

    /* Copy command limits */
    uint16_t mssrl; /* max single source range length */
    uint32_t mcl;   /* max copy length */
    uint8_t msrc;   /* max source range count */

    /* zoned namespace configuration */
    bool zoned;
    bool cross_zone_read;
    uint64_t zone_size_bs; /* zone size in bytes */
    uint64_t zone_cap_bs;  /* zone capacity in bytes */
    uint32_t max_active_zones;
    uint32_t max_open_zones;
    uint32_t zd_extension_size; /* per-zone descriptor extension bytes */

    /* zone random write area configuration */
    uint32_t numzrwa;
    uint64_t zrwas;  /* ZRWA size */
    uint64_t zrwafg; /* ZRWA flush granularity */

    struct {
        char *ruhs; /* textual list of reclaim unit handles to use */
    } fdp;
} NvmeNamespaceParams;
208
/* Runtime state of an "nvme-ns" device (a single namespace). */
typedef struct NvmeNamespace {
    DeviceState parent_obj;
    BlockConf blkconf;
    int32_t bootindex;
    int64_t size;       /* data size in bytes */
    int64_t moff;       /* byte offset of the metadata area (see nvme_moff) */
    NvmeIdNs id_ns;     /* Identify Namespace data structure */
    NvmeIdNsNvm id_ns_nvm;
    NvmeLBAF lbaf;      /* currently selected LBA format */
    unsigned int nlbaf; /* number of supported LBA formats */
    size_t lbasz;       /* logical block size in bytes */
    const uint32_t *iocs; /* I/O command set support table */
    uint8_t csi;          /* command set identifier */
    uint16_t status;
    int attached;         /* number of controllers this namespace is attached to */
    uint8_t pif;          /* protection information format */

    /* zone random write area state */
    struct {
        uint16_t zrwas;
        uint16_t zrwafg;
        uint32_t numzrwa;
    } zns;

    QTAILQ_ENTRY(NvmeNamespace) entry;

    /* zoned command set state */
    NvmeIdNsZoned *id_ns_zoned;
    NvmeZone *zone_array; /* all zones, indexed by zone number */
    QTAILQ_HEAD(, NvmeZone) exp_open_zones;
    QTAILQ_HEAD(, NvmeZone) imp_open_zones;
    QTAILQ_HEAD(, NvmeZone) closed_zones;
    QTAILQ_HEAD(, NvmeZone) full_zones;
    uint32_t num_zones;
    uint64_t zone_size;     /* zone size in logical blocks */
    uint64_t zone_capacity; /* zone capacity in logical blocks */
    uint32_t zone_size_log2; /* log2(zone_size) when it is a power of two */
    uint8_t *zd_extensions;  /* flat array of zone descriptor extensions */
    int32_t nr_open_zones;   /* active-open-resource accounting, see nvme_aor_* */
    int32_t nr_active_zones;

    NvmeNamespaceParams params;
    NvmeSubsystem *subsys;
    NvmeEnduranceGroup *endgrp;

    struct {
        uint32_t err_rec; /* Error Recovery feature value */
    } features;

    /* FDP state: placement handles configured for this namespace */
    struct {
        uint16_t nphs; /* number of placement handles */

        /* reclaim unit handle identifiers indexed by placement handle */
        uint16_t *phs;
    } fdp;
} NvmeNamespace;
262
263static inline uint32_t nvme_nsid(NvmeNamespace *ns)
264{
265 if (ns) {
266 return ns->params.nsid;
267 }
268
269 return 0;
270}
271
272static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
273{
274 return lba << ns->lbaf.ds;
275}
276
277static inline size_t nvme_m2b(NvmeNamespace *ns, uint64_t lba)
278{
279 return ns->lbaf.ms * lba;
280}
281
282static inline int64_t nvme_moff(NvmeNamespace *ns, uint64_t lba)
283{
284 return ns->moff + nvme_m2b(ns, lba);
285}
286
287static inline bool nvme_ns_ext(NvmeNamespace *ns)
288{
289 return !!NVME_ID_NS_FLBAS_EXTENDED(ns->id_ns.flbas);
290}
291
292static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
293{
294 return zone->d.zs >> 4;
295}
296
297static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state)
298{
299 zone->d.zs = state << 4;
300}
301
302static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
303{
304 return zone->d.zslba + ns->zone_size;
305}
306
307static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
308{
309 return zone->d.zslba + zone->d.zcap;
310}
311
312static inline bool nvme_wp_is_valid(NvmeZone *zone)
313{
314 uint8_t st = nvme_get_zone_state(zone);
315
316 return st != NVME_ZONE_STATE_FULL &&
317 st != NVME_ZONE_STATE_READ_ONLY &&
318 st != NVME_ZONE_STATE_OFFLINE;
319}
320
321static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
322 uint32_t zone_idx)
323{
324 return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
325}
326
327static inline void nvme_aor_inc_open(NvmeNamespace *ns)
328{
329 assert(ns->nr_open_zones >= 0);
330 if (ns->params.max_open_zones) {
331 ns->nr_open_zones++;
332 assert(ns->nr_open_zones <= ns->params.max_open_zones);
333 }
334}
335
336static inline void nvme_aor_dec_open(NvmeNamespace *ns)
337{
338 if (ns->params.max_open_zones) {
339 assert(ns->nr_open_zones > 0);
340 ns->nr_open_zones--;
341 }
342 assert(ns->nr_open_zones >= 0);
343}
344
345static inline void nvme_aor_inc_active(NvmeNamespace *ns)
346{
347 assert(ns->nr_active_zones >= 0);
348 if (ns->params.max_active_zones) {
349 ns->nr_active_zones++;
350 assert(ns->nr_active_zones <= ns->params.max_active_zones);
351 }
352}
353
354static inline void nvme_aor_dec_active(NvmeNamespace *ns)
355{
356 if (ns->params.max_active_zones) {
357 assert(ns->nr_active_zones > 0);
358 ns->nr_active_zones--;
359 assert(ns->nr_active_zones >= ns->nr_open_zones);
360 }
361 assert(ns->nr_active_zones >= 0);
362}
363
/* Saturating 64-bit add: *a += b, clamped at UINT64_MAX instead of wrapping. */
static inline void nvme_fdp_stat_inc(uint64_t *a, uint64_t b)
{
    uint64_t sum = *a + b;

    if (sum < *a) {
        /* unsigned wraparound detected -- saturate */
        sum = UINT64_MAX;
    }

    *a = sum;
}
369
370void nvme_ns_init_format(NvmeNamespace *ns);
371int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
372void nvme_ns_drain(NvmeNamespace *ns);
373void nvme_ns_shutdown(NvmeNamespace *ns);
374void nvme_ns_cleanup(NvmeNamespace *ns);
375
/* A queued Asynchronous Event, waiting for an outstanding AER command. */
typedef struct NvmeAsyncEvent {
    QTAILQ_ENTRY(NvmeAsyncEvent) entry; /* linkage on the controller aer_queue */
    NvmeAerResult result;               /* completion dword reported to the host */
} NvmeAsyncEvent;
380
/* Flag bits for NvmeSg.flags. */
enum {
    NVME_SG_ALLOC = 1 << 0, /* the scatter-gather state has been initialized */
    NVME_SG_DMA = 1 << 1,   /* presumably: qsg (DMA) in use rather than iov -- verify in mapping code */
};
385
/*
 * Scatter-gather state for a transfer.  The NVME_SG_* flags describe
 * which union member is active.
 */
typedef struct NvmeSg {
    int flags;

    union {
        QEMUSGList qsg;   /* list of guest-physical DMA addresses */
        QEMUIOVector iov; /* list of host virtual addresses */
    };
} NvmeSg;
394
/* Direction of a data transfer relative to the (emulated) device. */
typedef enum NvmeTxDirection {
    NVME_TX_DIRECTION_TO_DEVICE = 0,   /* host -> device (write-like) */
    NVME_TX_DIRECTION_FROM_DEVICE = 1, /* device -> host (read-like) */
} NvmeTxDirection;
399
/* Per-command state for a controller command. */
typedef struct NvmeRequest {
    struct NvmeSQueue *sq;      /* submission queue the command arrived on */
    struct NvmeNamespace *ns;   /* namespace targeted by the command, if any */
    BlockAIOCB *aiocb;          /* in-flight block-layer request, if any */
    uint16_t status;            /* NVMe status code to complete with */
    void *opaque;               /* per-command private data */
    NvmeCqe cqe;                /* completion queue entry being built */
    NvmeCmd cmd;                /* copy of the submitted command */
    BlockAcctCookie acct;       /* block-layer accounting cookie */
    NvmeSg sg;                  /* mapped data buffers */
    QTAILQ_ENTRY(NvmeRequest)entry; /* linkage on the SQ req/out_req lists */
} NvmeRequest;
412
/*
 * Bounce buffers for commands that must inspect data and metadata in
 * host memory (e.g. compare/verify-style flows -- see users).
 */
typedef struct NvmeBounceContext {
    NvmeRequest *req; /* owning request */

    struct {
        QEMUIOVector iov;
        uint8_t *bounce; /* heap buffer backing the iovec */
    } data, mdata;       /* separate buffers for data and metadata */
} NvmeBounceContext;
421
422static inline const char *nvme_adm_opc_str(uint8_t opc)
423{
424 switch (opc) {
425 case NVME_ADM_CMD_DELETE_SQ: return "NVME_ADM_CMD_DELETE_SQ";
426 case NVME_ADM_CMD_CREATE_SQ: return "NVME_ADM_CMD_CREATE_SQ";
427 case NVME_ADM_CMD_GET_LOG_PAGE: return "NVME_ADM_CMD_GET_LOG_PAGE";
428 case NVME_ADM_CMD_DELETE_CQ: return "NVME_ADM_CMD_DELETE_CQ";
429 case NVME_ADM_CMD_CREATE_CQ: return "NVME_ADM_CMD_CREATE_CQ";
430 case NVME_ADM_CMD_IDENTIFY: return "NVME_ADM_CMD_IDENTIFY";
431 case NVME_ADM_CMD_ABORT: return "NVME_ADM_CMD_ABORT";
432 case NVME_ADM_CMD_SET_FEATURES: return "NVME_ADM_CMD_SET_FEATURES";
433 case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES";
434 case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
435 case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT";
436 case NVME_ADM_CMD_DIRECTIVE_SEND: return "NVME_ADM_CMD_DIRECTIVE_SEND";
437 case NVME_ADM_CMD_VIRT_MNGMT: return "NVME_ADM_CMD_VIRT_MNGMT";
438 case NVME_ADM_CMD_DIRECTIVE_RECV: return "NVME_ADM_CMD_DIRECTIVE_RECV";
439 case NVME_ADM_CMD_DBBUF_CONFIG: return "NVME_ADM_CMD_DBBUF_CONFIG";
440 case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM";
441 default: return "NVME_ADM_CMD_UNKNOWN";
442 }
443}
444
445static inline const char *nvme_io_opc_str(uint8_t opc)
446{
447 switch (opc) {
448 case NVME_CMD_FLUSH: return "NVME_NVM_CMD_FLUSH";
449 case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE";
450 case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
451 case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE";
452 case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
453 case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM";
454 case NVME_CMD_VERIFY: return "NVME_NVM_CMD_VERIFY";
455 case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY";
456 case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND";
457 case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV";
458 case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND";
459 default: return "NVME_NVM_CMD_UNKNOWN";
460 }
461}
462
/* A submission queue. */
typedef struct NvmeSQueue {
    struct NvmeCtrl *ctrl;
    uint16_t sqid;
    uint16_t cqid;     /* completion queue this SQ posts completions to */
    uint32_t head;
    uint32_t tail;
    uint32_t size;     /* queue depth in entries */
    uint64_t dma_addr; /* guest-physical base address of the queue */
    uint64_t db_addr;  /* shadow doorbell address (dbbuf); 0 when unused */
    uint64_t ei_addr;  /* eventidx address (dbbuf); 0 when unused */
    QEMUBH *bh;        /* bottom half that drives command processing */
    EventNotifier notifier;
    bool ioeventfd_enabled;
    NvmeRequest *io_req; /* backing storage for the request pool */
    QTAILQ_HEAD(, NvmeRequest) req_list;     /* available requests */
    QTAILQ_HEAD(, NvmeRequest) out_req_list; /* outstanding requests */
    QTAILQ_ENTRY(NvmeSQueue) entry;          /* linkage on the CQ's sq_list */
} NvmeSQueue;
481
/* A completion queue. */
typedef struct NvmeCQueue {
    struct NvmeCtrl *ctrl;
    uint8_t phase;        /* current phase tag bit */
    uint16_t cqid;
    uint16_t irq_enabled;
    uint32_t head;
    uint32_t tail;
    uint32_t vector;      /* interrupt vector for this CQ */
    uint32_t size;        /* queue depth in entries */
    uint64_t dma_addr;    /* guest-physical base address of the queue */
    uint64_t db_addr;     /* shadow doorbell address (dbbuf); 0 when unused */
    uint64_t ei_addr;     /* eventidx address (dbbuf); 0 when unused */
    QEMUBH *bh;           /* bottom half that posts completions */
    EventNotifier notifier;
    bool ioeventfd_enabled;
    QTAILQ_HEAD(, NvmeSQueue) sq_list;  /* submission queues mapped to this CQ */
    QTAILQ_HEAD(, NvmeRequest) req_list; /* requests awaiting completion posting */
} NvmeCQueue;
500
501#define TYPE_NVME "nvme"
502#define NVME(obj) \
503 OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)
504
/* User-configurable parameters of the "nvme" controller device. */
typedef struct NvmeParams {
    char *serial;
    uint32_t num_queues;     /* NOTE(review): overlaps max_ioqpairs -- likely a legacy alias, confirm */
    uint32_t max_ioqpairs;   /* maximum number of I/O queue pairs */
    uint16_t msix_qsize;     /* MSI-X vector table size */
    uint32_t cmb_size_mb;    /* Controller Memory Buffer size in MiB (0 = none) */
    uint8_t aerl;            /* Asynchronous Event Request Limit (0's based) */
    uint32_t aer_max_queued; /* max events buffered while no AER is outstanding */
    uint8_t mdts;            /* Maximum Data Transfer Size (log2 units) */
    uint8_t vsl;             /* Verify Size Limit */
    bool use_intel_id;       /* advertise the legacy Intel PCI vendor/device id */
    uint8_t zasl;            /* Zone Append Size Limit */
    bool auto_transition_zones;
    bool legacy_cmb;         /* CMB layout compatible with NVMe 1.3 behavior */
    bool ioeventfd;          /* use ioeventfd-based doorbell handling */
    /* SR-IOV virtualization parameters */
    uint8_t sriov_max_vfs;
    uint16_t sriov_vq_flexible;
    uint16_t sriov_vi_flexible;
    uint8_t sriov_max_vq_per_vf;
    uint8_t sriov_max_vi_per_vf;
} NvmeParams;
526
/* State of one emulated NVMe controller (an "nvme" PCI function). */
typedef struct NvmeCtrl {
    PCIDevice parent_obj;
    MemoryRegion bar0;
    MemoryRegion iomem;
    NvmeBar bar;       /* controller register file backing BAR0 */
    NvmeParams params;
    NvmeBus bus;       /* bus that nvme-ns devices attach to */

    uint16_t cntlid;   /* controller id within the subsystem */
    bool qs_created;   /* I/O queue arrays have been allocated */
    uint32_t page_size;
    uint16_t page_bits;
    uint16_t max_prp_ents;
    uint32_t max_q_ents;
    uint8_t outstanding_aers; /* number of AER commands currently held */
    uint32_t irq_status;
    int cq_pending;           /* CQs with completions not yet consumed */
    uint64_t host_timestamp;  /* timestamp last set via Set Features */
    uint64_t timestamp_set_qemu_clock_ms; /* QEMU clock when it was set */
    uint64_t starttime_ms;
    uint16_t temperature;
    uint8_t smart_critical_warning;
    uint32_t conf_msix_qsize; /* effective (possibly reduced) MSI-X size */
    uint32_t conf_ioqpairs;   /* effective number of I/O queue pairs */
    uint64_t dbbuf_dbs;       /* guest address of the doorbell buffer */
    uint64_t dbbuf_eis;       /* guest address of the eventidx buffer */
    bool dbbuf_enabled;       /* a DBBUF_CONFIG command has taken effect */

    /* Controller Memory Buffer */
    struct {
        MemoryRegion mem;
        uint8_t *buf;  /* host backing storage for the CMB */
        bool cmse;     /* memory space enabled (CMBMSC.CMSE) */
        hwaddr cba;    /* controller base address (CMBMSC.CBA) */
    } cmb;

    /* Persistent Memory Region */
    struct {
        HostMemoryBackend *dev;
        bool cmse;     /* memory space enabled */
        hwaddr cba;    /* controller base address */
    } pmr;

    uint8_t aer_mask;        /* event types currently masked from reporting */
    NvmeRequest **aer_reqs;  /* outstanding AER commands awaiting an event */
    QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue; /* events waiting for an AER */
    int aer_queued;          /* length of aer_queue */

    uint32_t dmrsl; /* DSM Range Size Limit (Identify DMRSL) */

    /* Changed Namespace List log state; bit set per changed NSID */
#define NVME_CHANGED_NSID_SIZE (NVME_MAX_NAMESPACES + 1)
    DECLARE_BITMAP(changed_nsids, NVME_CHANGED_NSID_SIZE);

    NvmeSubsystem *subsys;

    /* NOTE(review): single built-in namespace -- presumably used when the
     * controller is configured with a drive directly; confirm at realize */
    NvmeNamespace namespace;
    NvmeNamespace *namespaces[NVME_MAX_NAMESPACES + 1]; /* indexed by NSID */
    NvmeSQueue **sq;  /* submission queues, indexed by sqid */
    NvmeCQueue **cq;  /* completion queues, indexed by cqid */
    NvmeSQueue admin_sq;
    NvmeCQueue admin_cq;
    NvmeIdCtrl id_ctrl; /* Identify Controller data structure */

    /* current values of settable Features */
    struct {
        struct {
            uint16_t temp_thresh_hi;  /* over-temperature threshold */
            uint16_t temp_thresh_low; /* under-temperature threshold */
        };

        uint32_t async_config; /* Asynchronous Event Configuration */
        NvmeHostBehaviorSupport hbs;
    } features;

    /* SR-IOV primary/secondary controller state */
    NvmePriCtrlCap pri_ctrl_cap;
    NvmeSecCtrlList sec_ctrl_list;
    struct {
        uint16_t vqrfap; /* VQ resources flexible assigned, pending reset */
        uint16_t virfap; /* VI resources flexible assigned, pending reset */
    } next_pri_ctrl_cap;
} NvmeCtrl;
606
/* Scope of a controller reset operation. */
typedef enum NvmeResetType {
    NVME_RESET_FUNCTION = 0,   /* reset of the whole PCI function */
    NVME_RESET_CONTROLLER = 1, /* controller-level reset (CC.EN transition) */
} NvmeResetType;
611
612static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
613{
614 if (!nsid || nsid > NVME_MAX_NAMESPACES) {
615 return NULL;
616 }
617
618 return n->namespaces[nsid];
619}
620
621static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
622{
623 NvmeSQueue *sq = req->sq;
624 NvmeCtrl *n = sq->ctrl;
625
626 return n->cq[sq->cqid];
627}
628
629static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
630{
631 NvmeSQueue *sq = req->sq;
632 return sq->ctrl;
633}
634
635static inline uint16_t nvme_cid(NvmeRequest *req)
636{
637 if (!req) {
638 return 0xffff;
639 }
640
641 return le16_to_cpu(req->cqe.cid);
642}
643
644static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
645{
646 PCIDevice *pci_dev = &n->parent_obj;
647 NvmeCtrl *pf = NVME(pcie_sriov_get_pf(pci_dev));
648
649 if (pci_is_vf(pci_dev)) {
650 return &pf->sec_ctrl_list.sec[pcie_sriov_vf_number(pci_dev)];
651 }
652
653 return NULL;
654}
655
656static inline NvmeSecCtrlEntry *nvme_sctrl_for_cntlid(NvmeCtrl *n,
657 uint16_t cntlid)
658{
659 NvmeSecCtrlList *list = &n->sec_ctrl_list;
660 uint8_t i;
661
662 for (i = 0; i < list->numcntl; i++) {
663 if (le16_to_cpu(list->sec[i].scid) == cntlid) {
664 return &list->sec[i];
665 }
666 }
667
668 return NULL;
669}
670
671void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns);
672uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
673 NvmeTxDirection dir, NvmeRequest *req);
674uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
675 NvmeTxDirection dir, NvmeRequest *req);
676void nvme_rw_complete_cb(void *opaque, int ret);
677uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
678 NvmeCmd *cmd);
679
680#endif
681