#ifndef HW_NVME_NVME_H
#define HW_NVME_NVME_H

#include "qemu/uuid.h"
#include "hw/pci/pci_device.h"
#include "hw/block/block.h"

#include "block/nvme.h"

#define NVME_MAX_CONTROLLERS 256
#define NVME_MAX_NAMESPACES  256
#define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
#define NVME_FDP_MAX_EVENTS 63
#define NVME_FDP_MAXPIDS 128

QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);

typedef struct NvmeCtrl NvmeCtrl;
typedef struct NvmeNamespace NvmeNamespace;

#define TYPE_NVME_BUS "nvme-bus"
OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)

typedef struct NvmeBus {
    BusState parent_bus;
} NvmeBus;

#define TYPE_NVME_SUBSYS "nvme-subsys"
#define NVME_SUBSYS(obj) \
    OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
/* non-NULL sentinel: the controller slot is reserved, neither free nor live */
#define SUBSYS_SLOT_RSVD (void *)0xFFFF

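/*
 * Flexible Data Placement (FDP) state. A reclaim unit handle (RUH)
 * references one reclaim unit per reclaim group; ruamw mirrors the NVMe
 * "Reclaim Unit Available Media Writes" field, i.e. how many logical
 * blocks may still be written to the referenced reclaim unit.
 */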
typedef struct NvmeReclaimUnit {
    uint64_t ruamw;
} NvmeReclaimUnit;

typedef struct NvmeRuHandle {
    uint8_t  ruht;
    uint8_t  ruha;
    uint64_t event_filter;
    uint8_t  lbafi;
    uint64_t ruamw;

    /* reclaim units indexed by reclaim group */
    NvmeReclaimUnit *rus;
} NvmeRuHandle;

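/*
 * Ring buffer of FDP events: start indexes the oldest entry, next the
 * slot for the upcoming one, and nelems counts the occupied slots (at
 * most NVME_FDP_MAX_EVENTS).
 */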
typedef struct NvmeFdpEventBuffer {
    NvmeFdpEvent events[NVME_FDP_MAX_EVENTS];
    unsigned int nelems;
    unsigned int start;
    unsigned int next;
} NvmeFdpEventBuffer;

typedef struct NvmeEnduranceGroup {
    uint8_t event_conf;

    struct {
        NvmeFdpEventBuffer host_events, ctrl_events;

        uint16_t nruh;
        uint16_t nrg;
        uint8_t  rgif;
        uint64_t runs;

        uint64_t hbmw;
        uint64_t mbmw;
        uint64_t mbe;

        bool enabled;

        NvmeRuHandle *ruhs;
    } fdp;
} NvmeEnduranceGroup;

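/*
 * An NVM subsystem ties controllers and namespaces together: namespaces
 * may be shared by all controllers registered with the subsystem, and the
 * endurance group holds the subsystem-wide FDP state.
 */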
typedef struct NvmeSubsystem {
    DeviceState parent_obj;
    NvmeBus     bus;
    uint8_t     subnqn[256];
    char        *serial;

    NvmeCtrl           *ctrls[NVME_MAX_CONTROLLERS];
    NvmeNamespace      *namespaces[NVME_MAX_NAMESPACES + 1];
    NvmeEnduranceGroup endgrp;

    struct {
        char *nqn;

        struct {
            bool     enabled;
            uint64_t runs;
            uint16_t nruh;
            uint32_t nrg;
        } fdp;
    } params;
} NvmeSubsystem;

int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
void nvme_subsys_unregister_ctrl(NvmeSubsystem *subsys, NvmeCtrl *n);

static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys,
                                         uint32_t cntlid)
{
    if (!subsys || cntlid >= NVME_MAX_CONTROLLERS) {
        return NULL;
    }

    if (subsys->ctrls[cntlid] == SUBSYS_SLOT_RSVD) {
        return NULL;
    }

    return subsys->ctrls[cntlid];
}

static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
                                            uint32_t nsid)
{
    if (!subsys || !nsid || nsid > NVME_MAX_NAMESPACES) {
        return NULL;
    }

    return subsys->namespaces[nsid];
}

#define TYPE_NVME_NS "nvme-ns"
#define NVME_NS(obj) \
    OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)

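/*
 * In-memory zone state. d is the zone descriptor as reported to the host;
 * w_ptr tracks writes still in flight and may therefore run ahead of the
 * write pointer in d.wp.
 */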
typedef struct NvmeZone {
    NvmeZoneDescr d;
    uint64_t      w_ptr;
    QTAILQ_ENTRY(NvmeZone) entry;
} NvmeZone;

#define FDP_EVT_MAX 0xff
#define NVME_FDP_MAX_NS_RUHS 32u
#define FDPVSS 0

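/*
 * Bit position of each FDP event type within a reclaim unit handle's
 * event filter.
 */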
static const uint8_t nvme_fdp_evf_shifts[FDP_EVT_MAX] = {
    /* Host events */
    [FDP_EVT_RU_NOT_FULLY_WRITTEN]      = 0,
    [FDP_EVT_RU_ATL_EXCEEDED]           = 1,
    [FDP_EVT_CTRL_RESET_RUH]            = 2,
    [FDP_EVT_INVALID_PID]               = 3,
    /* CTRL events */
    [FDP_EVT_MEDIA_REALLOC]             = 32,
    [FDP_EVT_RUH_IMPLICIT_RU_CHANGE]    = 33,
};

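/* Per-namespace configuration, set through the nvme-ns device properties. */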
typedef struct NvmeNamespaceParams {
    bool     detached;
    bool     shared;
    uint32_t nsid;
    QemuUUID uuid;
    uint64_t eui64;
    bool     eui64_default;

    uint16_t ms;
    uint8_t  mset;
    uint8_t  pi;
    uint8_t  pil;
    uint8_t  pif;

    uint16_t mssrl;
    uint32_t mcl;
    uint8_t  msrc;

    bool     zoned;
    bool     cross_zone_read;
    uint64_t zone_size_bs;
    uint64_t zone_cap_bs;
    uint32_t max_active_zones;
    uint32_t max_open_zones;
    uint32_t zd_extension_size;

    uint32_t numzrwa;
    uint64_t zrwas;
    uint64_t zrwafg;

    struct {
        char *ruhs;
    } fdp;
} NvmeNamespaceParams;

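/*
 * Runtime namespace state. The backing image holds LBA data first; any
 * out-of-band per-LBA metadata follows at byte offset moff (see
 * nvme_moff() below).
 */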
typedef struct NvmeNamespace {
    DeviceState  parent_obj;
    BlockConf    blkconf;
    int32_t      bootindex;
    int64_t      size;
    int64_t      moff;
    NvmeIdNs     id_ns;
    NvmeIdNsNvm  id_ns_nvm;
    NvmeLBAF     lbaf;
    unsigned int nlbaf;
    size_t       lbasz;
    const uint32_t *iocs;
    uint8_t      csi;
    uint16_t     status;
    int          attached;
    uint8_t      pif;

    struct {
        uint16_t zrwas;
        uint16_t zrwafg;
        uint32_t numzrwa;
    } zns;

    QTAILQ_ENTRY(NvmeNamespace) entry;

    NvmeIdNsZoned *id_ns_zoned;
    NvmeZone      *zone_array;
    QTAILQ_HEAD(, NvmeZone) exp_open_zones;
    QTAILQ_HEAD(, NvmeZone) imp_open_zones;
    QTAILQ_HEAD(, NvmeZone) closed_zones;
    QTAILQ_HEAD(, NvmeZone) full_zones;
    uint32_t      num_zones;
    uint64_t      zone_size;
    uint64_t      zone_capacity;
    uint32_t      zone_size_log2;
    uint8_t       *zd_extensions;
    int32_t       nr_open_zones;
    int32_t       nr_active_zones;

    NvmeNamespaceParams params;
    NvmeSubsystem       *subsys;
    NvmeEnduranceGroup  *endgrp;

    struct {
        uint32_t err_rec;
    } features;

    struct {
        uint16_t nphs;
        /* reclaim unit handle identifiers indexed by placement handle */
        uint16_t *phs;
    } fdp;
} NvmeNamespace;

static inline uint32_t nvme_nsid(NvmeNamespace *ns)
{
    if (ns) {
        return ns->params.nsid;
    }

    return 0;
}

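/*
 * Convert an LBA count to bytes of data and metadata respectively. For
 * example, with 4 KiB logical blocks (lbaf.ds == 12) and 16 bytes of
 * per-LBA metadata (lbaf.ms == 16), nvme_l2b(ns, 8) is 32768 and
 * nvme_m2b(ns, 8) is 128.
 */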
static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
{
    return lba << ns->lbaf.ds;
}

static inline size_t nvme_m2b(NvmeNamespace *ns, uint64_t lba)
{
    return ns->lbaf.ms * lba;
}

static inline int64_t nvme_moff(NvmeNamespace *ns, uint64_t lba)
{
    return ns->moff + nvme_m2b(ns, lba);
}

static inline bool nvme_ns_ext(NvmeNamespace *ns)
{
    return !!NVME_ID_NS_FLBAS_EXTENDED(ns->id_ns.flbas);
}

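/* The zone state is kept in the upper nibble of the descriptor's zs field. */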
static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
{
    return zone->d.zs >> 4;
}

static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state)
{
    zone->d.zs = state << 4;
}

static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
{
    return zone->d.zslba + ns->zone_size;
}

static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
{
    return zone->d.zslba + zone->d.zcap;
}

static inline bool nvme_wp_is_valid(NvmeZone *zone)
{
    uint8_t st = nvme_get_zone_state(zone);

    return st != NVME_ZONE_STATE_FULL &&
           st != NVME_ZONE_STATE_READ_ONLY &&
           st != NVME_ZONE_STATE_OFFLINE;
}

static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
                                             uint32_t zone_idx)
{
    return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
}

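/*
 * Active/open zone resource accounting; the limits are only enforced when
 * the max_open_zones/max_active_zones parameters are nonzero.
 */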
static inline void nvme_aor_inc_open(NvmeNamespace *ns)
{
    assert(ns->nr_open_zones >= 0);
    if (ns->params.max_open_zones) {
        ns->nr_open_zones++;
        assert(ns->nr_open_zones <= ns->params.max_open_zones);
    }
}

static inline void nvme_aor_dec_open(NvmeNamespace *ns)
{
    if (ns->params.max_open_zones) {
        assert(ns->nr_open_zones > 0);
        ns->nr_open_zones--;
    }
    assert(ns->nr_open_zones >= 0);
}

static inline void nvme_aor_inc_active(NvmeNamespace *ns)
{
    assert(ns->nr_active_zones >= 0);
    if (ns->params.max_active_zones) {
        ns->nr_active_zones++;
        assert(ns->nr_active_zones <= ns->params.max_active_zones);
    }
}

static inline void nvme_aor_dec_active(NvmeNamespace *ns)
{
    if (ns->params.max_active_zones) {
        assert(ns->nr_active_zones > 0);
        ns->nr_active_zones--;
        assert(ns->nr_active_zones >= ns->nr_open_zones);
    }
    assert(ns->nr_active_zones >= 0);
}

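/* Saturating add for FDP statistics: clamps to UINT64_MAX on overflow. */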
static inline void nvme_fdp_stat_inc(uint64_t *a, uint64_t b)
{
    uint64_t ret = *a + b;
    *a = ret < *a ? UINT64_MAX : ret;
}

void nvme_ns_init_format(NvmeNamespace *ns);
int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
void nvme_ns_drain(NvmeNamespace *ns);
void nvme_ns_shutdown(NvmeNamespace *ns);
void nvme_ns_cleanup(NvmeNamespace *ns);

typedef struct NvmeAsyncEvent {
    QTAILQ_ENTRY(NvmeAsyncEvent) entry;
    NvmeAerResult result;
} NvmeAsyncEvent;

enum {
    NVME_SG_ALLOC = 1 << 0,
    NVME_SG_DMA = 1 << 1,
};

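/*
 * Data attached to a request, kept either as a DMA scatter/gather list
 * (NVME_SG_DMA set) or as an I/O vector when the transfer targets
 * controller memory.
 */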
typedef struct NvmeSg {
    int flags;

    union {
        QEMUSGList   qsg;
        QEMUIOVector iov;
    };
} NvmeSg;

typedef enum NvmeTxDirection {
    NVME_TX_DIRECTION_TO_DEVICE = 0,
    NVME_TX_DIRECTION_FROM_DEVICE = 1,
} NvmeTxDirection;

typedef struct NvmeRequest {
    struct NvmeSQueue    *sq;
    struct NvmeNamespace *ns;
    BlockAIOCB           *aiocb;
    uint16_t             status;
    void                 *opaque;
    NvmeCqe              cqe;
    NvmeCmd              cmd;
    BlockAcctCookie      acct;
    NvmeSg               sg;
    QTAILQ_ENTRY(NvmeRequest) entry;
} NvmeRequest;

typedef struct NvmeBounceContext {
    NvmeRequest *req;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data, mdata;
} NvmeBounceContext;

static inline const char *nvme_adm_opc_str(uint8_t opc)
{
    switch (opc) {
    case NVME_ADM_CMD_DELETE_SQ:        return "NVME_ADM_CMD_DELETE_SQ";
    case NVME_ADM_CMD_CREATE_SQ:        return "NVME_ADM_CMD_CREATE_SQ";
    case NVME_ADM_CMD_GET_LOG_PAGE:     return "NVME_ADM_CMD_GET_LOG_PAGE";
    case NVME_ADM_CMD_DELETE_CQ:        return "NVME_ADM_CMD_DELETE_CQ";
    case NVME_ADM_CMD_CREATE_CQ:        return "NVME_ADM_CMD_CREATE_CQ";
    case NVME_ADM_CMD_IDENTIFY:         return "NVME_ADM_CMD_IDENTIFY";
    case NVME_ADM_CMD_ABORT:            return "NVME_ADM_CMD_ABORT";
    case NVME_ADM_CMD_SET_FEATURES:     return "NVME_ADM_CMD_SET_FEATURES";
    case NVME_ADM_CMD_GET_FEATURES:     return "NVME_ADM_CMD_GET_FEATURES";
    case NVME_ADM_CMD_ASYNC_EV_REQ:     return "NVME_ADM_CMD_ASYNC_EV_REQ";
    case NVME_ADM_CMD_NS_ATTACHMENT:    return "NVME_ADM_CMD_NS_ATTACHMENT";
    case NVME_ADM_CMD_DIRECTIVE_SEND:   return "NVME_ADM_CMD_DIRECTIVE_SEND";
    case NVME_ADM_CMD_VIRT_MNGMT:       return "NVME_ADM_CMD_VIRT_MNGMT";
    case NVME_ADM_CMD_DIRECTIVE_RECV:   return "NVME_ADM_CMD_DIRECTIVE_RECV";
    case NVME_ADM_CMD_DBBUF_CONFIG:     return "NVME_ADM_CMD_DBBUF_CONFIG";
    case NVME_ADM_CMD_FORMAT_NVM:       return "NVME_ADM_CMD_FORMAT_NVM";
    default:                            return "NVME_ADM_CMD_UNKNOWN";
    }
}

static inline const char *nvme_io_opc_str(uint8_t opc)
{
    switch (opc) {
    case NVME_CMD_FLUSH:            return "NVME_NVM_CMD_FLUSH";
    case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
    case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
    case NVME_CMD_COMPARE:          return "NVME_NVM_CMD_COMPARE";
    case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
    case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
    case NVME_CMD_VERIFY:           return "NVME_NVM_CMD_VERIFY";
    case NVME_CMD_COPY:             return "NVME_NVM_CMD_COPY";
    case NVME_CMD_ZONE_MGMT_SEND:   return "NVME_ZONED_CMD_MGMT_SEND";
    case NVME_CMD_ZONE_MGMT_RECV:   return "NVME_ZONED_CMD_MGMT_RECV";
    case NVME_CMD_ZONE_APPEND:      return "NVME_ZONED_CMD_ZONE_APPEND";
    default:                        return "NVME_NVM_CMD_UNKNOWN";
    }
}

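/*
 * Queue state. db_addr and ei_addr are the shadow doorbell and event
 * index buffer addresses established by the Doorbell Buffer Config
 * command; they are only consulted when dbbuf is enabled.
 */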
typedef struct NvmeSQueue {
    struct NvmeCtrl *ctrl;
    uint16_t    sqid;
    uint16_t    cqid;
    uint32_t    head;
    uint32_t    tail;
    uint32_t    size;
    uint64_t    dma_addr;
    uint64_t    db_addr;
    uint64_t    ei_addr;
    QEMUBH      *bh;
    EventNotifier notifier;
    bool        ioeventfd_enabled;
    NvmeRequest *io_req;
    QTAILQ_HEAD(, NvmeRequest) req_list;
    QTAILQ_HEAD(, NvmeRequest) out_req_list;
    QTAILQ_ENTRY(NvmeSQueue) entry;
} NvmeSQueue;

typedef struct NvmeCQueue {
    struct NvmeCtrl *ctrl;
    uint8_t     phase;
    uint16_t    cqid;
    uint16_t    irq_enabled;
    uint32_t    head;
    uint32_t    tail;
    uint32_t    vector;
    uint32_t    size;
    uint64_t    dma_addr;
    uint64_t    db_addr;
    uint64_t    ei_addr;
    QEMUBH      *bh;
    EventNotifier notifier;
    bool        ioeventfd_enabled;
    QTAILQ_HEAD(, NvmeSQueue) sq_list;
    QTAILQ_HEAD(, NvmeRequest) req_list;
} NvmeCQueue;

#define TYPE_NVME "nvme"
#define NVME(obj) \
    OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)

typedef struct NvmeParams {
    char     *serial;
    uint32_t num_queues; /* deprecated since 5.1 */
    uint32_t max_ioqpairs;
    uint16_t msix_qsize;
    uint32_t cmb_size_mb;
    uint8_t  aerl;
    uint32_t aer_max_queued;
    uint8_t  mdts;
    uint8_t  vsl;
    bool     use_intel_id;
    uint8_t  zasl;
    bool     auto_transition_zones;
    bool     legacy_cmb;
    bool     ioeventfd;
    uint8_t  sriov_max_vfs;
    uint16_t sriov_vq_flexible;
    uint16_t sriov_vi_flexible;
    uint8_t  sriov_max_vq_per_vf;
    uint8_t  sriov_max_vi_per_vf;
} NvmeParams;

typedef struct NvmeCtrl {
    PCIDevice    parent_obj;
    MemoryRegion bar0;
    MemoryRegion iomem;
    NvmeBar      bar;
    NvmeParams   params;
    NvmeBus      bus;

    uint16_t    cntlid;
    bool        qs_created;
    uint32_t    page_size;
    uint16_t    page_bits;
    uint16_t    max_prp_ents;
    uint16_t    cqe_size;
    uint16_t    sqe_size;
    uint32_t    max_q_ents;
    uint8_t     outstanding_aers;
    uint32_t    irq_status;
    int         cq_pending;
    uint64_t    host_timestamp;                 /* Timestamp sent by the host */
    uint64_t    timestamp_set_qemu_clock_ms;    /* QEMU clock time */
    uint64_t    starttime_ms;
    uint16_t    temperature;
    uint8_t     smart_critical_warning;
    uint32_t    conf_msix_qsize;
    uint32_t    conf_ioqpairs;
    uint64_t    dbbuf_dbs;
    uint64_t    dbbuf_eis;
    bool        dbbuf_enabled;

    struct {
        MemoryRegion mem;
        uint8_t      *buf;
        bool         cmse;
        hwaddr       cba;
    } cmb;

    struct {
        HostMemoryBackend *dev;
        bool              cmse;
        hwaddr            cba;
    } pmr;

    uint8_t     aer_mask;
    NvmeRequest **aer_reqs;
    QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
    int         aer_queued;

    uint32_t    dmrsl;

    /* Namespace ID is started with 1 so bitmap should be 1-based */
#define NVME_CHANGED_NSID_SIZE  (NVME_MAX_NAMESPACES + 1)
    DECLARE_BITMAP(changed_nsids, NVME_CHANGED_NSID_SIZE);

    NvmeSubsystem   *subsys;

    NvmeNamespace   namespace;
    NvmeNamespace   *namespaces[NVME_MAX_NAMESPACES + 1];
    NvmeSQueue      **sq;
    NvmeCQueue      **cq;
    NvmeSQueue      admin_sq;
    NvmeCQueue      admin_cq;
    NvmeIdCtrl      id_ctrl;

    struct {
        struct {
            uint16_t temp_thresh_hi;
            uint16_t temp_thresh_low;
        };

        uint32_t                async_config;
        NvmeHostBehaviorSupport hbs;
    } features;

    NvmePriCtrlCap  pri_ctrl_cap;
    NvmeSecCtrlList sec_ctrl_list;
    struct {
        uint16_t    vqrfap;
        uint16_t    virfap;
    } next_pri_ctrl_cap;    /* These override pri_ctrl_cap after reset */
} NvmeCtrl;

typedef enum NvmeResetType {
    NVME_RESET_FUNCTION   = 0,
    NVME_RESET_CONTROLLER = 1,
} NvmeResetType;

static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
{
    if (!nsid || nsid > NVME_MAX_NAMESPACES) {
        return NULL;
    }

    return n->namespaces[nsid];
}

static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
{
    NvmeSQueue *sq = req->sq;
    NvmeCtrl *n = sq->ctrl;

    return n->cq[sq->cqid];
}

static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
{
    NvmeSQueue *sq = req->sq;
    return sq->ctrl;
}

static inline uint16_t nvme_cid(NvmeRequest *req)
{
    if (!req) {
        return 0xffff;
    }

    return le16_to_cpu(req->cqe.cid);
}

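/*
 * For an SR-IOV virtual function, return its secondary controller entry
 * in the physical function's list; NULL for the physical function itself.
 */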
static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
{
    PCIDevice *pci_dev = &n->parent_obj;
    NvmeCtrl *pf = NVME(pcie_sriov_get_pf(pci_dev));

    if (pci_is_vf(pci_dev)) {
        return &pf->sec_ctrl_list.sec[pcie_sriov_vf_number(pci_dev)];
    }

    return NULL;
}

static inline NvmeSecCtrlEntry *nvme_sctrl_for_cntlid(NvmeCtrl *n,
                                                      uint16_t cntlid)
{
    NvmeSecCtrlList *list = &n->sec_ctrl_list;
    uint8_t i;

    for (i = 0; i < list->numcntl; i++) {
        if (le16_to_cpu(list->sec[i].scid) == cntlid) {
            return &list->sec[i];
        }
    }

    return NULL;
}

void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns);
uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req);
uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req);
void nvme_rw_complete_cb(void *opaque, int ret);
uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd);

#endif /* HW_NVME_NVME_H */