#ifndef HW_NVME_NVME_H
#define HW_NVME_NVME_H

#include "qemu/uuid.h"
#include "hw/pci/pci.h"
#include "hw/block/block.h"

#include "block/nvme.h"

#define NVME_MAX_CONTROLLERS 256
#define NVME_MAX_NAMESPACES 256
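/*
 * Default EUI-64 prefix for generated namespace identifiers; 52:54:00 is
 * the locally administered OUI conventionally used by QEMU.
 */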
#define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)

QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);

typedef struct NvmeCtrl NvmeCtrl;
typedef struct NvmeNamespace NvmeNamespace;

#define TYPE_NVME_BUS "nvme-bus"
OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)

typedef struct NvmeBus {
    BusState parent_bus;
} NvmeBus;

#define TYPE_NVME_SUBSYS "nvme-subsys"
#define NVME_SUBSYS(obj) \
    OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
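/*
 * Sentinel stored in ctrls[] to mark a controller ID as reserved (e.g. for
 * an SR-IOV secondary controller) while staying distinguishable from an
 * empty (NULL) slot.
 */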
#define SUBSYS_SLOT_RSVD (void *)0xFFFF

typedef struct NvmeSubsystem {
    DeviceState parent_obj;
    NvmeBus bus;
    uint8_t subnqn[256];
    char *serial;

    NvmeCtrl *ctrls[NVME_MAX_CONTROLLERS];
    NvmeNamespace *namespaces[NVME_MAX_NAMESPACES + 1];

    struct {
        char *nqn;
    } params;
} NvmeSubsystem;

int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp);
void nvme_subsys_unregister_ctrl(NvmeSubsystem *subsys, NvmeCtrl *n);

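/*
 * Look up the controller registered under @cntlid, returning NULL if the
 * ID is out of range, the slot is empty, or the slot is merely reserved.
 */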
static inline NvmeCtrl *nvme_subsys_ctrl(NvmeSubsystem *subsys,
                                         uint32_t cntlid)
{
    if (!subsys || cntlid >= NVME_MAX_CONTROLLERS) {
        return NULL;
    }

    if (subsys->ctrls[cntlid] == SUBSYS_SLOT_RSVD) {
        return NULL;
    }

    return subsys->ctrls[cntlid];
}

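/*
 * Look up a subsystem-attached (shared) namespace by NSID. NSIDs are
 * 1-based, so index 0 of namespaces[] is never used.
 */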
static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
                                            uint32_t nsid)
{
    if (!subsys || !nsid || nsid > NVME_MAX_NAMESPACES) {
        return NULL;
    }

    return subsys->namespaces[nsid];
}

#define TYPE_NVME_NS "nvme-ns"
#define NVME_NS(obj) \
    OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)

typedef struct NvmeZone {
    NvmeZoneDescr d;
    uint64_t w_ptr;
    QTAILQ_ENTRY(NvmeZone) entry;
} NvmeZone;

typedef struct NvmeNamespaceParams {
    bool detached;
    bool shared;
    uint32_t nsid;
    QemuUUID uuid;
    uint64_t eui64;
    bool eui64_default;

    uint16_t ms;
    uint8_t mset;
    uint8_t pi;
    uint8_t pil;
    uint8_t pif;

    uint16_t mssrl;
    uint32_t mcl;
    uint8_t msrc;

    bool zoned;
    bool cross_zone_read;
    uint64_t zone_size_bs;
    uint64_t zone_cap_bs;
    uint32_t max_active_zones;
    uint32_t max_open_zones;
    uint32_t zd_extension_size;

    uint32_t numzrwa;
    uint64_t zrwas;
    uint64_t zrwafg;
} NvmeNamespaceParams;

typedef struct NvmeNamespace {
    DeviceState parent_obj;
    BlockConf blkconf;
    int32_t bootindex;
    int64_t size;
    int64_t moff;
    NvmeIdNs id_ns;
    NvmeIdNsNvm id_ns_nvm;
    NvmeLBAF lbaf;
    unsigned int nlbaf;
    size_t lbasz;
    const uint32_t *iocs;
    uint8_t csi;
    uint16_t status;
    int attached;
    uint8_t pif;

    struct {
        uint16_t zrwas;
        uint16_t zrwafg;
        uint32_t numzrwa;
    } zns;

    QTAILQ_ENTRY(NvmeNamespace) entry;

    NvmeIdNsZoned *id_ns_zoned;
    NvmeZone *zone_array;
    QTAILQ_HEAD(, NvmeZone) exp_open_zones;
    QTAILQ_HEAD(, NvmeZone) imp_open_zones;
    QTAILQ_HEAD(, NvmeZone) closed_zones;
    QTAILQ_HEAD(, NvmeZone) full_zones;
    uint32_t num_zones;
    uint64_t zone_size;
    uint64_t zone_capacity;
    uint32_t zone_size_log2;
    uint8_t *zd_extensions;
    int32_t nr_open_zones;
    int32_t nr_active_zones;

    NvmeNamespaceParams params;

    struct {
        uint32_t err_rec;
    } features;
} NvmeNamespace;

static inline uint32_t nvme_nsid(NvmeNamespace *ns)
{
    if (ns) {
        return ns->params.nsid;
    }

    return 0;
}

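/*
 * Convert a count or address in logical blocks to bytes using the active
 * LBA format; e.g. with lbaf.ds == 9 (512-byte blocks), nvme_l2b(ns, 8)
 * yields 4096.
 */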
static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
{
    return lba << ns->lbaf.ds;
}

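/* Size in bytes of the metadata belonging to @lba logical blocks. */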
static inline size_t nvme_m2b(NvmeNamespace *ns, uint64_t lba)
{
    return ns->lbaf.ms * lba;
}

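/*
 * Byte offset of the metadata for @lba on the backing device; ns->moff is
 * where the out-of-band metadata area starts, after the data blocks.
 */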
static inline int64_t nvme_moff(NvmeNamespace *ns, uint64_t lba)
{
    return ns->moff + nvme_m2b(ns, lba);
}

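/*
 * True if the namespace uses extended LBAs, i.e. metadata is transferred
 * contiguously with the logical block data rather than in a separate
 * buffer.
 */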
static inline bool nvme_ns_ext(NvmeNamespace *ns)
{
    return !!NVME_ID_NS_FLBAS_EXTENDED(ns->id_ns.flbas);
}

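/*
 * The zone state lives in the most significant nibble of the zone
 * descriptor's zs field, as defined by the Zoned Namespace command set.
 */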
static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
{
    return zone->d.zs >> 4;
}

static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state)
{
    zone->d.zs = state << 4;
}

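/*
 * Reads may range over the whole zone size, while writes are bounded by
 * the zone capacity (zcap is at most the zone size).
 */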
static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
{
    return zone->d.zslba + ns->zone_size;
}

static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
{
    return zone->d.zslba + zone->d.zcap;
}

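/*
 * The write pointer carries no meaning in the Full, Read Only and Offline
 * states, so it must not be used for write validation there.
 */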
static inline bool nvme_wp_is_valid(NvmeZone *zone)
{
    uint8_t st = nvme_get_zone_state(zone);

    return st != NVME_ZONE_STATE_FULL &&
           st != NVME_ZONE_STATE_READ_ONLY &&
           st != NVME_ZONE_STATE_OFFLINE;
}

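/*
 * Zone descriptor extensions are kept in one flat buffer, indexed by zone
 * number in fixed-size slots of params.zd_extension_size bytes.
 */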
static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
                                             uint32_t zone_idx)
{
    return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
}

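/*
 * Active/Open Resources (AOR) accounting for zoned namespaces. The
 * counters are only maintained and checked when the corresponding
 * max_open_zones/max_active_zones limit is configured; zero means no
 * limit is enforced.
 */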
static inline void nvme_aor_inc_open(NvmeNamespace *ns)
{
    assert(ns->nr_open_zones >= 0);
    if (ns->params.max_open_zones) {
        ns->nr_open_zones++;
        assert(ns->nr_open_zones <= ns->params.max_open_zones);
    }
}

static inline void nvme_aor_dec_open(NvmeNamespace *ns)
{
    if (ns->params.max_open_zones) {
        assert(ns->nr_open_zones > 0);
        ns->nr_open_zones--;
    }
    assert(ns->nr_open_zones >= 0);
}

static inline void nvme_aor_inc_active(NvmeNamespace *ns)
{
    assert(ns->nr_active_zones >= 0);
    if (ns->params.max_active_zones) {
        ns->nr_active_zones++;
        assert(ns->nr_active_zones <= ns->params.max_active_zones);
    }
}

static inline void nvme_aor_dec_active(NvmeNamespace *ns)
{
    if (ns->params.max_active_zones) {
        assert(ns->nr_active_zones > 0);
        ns->nr_active_zones--;
        assert(ns->nr_active_zones >= ns->nr_open_zones);
    }
    assert(ns->nr_active_zones >= 0);
}

void nvme_ns_init_format(NvmeNamespace *ns);
int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
void nvme_ns_drain(NvmeNamespace *ns);
void nvme_ns_shutdown(NvmeNamespace *ns);
void nvme_ns_cleanup(NvmeNamespace *ns);

typedef struct NvmeAsyncEvent {
    QTAILQ_ENTRY(NvmeAsyncEvent) entry;
    NvmeAerResult result;
} NvmeAsyncEvent;

enum {
    NVME_SG_ALLOC = 1 << 0,
    NVME_SG_DMA = 1 << 1,
};

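/*
 * A request's data is mapped either as a scatter/gather list of guest
 * physical addresses (DMA) or as an iovec of host pointers (e.g. when the
 * data resides in the CMB); the flags field records which union member is
 * live.
 */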
typedef struct NvmeSg {
    int flags;

    union {
        QEMUSGList qsg;
        QEMUIOVector iov;
    };
} NvmeSg;

typedef enum NvmeTxDirection {
    NVME_TX_DIRECTION_TO_DEVICE = 0,
    NVME_TX_DIRECTION_FROM_DEVICE = 1,
} NvmeTxDirection;

typedef struct NvmeRequest {
    struct NvmeSQueue *sq;
    struct NvmeNamespace *ns;
    BlockAIOCB *aiocb;
    uint16_t status;
    void *opaque;
    NvmeCqe cqe;
    NvmeCmd cmd;
    BlockAcctCookie acct;
    NvmeSg sg;
    QTAILQ_ENTRY(NvmeRequest) entry;
} NvmeRequest;

typedef struct NvmeBounceContext {
    NvmeRequest *req;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data, mdata;
} NvmeBounceContext;

static inline const char *nvme_adm_opc_str(uint8_t opc)
{
    switch (opc) {
    case NVME_ADM_CMD_DELETE_SQ: return "NVME_ADM_CMD_DELETE_SQ";
    case NVME_ADM_CMD_CREATE_SQ: return "NVME_ADM_CMD_CREATE_SQ";
    case NVME_ADM_CMD_GET_LOG_PAGE: return "NVME_ADM_CMD_GET_LOG_PAGE";
    case NVME_ADM_CMD_DELETE_CQ: return "NVME_ADM_CMD_DELETE_CQ";
    case NVME_ADM_CMD_CREATE_CQ: return "NVME_ADM_CMD_CREATE_CQ";
    case NVME_ADM_CMD_IDENTIFY: return "NVME_ADM_CMD_IDENTIFY";
    case NVME_ADM_CMD_ABORT: return "NVME_ADM_CMD_ABORT";
    case NVME_ADM_CMD_SET_FEATURES: return "NVME_ADM_CMD_SET_FEATURES";
    case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES";
    case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
    case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT";
    case NVME_ADM_CMD_VIRT_MNGMT: return "NVME_ADM_CMD_VIRT_MNGMT";
    case NVME_ADM_CMD_DBBUF_CONFIG: return "NVME_ADM_CMD_DBBUF_CONFIG";
    case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM";
    default: return "NVME_ADM_CMD_UNKNOWN";
    }
}

static inline const char *nvme_io_opc_str(uint8_t opc)
{
    switch (opc) {
    case NVME_CMD_FLUSH: return "NVME_NVM_CMD_FLUSH";
    case NVME_CMD_WRITE: return "NVME_NVM_CMD_WRITE";
    case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
    case NVME_CMD_COMPARE: return "NVME_NVM_CMD_COMPARE";
    case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
    case NVME_CMD_DSM: return "NVME_NVM_CMD_DSM";
    case NVME_CMD_VERIFY: return "NVME_NVM_CMD_VERIFY";
    case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY";
    case NVME_CMD_ZONE_MGMT_SEND: return "NVME_ZONED_CMD_MGMT_SEND";
    case NVME_CMD_ZONE_MGMT_RECV: return "NVME_ZONED_CMD_MGMT_RECV";
    case NVME_CMD_ZONE_APPEND: return "NVME_ZONED_CMD_ZONE_APPEND";
    default: return "NVME_NVM_CMD_UNKNOWN";
    }
}

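/*
 * db_addr/ei_addr point at the shadow doorbell and event index slots set
 * up by the Doorbell Buffer Config admin command, when the host enables
 * that feature.
 */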
typedef struct NvmeSQueue {
    struct NvmeCtrl *ctrl;
    uint16_t sqid;
    uint16_t cqid;
    uint32_t head;
    uint32_t tail;
    uint32_t size;
    uint64_t dma_addr;
    uint64_t db_addr;
    uint64_t ei_addr;
    QEMUTimer *timer;
    EventNotifier notifier;
    bool ioeventfd_enabled;
    NvmeRequest *io_req;
    QTAILQ_HEAD(, NvmeRequest) req_list;
    QTAILQ_HEAD(, NvmeRequest) out_req_list;
    QTAILQ_ENTRY(NvmeSQueue) entry;
} NvmeSQueue;

typedef struct NvmeCQueue {
    struct NvmeCtrl *ctrl;
    uint8_t phase;
    uint16_t cqid;
    uint16_t irq_enabled;
    uint32_t head;
    uint32_t tail;
    uint32_t vector;
    uint32_t size;
    uint64_t dma_addr;
    uint64_t db_addr;
    uint64_t ei_addr;
    QEMUTimer *timer;
    EventNotifier notifier;
    bool ioeventfd_enabled;
    QTAILQ_HEAD(, NvmeSQueue) sq_list;
    QTAILQ_HEAD(, NvmeRequest) req_list;
} NvmeCQueue;

#define TYPE_NVME "nvme"
#define NVME(obj) \
    OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)

typedef struct NvmeParams {
    char *serial;
    uint32_t num_queues;
    uint32_t max_ioqpairs;
    uint16_t msix_qsize;
    uint32_t cmb_size_mb;
    uint8_t aerl;
    uint32_t aer_max_queued;
    uint8_t mdts;
    uint8_t vsl;
    bool use_intel_id;
    uint8_t zasl;
    bool auto_transition_zones;
    bool legacy_cmb;
    bool ioeventfd;
    uint8_t sriov_max_vfs;
    uint16_t sriov_vq_flexible;
    uint16_t sriov_vi_flexible;
    uint8_t sriov_max_vq_per_vf;
    uint8_t sriov_max_vi_per_vf;
} NvmeParams;

typedef struct NvmeCtrl {
    PCIDevice parent_obj;
    MemoryRegion bar0;
    MemoryRegion iomem;
    NvmeBar bar;
    NvmeParams params;
    NvmeBus bus;

    uint16_t cntlid;
    bool qs_created;
    uint32_t page_size;
    uint16_t page_bits;
    uint16_t max_prp_ents;
    uint16_t cqe_size;
    uint16_t sqe_size;
    uint32_t max_q_ents;
    uint8_t outstanding_aers;
    uint32_t irq_status;
    int cq_pending;
    uint64_t host_timestamp;
    uint64_t timestamp_set_qemu_clock_ms;
    uint64_t starttime_ms;
    uint16_t temperature;
    uint8_t smart_critical_warning;
    uint32_t conf_msix_qsize;
    uint32_t conf_ioqpairs;
    uint64_t dbbuf_dbs;
    uint64_t dbbuf_eis;
    bool dbbuf_enabled;

    struct {
        MemoryRegion mem;
        uint8_t *buf;
        bool cmse;
        hwaddr cba;
    } cmb;

    struct {
        HostMemoryBackend *dev;
        bool cmse;
        hwaddr cba;
    } pmr;

    uint8_t aer_mask;
    NvmeRequest **aer_reqs;
    QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
    int aer_queued;

    uint32_t dmrsl;

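    /* NSIDs are 1-based, so the changed-NSID bitmap needs one extra slot */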
#define NVME_CHANGED_NSID_SIZE (NVME_MAX_NAMESPACES + 1)
    DECLARE_BITMAP(changed_nsids, NVME_CHANGED_NSID_SIZE);

    NvmeSubsystem *subsys;

    NvmeNamespace namespace;
    NvmeNamespace *namespaces[NVME_MAX_NAMESPACES + 1];
    NvmeSQueue **sq;
    NvmeCQueue **cq;
    NvmeSQueue admin_sq;
    NvmeCQueue admin_cq;
    NvmeIdCtrl id_ctrl;

    struct {
        struct {
            uint16_t temp_thresh_hi;
            uint16_t temp_thresh_low;
        };

        uint32_t async_config;
        NvmeHostBehaviorSupport hbs;
    } features;

    NvmePriCtrlCap pri_ctrl_cap;
    NvmeSecCtrlList sec_ctrl_list;
    struct {
        uint16_t vqrfap;
        uint16_t virfap;
    } next_pri_ctrl_cap;
} NvmeCtrl;

typedef enum NvmeResetType {
    NVME_RESET_FUNCTION = 0,
    NVME_RESET_CONTROLLER = 1,
} NvmeResetType;

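/*
 * Look up a namespace attached to this controller by its 1-based NSID;
 * returns NULL for NSID 0 or anything beyond NVME_MAX_NAMESPACES.
 */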
static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
{
    if (!nsid || nsid > NVME_MAX_NAMESPACES) {
        return NULL;
    }

    return n->namespaces[nsid];
}

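/* Resolve the completion queue a request will post to via its SQ's cqid. */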
static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
{
    NvmeSQueue *sq = req->sq;
    NvmeCtrl *n = sq->ctrl;

    return n->cq[sq->cqid];
}

static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
{
    NvmeSQueue *sq = req->sq;
    return sq->ctrl;
}

static inline uint16_t nvme_cid(NvmeRequest *req)
{
    if (!req) {
        return 0xffff;
    }

    return le16_to_cpu(req->cqe.cid);
}

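/*
 * For an SR-IOV virtual function, return its secondary controller entry in
 * the physical function's list; the PF itself has no such entry.
 */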
static inline NvmeSecCtrlEntry *nvme_sctrl(NvmeCtrl *n)
{
    PCIDevice *pci_dev = &n->parent_obj;
    NvmeCtrl *pf = NVME(pcie_sriov_get_pf(pci_dev));

    if (pci_is_vf(pci_dev)) {
        return &pf->sec_ctrl_list.sec[pcie_sriov_vf_number(pci_dev)];
    }

    return NULL;
}

static inline NvmeSecCtrlEntry *nvme_sctrl_for_cntlid(NvmeCtrl *n,
                                                      uint16_t cntlid)
{
    NvmeSecCtrlList *list = &n->sec_ctrl_list;
    uint8_t i;

    for (i = 0; i < list->numcntl; i++) {
        if (le16_to_cpu(list->sec[i].scid) == cntlid) {
            return &list->sec[i];
        }
    }

    return NULL;
}

void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns);
uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req);
uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req);
void nvme_rw_complete_cb(void *opaque, int ret);
uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd);

#endif