#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "qemu/range.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
#include "hw/pci/pcie_sriov.h"
#include "migration/vmstate.h"

#include "nvme.h"
#include "dif.h"
#include "trace.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
#define NVME_MAX_VFS 127
#define NVME_VF_RES_GRANULARITY 1
#define NVME_VF_OFFSET 0x1
#define NVME_VF_STRIDE 1

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)

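/*
 * Feature support and capabilities, indexed by Feature Identifier (FID):
 * nvme_feature_support marks the FIDs this controller implements, and
 * nvme_feature_cap records the capabilities (changeable, namespace
 * specific) reported for each of them.
 */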
static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION]              = true,
    [NVME_POWER_MANAGEMENT]         = true,
    [NVME_TEMPERATURE_THRESHOLD]    = true,
    [NVME_ERROR_RECOVERY]           = true,
    [NVME_VOLATILE_WRITE_CACHE]     = true,
    [NVME_NUMBER_OF_QUEUES]         = true,
    [NVME_INTERRUPT_COALESCING]     = true,
    [NVME_INTERRUPT_VECTOR_CONF]    = true,
    [NVME_WRITE_ATOMICITY]          = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
    [NVME_TIMESTAMP]                = true,
    [NVME_HOST_BEHAVIOR_SUPPORT]    = true,
    [NVME_COMMAND_SET_PROFILE]      = true,
    [NVME_FDP_MODE]                 = true,
    [NVME_FDP_EVENTS]               = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
    [NVME_HOST_BEHAVIOR_SUPPORT]    = NVME_FEAT_CAP_CHANGE,
    [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
    [NVME_FDP_MODE]                 = NVME_FEAT_CAP_CHANGE,
    [NVME_FDP_EVENTS]               = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
};

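/*
 * Commands Supported and Effects (CSE) log data, indexed by opcode: one
 * table for the admin command set and one per I/O command set.
 */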
static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_VIRT_MNGMT]       = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DBBUF_CONFIG]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_ADM_CMD_DIRECTIVE_RECV]   = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DIRECTIVE_SEND]   = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH]            = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]     = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]            = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]             = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]              = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]           = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]             = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]          = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_IO_MGMT_RECV]     = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_IO_MGMT_SEND]     = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH]            = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]     = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]            = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]             = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]              = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]           = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]             = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]          = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND]      = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND]   = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV]   = NVME_CMD_EFF_CSUPP,
};

static void nvme_process_sq(void *opaque);
static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);
}

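/*
 * An FDP placement identifier (PID) encodes the reclaim group in its upper
 * 'rgif' bits and the placement handle in the remaining lower bits. With no
 * reclaim group identifier bits (rgif == 0), the PID is just the placement
 * handle.
 */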
static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
                                     uint16_t ph)
{
    uint16_t rgif = ns->endgrp->fdp.rgif;

    if (!rgif) {
        return ph;
    }

    return (rg << (16 - rgif)) | ph;
}

static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
{
    return ph < ns->fdp.nphs;
}

static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
{
    return rg < endgrp->fdp.nrg;
}

static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
{
    uint16_t rgif = ns->endgrp->fdp.rgif;

    if (!rgif) {
        return pid;
    }

    /* mask off the reclaim group identifier in the upper rgif bits */
    return pid & ((1 << (16 - rgif)) - 1);
}

static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
{
    uint16_t rgif = ns->endgrp->fdp.rgif;

    if (!rgif) {
        return 0;
    }

    return pid >> (16 - rgif);
}

static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
                                  uint16_t *ph, uint16_t *rg)
{
    *rg = nvme_pid2rg(ns, pid);
    *ph = nvme_pid2ph(ns, pid);

    return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
}

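/*
 * Move a zone to the given state, removing it from whichever per-state list
 * it is currently on and inserting it on the list for the new state.
 */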
static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
{
    if (QTAILQ_IN_USE(zone, entry)) {
        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_CLOSED:
            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_FULL:
            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
            /* fallthrough */
        default:
            ;
        }
    }

    nvme_set_zone_state(zone, state);

    switch (state) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_CLOSED:
        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_FULL:
        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
        /* fallthrough */
    case NVME_ZONE_STATE_READ_ONLY:
        break;
    default:
        zone->d.za = 0;
    }
}

static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
                                         uint32_t opn, uint32_t zrwa)
{
    if (ns->params.max_active_zones != 0 &&
        ns->nr_active_zones + act > ns->params.max_active_zones) {
        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    }

    if (ns->params.max_open_zones != 0 &&
        ns->nr_open_zones + opn > ns->params.max_open_zones) {
        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    }

    if (zrwa > ns->zns.numzrwa) {
        return NVME_NOZRWA | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
{
    return nvme_zns_check_resources(ns, act, opn, 0);
}

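/*
 * Claim a slot in the circular FDP host event buffer, overwriting the
 * oldest event when the buffer is full.
 */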
static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
{
    NvmeFdpEvent *ret = NULL;
    bool is_full = ebuf->next == ebuf->start && ebuf->nelems;

    ret = &ebuf->events[ebuf->next++];
    if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
        ebuf->next = 0;
    }
    if (is_full) {
        ebuf->start = ebuf->next;
    } else {
        ebuf->nelems++;
    }

    memset(ret, 0, sizeof(NvmeFdpEvent));
    ret->timestamp = nvme_get_timestamp(n);

    return ret;
}

static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
{
    return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
}

static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
{
    NvmeEnduranceGroup *endgrp = ns->endgrp;
    NvmeRuHandle *ruh;
    NvmeReclaimUnit *ru;
    NvmeFdpEvent *e = NULL;
    uint16_t ph, rg, ruhid;

    if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
        return false;
    }

    ruhid = ns->fdp.phs[ph];

    ruh = &endgrp->fdp.ruhs[ruhid];
    ru = &ruh->rus[rg];

    if (ru->ruamw) {
        if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
            e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
            e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
            e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
            e->pid = cpu_to_le16(pid);
            e->nsid = cpu_to_le32(ns->params.nsid);
            e->rgid = cpu_to_le16(rg);
            e->ruhid = cpu_to_le16(ruhid);
        }

        nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
    }

    ru->ruamw = ruh->ruamw;

    return true;
}

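/*
 * Helpers for checking whether a guest physical address lies within the
 * Controller Memory Buffer (CMB) or the Persistent Memory Region (PMR),
 * and for translating such addresses into host pointers.
 */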
static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    if (!n->cmb.cmse) {
        return false;
    }

    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    hi = lo + int128_get64(n->cmb.mem.size);

    return addr >= lo && addr < hi;
}

static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    return &n->cmb.buf[addr - base];
}

static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi;

    if (!n->pmr.cmse) {
        return false;
    }

    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);

    return addr >= n->pmr.cba && addr < hi;
}

static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
{
    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}

static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    /*
     * Guard against invalid "local" access to the iomem (i.e. the controller
     * registers) by checking against the range covered by the 'bar0' memory
     * region.
     */
    lo = n->bar0.addr;
    hi = lo + int128_get64(n->bar0.size);

    return addr >= lo && addr < hi;
}

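/*
 * Copy to/from a guest physical address range, servicing the access from
 * the CMB or PMR directly when the entire range lies within one of them,
 * and falling back to PCI DMA otherwise.
 */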
static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
        return 0;
    }

    return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
}

static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid &&
        (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_irq_check(NvmeCtrl *n)
{
    PCIDevice *pci = PCI_DEVICE(n);
    uint32_t intms = ldl_le_p(&n->bar.intms);

    if (msix_enabled(pci)) {
        return;
    }
    if (~intms & n->irq_status) {
        pci_irq_assert(pci);
    } else {
        pci_irq_deassert(pci);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    PCIDevice *pci = PCI_DEVICE(n);

    if (cq->irq_enabled) {
        if (msix_enabled(pci)) {
            trace_pci_nvme_irq_msix(cq->vector);
            msix_notify(pci, cq->vector);
        } else {
            trace_pci_nvme_irq_pin();
            assert(cq->vector < 32);
            n->irq_status |= 1 << cq->vector;
            nvme_irq_check(n);
        }
    } else {
        trace_pci_nvme_irq_masked();
    }
}

static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(PCI_DEVICE(n))) {
            return;
        } else {
            assert(cq->vector < 32);
            if (!n->cq_pending) {
                n->irq_status &= ~(1 << cq->vector);
            }
            nvme_irq_check(n);
        }
    }
}

static void nvme_req_clear(NvmeRequest *req)
{
    req->ns = NULL;
    req->opaque = NULL;
    req->aiocb = NULL;
    memset(&req->cqe, 0x0, sizeof(req->cqe));
    req->status = NVME_SUCCESS;
}

static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
    if (dma) {
        pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
        sg->flags = NVME_SG_DMA;
    } else {
        qemu_iovec_init(&sg->iov, 0);
    }

    sg->flags |= NVME_SG_ALLOC;
}

static inline void nvme_sg_unmap(NvmeSg *sg)
{
    if (!(sg->flags & NVME_SG_ALLOC)) {
        return;
    }

    if (sg->flags & NVME_SG_DMA) {
        qemu_sglist_destroy(&sg->qsg);
    } else {
        qemu_iovec_destroy(&sg->iov);
    }

    memset(sg, 0x0, sizeof(*sg));
}

/*
 * Split an interleaved (extended LBA) scatter/gather list into a data sglist
 * and a metadata sglist, alternating between the two in units of the logical
 * block size and the metadata size, respectively. Either output may be NULL
 * if that half of the split is not needed.
 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
{
    NvmeSg *dst = data;
    uint32_t trans_len, count = ns->lbasz;
    uint64_t offset = 0;
    bool dma = sg->flags & NVME_SG_DMA;
    size_t sge_len;
    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    int sg_idx = 0;

    assert(sg->flags & NVME_SG_ALLOC);

    while (sg_len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        trans_len = MIN(sg_len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dst) {
            if (dma) {
                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
                                trans_len);
            } else {
                qemu_iovec_add(&dst->iov,
                               sg->iov.iov[sg_idx].iov_base + offset,
                               trans_len);
            }
        }

        sg_len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        /* switch between data and metadata when a full unit is consumed */
        if (count == 0) {
            dst = (dst == data) ? mdata : data;
            count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
        }
    }
}

static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr_cmb(addr, len);

    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
{
    bool cmb = false, pmr = false;

    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr(addr, len);

    if (nvme_addr_is_iomem(n, addr)) {
        return NVME_DATA_TRAS_ERROR;
    }

    if (nvme_addr_is_cmb(n, addr)) {
        cmb = true;
    } else if (nvme_addr_is_pmr(n, addr)) {
        pmr = true;
    }

    if (cmb || pmr) {
        if (sg->flags & NVME_SG_DMA) {
            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
        }

        if (sg->iov.niov + 1 > IOV_MAX) {
            goto max_mappings_exceeded;
        }

        if (cmb) {
            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
        } else {
            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
        }
    }

    if (!(sg->flags & NVME_SG_DMA)) {
        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    }

    if (sg->qsg.nsg + 1 > IOV_MAX) {
        goto max_mappings_exceeded;
    }

    qemu_sglist_add(&sg->qsg, addr, len);

    return NVME_SUCCESS;

max_mappings_exceeded:
    NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
                   "number of mappings exceeds 1024");
    return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
}

static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
{
    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
}

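/*
 * Map a PRP1/PRP2 pair. PRP2 is either a second data page or, for transfers
 * larger than two pages, a pointer to a (possibly chained) PRP list.
 */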
static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
                             uint64_t prp2, uint32_t len)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;
    uint16_t status;
    int ret;

    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));

    status = nvme_map_addr(n, sg, prp1, trans_len);
    if (status) {
        goto unmap;
    }

    len -= trans_len;
    if (len) {
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset; calculate the number of entries in the first list
             * page based on that offset.
             */
            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
            if (ret) {
                trace_pci_nvme_err_addr_read(prp2);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == nents - 1 && len > n->page_size) {
                    /* the last entry chains to another PRP list page */
                    if (unlikely(prp_ent & (n->page_size - 1))) {
                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    nents = MIN(nents, n->max_prp_ents);
                    prp_trans = nents * sizeof(uint64_t);
                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
                                         prp_trans);
                    if (ret) {
                        trace_pci_nvme_err_addr_read(prp_ent);
                        status = NVME_DATA_TRAS_ERROR;
                        goto unmap;
                    }
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (unlikely(prp_ent & (n->page_size - 1))) {
                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                status = nvme_map_addr(n, sg, prp_ent, trans_len);
                if (status) {
                    goto unmap;
                }

                len -= trans_len;
                i++;
            }
        } else {
            if (unlikely(prp2 & (n->page_size - 1))) {
                trace_pci_nvme_err_invalid_prp2_align(prp2);
                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                goto unmap;
            }
            status = nvme_map_addr(n, sg, prp2, len);
            if (status) {
                goto unmap;
            }
        }
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

/*
 * Map 'nsgld' data block descriptors from 'segment'. The number of bytes
 * mapped is subtracted from 'len'.
 */
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                  NvmeSglDescriptor *segment, uint64_t nsgld,
                                  size_t *len, NvmeCmd *cmd)
{
    dma_addr_t addr, trans_len;
    uint32_t dlen;
    uint16_t status;

    for (int i = 0; i < nsgld; i++) {
        uint8_t type = NVME_SGL_TYPE(segment[i].type);

        switch (type) {
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
            break;
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
        default:
            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
        }

        dlen = le32_to_cpu(segment[i].len);

        if (!dlen) {
            continue;
        }

        if (*len == 0) {
            /*
             * All data has been mapped, but the SGL contains additional
             * descriptors; if the controller advertises support for SGLs
             * with excess length, simply ignore the rest.
             */
            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
                break;
            }

            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        trans_len = MIN(*len, dlen);

        addr = le64_to_cpu(segment[i].addr);

        if (UINT64_MAX - addr < dlen) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        status = nvme_map_addr(n, sg, addr, trans_len);
        if (status) {
            return status;
        }

        *len -= trans_len;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
                             size_t len, NvmeCmd *cmd)
{
    /*
     * Read each segment in chunks of 256 descriptors (i.e. one 4 KiB page)
     * to avoid dynamically allocating a potentially large SGL; the SGL may
     * describe more bytes than the command transfers, so its size is not
     * bounded by MDTS.
     */
    const int SEG_CHUNK_SIZE = 256;

    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
    uint64_t nsgld;
    uint32_t seg_len;
    uint16_t status;
    hwaddr addr;
    int ret;

    sgld = &sgl;
    addr = le64_to_cpu(sgl.addr);

    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));

    /*
     * If the entire transfer can be described with a single data block, it
     * can be mapped directly.
     */
    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
        if (status) {
            goto unmap;
        }

        goto out;
    }

    for (;;) {
        switch (NVME_SGL_TYPE(sgld->type)) {
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            break;
        default:
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        seg_len = le32_to_cpu(sgld->len);

        /* check the length of the (Last) Segment descriptor */
        if (!seg_len || seg_len & 0xf) {
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        if (UINT64_MAX - addr < seg_len) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        nsgld = seg_len / sizeof(NvmeSglDescriptor);

        while (nsgld > SEG_CHUNK_SIZE) {
            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
                trace_pci_nvme_err_addr_read(addr);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }

            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
                                       &len, cmd);
            if (status) {
                goto unmap;
            }

            nsgld -= SEG_CHUNK_SIZE;
            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
        }

        ret = nvme_addr_read(n, addr, segment, nsgld *
                             sizeof(NvmeSglDescriptor));
        if (ret) {
            trace_pci_nvme_err_addr_read(addr);
            status = NVME_DATA_TRAS_ERROR;
            goto unmap;
        }

        last_sgld = &segment[nsgld - 1];

        /* if the segment ends with a Data Block, then we are done */
        if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
            if (status) {
                goto unmap;
            }

            goto out;
        }

        /*
         * If the last descriptor was not a Data Block, then the current
         * segment must not be a Last Segment.
         */
        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        sgld = last_sgld;
        addr = le64_to_cpu(sgld->addr);

        /*
         * Do not map the last descriptor; it is a (Last) Segment descriptor
         * pointing to the next segment and is handled by the next iteration.
         */
        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
        if (status) {
            goto unmap;
        }
    }

out:
    /* if there is any residual left in len, the SGL was too short */
    if (len) {
        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        goto unmap;
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd)
{
    uint64_t prp1, prp2;

    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
    case NVME_PSDT_PRP:
        prp1 = le64_to_cpu(cmd->dptr.prp1);
        prp2 = le64_to_cpu(cmd->dptr.prp2);

        return nvme_map_prp(n, sg, prp1, prp2, len);
    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
    case NVME_PSDT_SGL_MPTR_SGL:
        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
    default:
        return NVME_INVALID_FIELD;
    }
}

static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
{
    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    hwaddr mptr = le64_to_cpu(cmd->mptr);
    uint16_t status;

    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
        NvmeSglDescriptor sgl;

        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
            return NVME_DATA_TRAS_ERROR;
        }

        status = nvme_map_sgl(n, sg, sgl, len, cmd);
        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
        }

        return status;
    }

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
    status = nvme_map_addr(n, sg, mptr, len);
    if (status) {
        nvme_sg_unmap(sg);
    }

    return status;
}

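/*
 * Map the data (and, for extended LBAs, the interleaved metadata) referenced
 * by the command. When the namespace uses extended LBAs and the metadata is
 * not generated/stripped by the controller (PRACT with a metadata size equal
 * to the protection information tuple size), the interleaved mapping is
 * split so that req->sg covers only the data portion.
 */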
static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
    size_t len = nvme_l2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns) &&
        !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
        NvmeSg sg;

        len += nvme_m2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, &req->sg, NULL);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    size_t len = nvme_m2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_l2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, NULL, &req->sg);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
}

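/*
 * Transfer 'len' bytes between 'ptr' and the scatter/gather list, copying
 * 'bytes' bytes at a time and then skipping 'skip_bytes' within the sglist;
 * this accesses just the data or just the metadata portion of an
 * interleaved (extended LBA) buffer.
 */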
static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
{
    hwaddr addr;
    uint32_t trans_len, count = bytes;
    bool dma = sg->flags & NVME_SG_DMA;
    int64_t sge_len;
    int sg_idx = 0;
    int ret;

    assert(sg->flags & NVME_SG_ALLOC);

    while (len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        if (sge_len - offset < 0) {
            offset -= sge_len;
            sg_idx++;
            continue;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
            continue;
        }

        trans_len = MIN(len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dma) {
            addr = sg->qsg.sg[sg_idx].base + offset;
        } else {
            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
        }

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            ret = nvme_addr_read(n, addr, ptr, trans_len);
        } else {
            ret = nvme_addr_write(n, addr, ptr, trans_len);
        }

        if (ret) {
            return NVME_DATA_TRAS_ERROR;
        }

        ptr += trans_len;
        len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            count = bytes;
            offset += skip_bytes;
        }
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
                        NvmeTxDirection dir)
{
    assert(sg->flags & NVME_SG_ALLOC);

    if (sg->flags & NVME_SG_DMA) {
        const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
        dma_addr_t residual;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
        } else {
            dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
        }

        if (unlikely(residual)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    } else {
        size_t bytes;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
        } else {
            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
        }

        if (unlikely(bytes != len)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
}

static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
}

uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);

    if (nvme_ns_ext(ns) &&
        !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
                                   ns->lbaf.ms, 0, dir);
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
                                   ns->lbasz, ns->lbasz, dir);
    }

    nvme_sg_unmap(&req->sg);

    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 uint32_t align, BlockCompletionFunc *cb,
                                 NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
    } else {
        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
                                  uint32_t align, BlockCompletionFunc *cb,
                                  NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
    } else {
        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

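/*
 * With the shadow doorbell extension (DBBUF), the host publishes queue head
 * and tail pointers in guest memory; the controller reads the doorbell value
 * from the shadow buffer and writes back an event index.
 */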
static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
{
    trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);

    stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
                   MEMTXATTRS_UNSPECIFIED);
}

static void nvme_update_cq_head(NvmeCQueue *cq)
{
    ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
                   MEMTXATTRS_UNSPECIFIED);

    trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
}

static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;
    bool pending = cq->head != cq->tail;
    int ret;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (n->dbbuf_enabled) {
            nvme_update_cq_eventidx(cq);
            nvme_update_cq_head(cq);
        }

        if (nvme_cq_full(cq)) {
            break;
        }

        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + (cq->tail << NVME_CQES);
        ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
                            sizeof(req->cqe));
        if (ret) {
            trace_pci_nvme_err_addr_write(addr);
            trace_pci_nvme_err_cfs();
            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
            break;
        }
        QTAILQ_REMOVE(&cq->req_list, req, entry);
        nvme_inc_cq_tail(cq);
        nvme_sg_unmap(&req->sg);
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    if (cq->tail != cq->head) {
        if (cq->irq_enabled && !pending) {
            n->cq_pending++;
        }

        nvme_irq_assert(n, cq);
    }
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
                                          le32_to_cpu(req->cqe.result),
                                          le32_to_cpu(req->cqe.dw1),
                                          req->status);

    if (req->status) {
        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
                                      req->status, req->cmd.opcode);
    }

    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);

    qemu_bh_schedule(cq->bh);
}

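/*
 * Post queued asynchronous events to any outstanding Asynchronous Event
 * Request commands, one event per AER, skipping event types that are
 * currently masked.
 */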
static void nvme_process_aers(void *opaque)
{
    NvmeCtrl *n = opaque;
    NvmeAsyncEvent *event, *next;

    trace_pci_nvme_process_aers(n->aer_queued);

    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
        NvmeRequest *req;
        NvmeAerResult *result;

        /* can't post a cqe if there is nothing to complete */
        if (!n->outstanding_aers) {
            trace_pci_nvme_no_outstanding_aers();
            break;
        }

        /* ignore if masked (cqe posted, but event not cleared) */
        if (n->aer_mask & (1 << event->result.event_type)) {
            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
            continue;
        }

        QTAILQ_REMOVE(&n->aer_queue, event, entry);
        n->aer_queued--;

        n->aer_mask |= 1 << event->result.event_type;
        n->outstanding_aers--;

        req = n->aer_reqs[n->outstanding_aers];

        result = (NvmeAerResult *) &req->cqe.result;
        result->event_type = event->result.event_type;
        result->event_info = event->result.event_info;
        result->log_page = event->result.log_page;
        g_free(event);

        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
                                    result->log_page);

        nvme_enqueue_req_completion(&n->admin_cq, req);
    }
}

static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
                               uint8_t event_info, uint8_t log_page)
{
    NvmeAsyncEvent *event;

    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);

    if (n->aer_queued == n->params.aer_max_queued) {
        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
        return;
    }

    event = g_new(NvmeAsyncEvent, 1);
    event->result = (NvmeAerResult) {
        .event_type = event_type,
        .event_info = event_info,
        .log_page = log_page,
    };

    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
    n->aer_queued++;

    nvme_process_aers(n);
}

static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
{
    uint8_t aer_info;

    /* do not report the event if it is masked in the async event config */
    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
        return;
    }

    switch (event) {
    case NVME_SMART_SPARE:
        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
        break;
    case NVME_SMART_TEMPERATURE:
        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
        break;
    case NVME_SMART_RELIABILITY:
    case NVME_SMART_MEDIA_READ_ONLY:
    case NVME_SMART_FAILED_VOLATILE_MEDIA:
    case NVME_SMART_PMR_UNRELIABLE:
        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
        break;
    default:
        return;
    }

    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
}

static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
{
    n->aer_mask &= ~(1 << event_type);
    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }
}

static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
{
    uint8_t mdts = n->params.mdts;

    if (mdts && len > n->page_size << mdts) {
        trace_pci_nvme_err_mdts(len);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
                                         uint32_t nlb)
{
    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);

    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
        return NVME_LBA_RANGE | NVME_DNR;
    }

    return NVME_SUCCESS;
}

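/*
 * Check that all bytes in the LBA range have an allocation status matching
 * 'flags' (e.g. BDRV_BLOCK_DATA or BDRV_BLOCK_ZERO). Returns 0 when they
 * all match, 1 when some do not, and a negative errno on failure.
 */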
static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb, int flags)
{
    BlockDriverState *bs = blk_bs(ns->blkconf.blk);

    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    int ret;

    /*
     * bdrv_block_status reports in 'pnum' the number of bytes from 'offset'
     * that share the allocation status of the byte at 'offset'; loop until
     * the whole range has been covered or a non-matching status is found.
     */
    do {
        bytes -= pnum;

        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        trace_pci_nvme_block_status(offset, bytes, pnum, ret,
                                    !!(ret & BDRV_BLOCK_ZERO));

        if (!(ret & flags)) {
            return 1;
        }

        offset += pnum;
    } while (pnum != bytes);

    return 0;
}

static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb)
{
    int ret;
    Error *err = NULL;

    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
    if (ret) {
        if (ret < 0) {
            error_setg_errno(&err, -ret, "unable to get block status");
            error_report_err(err);

            return NVME_INTERNAL_DEV_ERROR;
        }

        return NVME_DULB;
    }

    return NVME_SUCCESS;
}

static void nvme_aio_err(NvmeRequest *req, int ret)
{
    uint16_t status = NVME_SUCCESS;
    Error *local_err = NULL;

    switch (req->cmd.opcode) {
    case NVME_CMD_READ:
        status = NVME_UNRECOVERED_READ;
        break;
    case NVME_CMD_FLUSH:
    case NVME_CMD_WRITE:
    case NVME_CMD_WRITE_ZEROES:
    case NVME_CMD_ZONE_APPEND:
    case NVME_CMD_COPY:
        status = NVME_WRITE_FAULT;
        break;
    default:
        status = NVME_INTERNAL_DEV_ERROR;
        break;
    }

    trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);

    error_setg_errno(&local_err, -ret, "aio failed");
    error_report_err(local_err);

    /*
     * Set the command status code to the first encountered error but allow
     * a subsequent Internal Device Error to trump it.
     */
    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
        return;
    }

    req->status = status;
}

static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
{
    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
                                    slba / ns->zone_size;
}

static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
{
    uint32_t zone_idx = nvme_zone_idx(ns, slba);

    if (zone_idx >= ns->num_zones) {
        return NULL;
    }

    return &ns->zone_array[zone_idx];
}

static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
{
    uint64_t zslba = zone->d.zslba;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_FULL:
        trace_pci_nvme_err_zone_is_full(zslba);
        return NVME_ZONE_FULL;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zslba);
        return NVME_ZONE_OFFLINE;
    case NVME_ZONE_STATE_READ_ONLY:
        trace_pci_nvme_err_zone_is_read_only(zslba);
        return NVME_ZONE_READ_ONLY;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
                                      uint64_t slba, uint32_t nlb)
{
    uint64_t zcap = nvme_zone_wr_boundary(zone);
    uint16_t status;

    status = nvme_check_zone_state_for_write(zone);
    if (status) {
        return status;
    }

    if (zone->d.za & NVME_ZA_ZRWA_VALID) {
        uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;

        if (slba < zone->w_ptr || slba + nlb > ezrwa) {
            trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
            return NVME_ZONE_INVALID_WRITE;
        }
    } else {
        if (unlikely(slba != zone->w_ptr)) {
            trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
                                               zone->w_ptr);
            return NVME_ZONE_INVALID_WRITE;
        }
    }

    if (unlikely((slba + nlb) > zcap)) {
        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
        return NVME_ZONE_BOUNDARY_ERROR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_FULL:
    case NVME_ZONE_STATE_CLOSED:
    case NVME_ZONE_STATE_READ_ONLY:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
        return NVME_ZONE_OFFLINE;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
                                     uint32_t nlb)
{
    NvmeZone *zone;
    uint64_t bndry, end;
    uint16_t status;

    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    bndry = nvme_zone_rd_boundary(ns, zone);
    end = slba + nlb;

    status = nvme_check_zone_state_for_read(zone);
    if (status) {
        ;
    } else if (unlikely(end > bndry)) {
        if (!ns->params.cross_zone_read) {
            status = NVME_ZONE_BOUNDARY_ERROR;
        } else {
            /*
             * Read across zone boundary; check that all subsequent zones
             * covered by the read have an appropriate state.
             */
            do {
                zone++;
                status = nvme_check_zone_state_for_read(zone);
                if (status) {
                    break;
                }
            } while (end > nvme_zone_rd_boundary(ns, zone));
        }
    }

    return status;
}

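/*
 * Zone Resource Management (ZRM): state transition helpers that keep the
 * active/open resource counts and the ZRWA resource count consistent across
 * zone state changes.
 */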
static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_FULL:
        return NVME_SUCCESS;

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);

        if (zone->d.za & NVME_ZA_ZRWA_VALID) {
            zone->d.za &= ~NVME_ZA_ZRWA_VALID;
            if (ns->params.numzrwa) {
                ns->zns.numzrwa++;
            }
        }

        /* fallthrough */
    case NVME_ZONE_STATE_EMPTY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);

        if (zone->d.za & NVME_ZA_ZRWA_VALID) {
            if (ns->params.numzrwa) {
                ns->zns.numzrwa++;
            }
        }

        /* fallthrough */
    case NVME_ZONE_STATE_FULL:
        zone->w_ptr = zone->d.zslba;
        zone->d.wp = zone->w_ptr;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        /* fallthrough */
    case NVME_ZONE_STATE_EMPTY:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
{
    NvmeZone *zone;

    if (ns->params.max_open_zones &&
        ns->nr_open_zones == ns->params.max_open_zones) {
        zone = QTAILQ_FIRST(&ns->imp_open_zones);
        if (zone) {
            /* automatically close the oldest implicitly open zone */
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            nvme_zrm_close(ns, zone);
        }
    }
}

enum {
    NVME_ZRM_AUTO = 1 << 0,
    NVME_ZRM_ZRWA = 1 << 1,
};

static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
                                    NvmeZone *zone, int flags)
{
    int act = 0;
    uint16_t status;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
        act = 1;

        /* fallthrough */

    case NVME_ZONE_STATE_CLOSED:
        if (n->params.auto_transition_zones) {
            nvme_zrm_auto_transition_zone(ns);
        }
        status = nvme_zns_check_resources(ns, act, 1,
                                          (flags & NVME_ZRM_ZRWA) ? 1 : 0);
        if (status) {
            return status;
        }

        if (act) {
            nvme_aor_inc_active(ns);
        }

        nvme_aor_inc_open(ns);

        if (flags & NVME_ZRM_AUTO) {
            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
            return NVME_SUCCESS;
        }

        /* fallthrough */

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        if (flags & NVME_ZRM_AUTO) {
            return NVME_SUCCESS;
        }

        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);

        /* fallthrough */

    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        if (flags & NVME_ZRM_ZRWA) {
            ns->zns.numzrwa--;

            zone->d.za |= NVME_ZA_ZRWA_VALID;
        }

        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
                                     NvmeZone *zone)
{
    return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
}

static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
                                 uint32_t nlb)
{
    zone->d.wp += nlb;

    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
        nvme_zrm_finish(ns, zone);
    }
}

static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
                                           uint32_t nlbc)
{
    uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);

    nlbc = nzrwafgs * ns->zns.zrwafg;

    trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);

    zone->w_ptr += nlbc;

    nvme_advance_zone_wp(ns, zone, nlbc);
}

static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeZone *zone;
    uint64_t slba;
    uint32_t nlb;

    slba = le64_to_cpu(rw->slba);
    nlb = le16_to_cpu(rw->nlb) + 1;
    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    if (zone->d.za & NVME_ZA_ZRWA_VALID) {
        uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
        uint64_t elba = slba + nlb - 1;

        if (elba > ezrwa) {
            nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
        }

        return;
    }

    nvme_advance_zone_wp(ns, zone, nlb);
}

static inline bool nvme_is_write(NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;

    return rw->opcode == NVME_CMD_WRITE ||
           rw->opcode == NVME_CMD_ZONE_APPEND ||
           rw->opcode == NVME_CMD_WRITE_ZEROES;
}

static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
{
    return qemu_get_aio_context();
}

static void nvme_misc_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;

    trace_pci_nvme_misc_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

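/*
 * Generic read/write completion: account the I/O, record any error status
 * and, for zoned namespaces, advance the zone write pointer before posting
 * the completion queue entry.
 */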
void nvme_rw_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    if (ns->params.zoned && nvme_is_write(req)) {
        nvme_finalize_zoned_write(ns, req);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    if (ns->lbaf.ms) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
        uint64_t offset = nvme_moff(ns, slba);

        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
            size_t mlen = nvme_m2b(ns, nlb);

            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
                                               BDRV_REQ_MAY_UNMAP,
                                               nvme_rw_complete_cb, req);
            return;
        }

        if (nvme_ns_ext(ns) || req->cmd.mptr) {
            uint16_t status;

            /* the data is done; kick off a second transfer for the metadata */
            nvme_sg_unmap(&req->sg);
            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
            if (status) {
                ret = -EFAULT;
                goto out;
            }

            if (req->cmd.opcode == NVME_CMD_READ) {
                return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
            }

            return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
        }
    }

out:
    nvme_rw_complete_cb(req, ret);
}

static void nvme_verify_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint64_t reftag = le32_to_cpu(rw->reftag);
    uint64_t cdw3 = le32_to_cpu(rw->cdw3);
    uint16_t status;

    reftag |= cdw3 << 32;

    trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(stats, acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
                                       ctx->mdata.iov.size, slba);
        if (status) {
            req->status = status;
            goto out;
        }

        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                     ctx->mdata.bounce, ctx->mdata.iov.size,
                                     prinfo, slba, apptag, appmask, &reftag);
    }

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_mdata_in_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t mlen = nvme_m2b(ns, nlb);
    uint64_t offset = nvme_moff(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_reset(&ctx->mdata.iov);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                nvme_verify_cb, ctx);
    return;

out:
    nvme_verify_cb(ctx, ret);
}

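/*
 * Bounce buffers for the Compare command: the data (and metadata, if any)
 * is first read into the context and then compared against the host
 * buffers in the callbacks below.
 */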
struct nvme_compare_ctx {
    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } mdata;
};

static void nvme_compare_mdata_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint64_t reftag = le32_to_cpu(rw->reftag);
    uint64_t cdw3 = le32_to_cpu(rw->cdw3);
    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    uint16_t status = NVME_SUCCESS;

    reftag |= cdw3 << 32;

    trace_pci_nvme_compare_mdata_cb(nvme_cid(req));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->mdata.iov.size);

    status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
                               NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        uint64_t slba = le64_to_cpu(rw->slba);
        uint8_t *bufp;
        uint8_t *mbufp = ctx->mdata.bounce;
        uint8_t *end = mbufp + ctx->mdata.iov.size;
        int16_t pil = 0;

        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
                                slba, apptag, appmask, &reftag);
        if (status) {
            req->status = status;
            goto out;
        }

        /*
         * When formatted with protection information, do not compare the
         * DIF tuple itself.
         */
        if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
            pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
        }

        for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
            if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
                req->status = NVME_CMP_FAILURE | NVME_DNR;
                goto out;
            }
        }

        goto out;
    }

    if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
        req->status = NVME_CMP_FAILURE | NVME_DNR;
        goto out;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

2412static void nvme_compare_data_cb(void *opaque, int ret)
2413{
2414 NvmeRequest *req = opaque;
2415 NvmeCtrl *n = nvme_ctrl(req);
2416 NvmeNamespace *ns = req->ns;
2417 BlockBackend *blk = ns->blkconf.blk;
2418 BlockAcctCookie *acct = &req->acct;
2419 BlockAcctStats *stats = blk_get_stats(blk);
2420
2421 struct nvme_compare_ctx *ctx = req->opaque;
2422 g_autofree uint8_t *buf = NULL;
2423 uint16_t status;
2424
2425 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2426
2427 if (ret) {
2428 block_acct_failed(stats, acct);
2429 nvme_aio_err(req, ret);
2430 goto out;
2431 }
2432
2433 buf = g_malloc(ctx->data.iov.size);
2434
2435 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2436 NVME_TX_DIRECTION_TO_DEVICE, req);
2437 if (status) {
2438 req->status = status;
2439 goto out;
2440 }
2441
2442 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2443 req->status = NVME_CMP_FAILURE | NVME_DNR;
2444 goto out;
2445 }
2446
2447 if (ns->lbaf.ms) {
2448 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2449 uint64_t slba = le64_to_cpu(rw->slba);
2450 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2451 size_t mlen = nvme_m2b(ns, nlb);
2452 uint64_t offset = nvme_moff(ns, slba);
2453
2454 ctx->mdata.bounce = g_malloc(mlen);
2455
2456 qemu_iovec_init(&ctx->mdata.iov, 1);
2457 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2458
2459 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2460 nvme_compare_mdata_cb, req);
2461 return;
2462 }
2463
2464 block_acct_done(stats, acct);
2465
2466out:
2467 qemu_iovec_destroy(&ctx->data.iov);
2468 g_free(ctx->data.bounce);
2469 g_free(ctx);
2470
2471 nvme_enqueue_req_completion(nvme_cq(req), req);
2472}
2473
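/*
 * Dataset Management deallocation runs as a callback chain: nvme_dsm_cb()
 * discards one range at a time and nvme_dsm_md_cb() zeroes the associated
 * metadata before advancing to the next range.
 */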
2474typedef struct NvmeDSMAIOCB {
2475 BlockAIOCB common;
2476 BlockAIOCB *aiocb;
2477 NvmeRequest *req;
2478 int ret;
2479
2480 NvmeDsmRange *range;
2481 unsigned int nr;
2482 unsigned int idx;
2483} NvmeDSMAIOCB;
2484
2485static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2486{
    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);

    /* break nvme_dsm_cb cycle */
2490 iocb->idx = iocb->nr;
2491 iocb->ret = -ECANCELED;
2492
2493 if (iocb->aiocb) {
2494 blk_aio_cancel_async(iocb->aiocb);
2495 iocb->aiocb = NULL;
    } else {
        /*
         * We only get here if nvme_dsm_cancel() has already been called or
         * the command ran to completion: no discard is in flight, so the
         * range iteration must already have stopped.
         */
2501 assert(iocb->idx == iocb->nr);
2502 }
2503}
2504
2505static const AIOCBInfo nvme_dsm_aiocb_info = {
2506 .aiocb_size = sizeof(NvmeDSMAIOCB),
2507 .cancel_async = nvme_dsm_cancel,
2508};
2509
2510static void nvme_dsm_cb(void *opaque, int ret);
2511
2512static void nvme_dsm_md_cb(void *opaque, int ret)
2513{
2514 NvmeDSMAIOCB *iocb = opaque;
2515 NvmeRequest *req = iocb->req;
2516 NvmeNamespace *ns = req->ns;
2517 NvmeDsmRange *range;
2518 uint64_t slba;
2519 uint32_t nlb;
2520
2521 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2522 goto done;
2523 }
2524
2525 range = &iocb->range[iocb->idx - 1];
2526 slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb);

    /*
     * Check that all blocks were discarded (zeroed); otherwise we do not
     * zero the metadata.
     */
2534 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2535 if (ret) {
2536 if (ret < 0) {
2537 goto done;
2538 }
2539
2540 nvme_dsm_cb(iocb, 0);
2541 return;
2542 }
2543
2544 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2545 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2546 nvme_dsm_cb, iocb);
2547 return;
2548
2549done:
2550 nvme_dsm_cb(iocb, ret);
2551}
2552
2553static void nvme_dsm_cb(void *opaque, int ret)
2554{
2555 NvmeDSMAIOCB *iocb = opaque;
2556 NvmeRequest *req = iocb->req;
2557 NvmeCtrl *n = nvme_ctrl(req);
2558 NvmeNamespace *ns = req->ns;
2559 NvmeDsmRange *range;
2560 uint64_t slba;
2561 uint32_t nlb;
2562
2563 if (iocb->ret < 0) {
2564 goto done;
2565 } else if (ret < 0) {
2566 iocb->ret = ret;
2567 goto done;
2568 }
2569
2570next:
2571 if (iocb->idx == iocb->nr) {
2572 goto done;
2573 }
2574
2575 range = &iocb->range[iocb->idx++];
2576 slba = le64_to_cpu(range->slba);
2577 nlb = le32_to_cpu(range->nlb);
2578
2579 trace_pci_nvme_dsm_deallocate(slba, nlb);
2580
2581 if (nlb > n->dmrsl) {
2582 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2583 goto next;
2584 }
2585
2586 if (nvme_check_bounds(ns, slba, nlb)) {
2587 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2588 ns->id_ns.nsze);
2589 goto next;
2590 }
2591
2592 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2593 nvme_l2b(ns, nlb),
2594 nvme_dsm_md_cb, iocb);
2595 return;
2596
2597done:
2598 iocb->aiocb = NULL;
2599 iocb->common.cb(iocb->common.opaque, iocb->ret);
2600 qemu_aio_unref(iocb);
2601}
2602
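/*
 * Only the Attribute - Deallocate (AD) bit is acted upon; the range
 * definitions are copied from the host and processed by the AIOCB
 * machinery above.
 */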
2603static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2604{
2605 NvmeNamespace *ns = req->ns;
2606 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2607 uint32_t attr = le32_to_cpu(dsm->attributes);
2608 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2609 uint16_t status = NVME_SUCCESS;
2610
2611 trace_pci_nvme_dsm(nr, attr);
2612
2613 if (attr & NVME_DSMGMT_AD) {
2614 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2615 nvme_misc_cb, req);
2616
2617 iocb->req = req;
2618 iocb->ret = 0;
2619 iocb->range = g_new(NvmeDsmRange, nr);
2620 iocb->nr = nr;
2621 iocb->idx = 0;
2622
2623 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2624 req);
2625 if (status) {
2626 g_free(iocb->range);
2627 qemu_aio_unref(iocb);
2628
2629 return status;
2630 }
2631
2632 req->aiocb = &iocb->common;
2633 nvme_dsm_cb(iocb, 0);
2634
2635 return NVME_NO_COMPLETE;
2636 }
2637
2638 return status;
2639}
2640
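/*
 * Verify reads the data (and metadata, if any) into controller bounce
 * buffers and runs the checks in nvme_verify_cb(); no data is transferred
 * to or from the host.
 */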
2641static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2642{
2643 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2644 NvmeNamespace *ns = req->ns;
2645 BlockBackend *blk = ns->blkconf.blk;
2646 uint64_t slba = le64_to_cpu(rw->slba);
2647 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2648 size_t len = nvme_l2b(ns, nlb);
2649 int64_t offset = nvme_l2b(ns, slba);
2650 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2651 uint32_t reftag = le32_to_cpu(rw->reftag);
2652 NvmeBounceContext *ctx = NULL;
2653 uint16_t status;
2654
2655 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2656
2657 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2658 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2659 if (status) {
2660 return status;
2661 }
2662
2663 if (prinfo & NVME_PRINFO_PRACT) {
2664 return NVME_INVALID_PROT_INFO | NVME_DNR;
2665 }
2666 }
2667
2668 if (len > n->page_size << n->params.vsl) {
2669 return NVME_INVALID_FIELD | NVME_DNR;
2670 }
2671
2672 status = nvme_check_bounds(ns, slba, nlb);
2673 if (status) {
2674 return status;
2675 }
2676
2677 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2678 status = nvme_check_dulbe(ns, slba, nlb);
2679 if (status) {
2680 return status;
2681 }
2682 }
2683
2684 ctx = g_new0(NvmeBounceContext, 1);
2685 ctx->req = req;
2686
2687 ctx->data.bounce = g_malloc(len);
2688
2689 qemu_iovec_init(&ctx->data.iov, 1);
2690 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2691
2692 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2693 BLOCK_ACCT_READ);
2694
2695 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2696 nvme_verify_mdata_in_cb, ctx);
2697 return NVME_NO_COMPLETE;
2698}
2699
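/*
 * Copy state machine: nvme_do_copy() reads the next source range into the
 * bounce buffer, the "in" callbacks verify protection information and
 * start the write to the destination, and the "out" callbacks advance the
 * destination position before looping back into nvme_do_copy().
 */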
2700typedef struct NvmeCopyAIOCB {
2701 BlockAIOCB common;
2702 BlockAIOCB *aiocb;
2703 NvmeRequest *req;
2704 int ret;
2705
2706 void *ranges;
2707 unsigned int format;
2708 int nr;
2709 int idx;
2710
2711 uint8_t *bounce;
2712 QEMUIOVector iov;
2713 struct {
2714 BlockAcctCookie read;
2715 BlockAcctCookie write;
2716 } acct;
2717
2718 uint64_t reftag;
2719 uint64_t slba;
2720
2721 NvmeZone *zone;
2722} NvmeCopyAIOCB;
2723
2724static void nvme_copy_cancel(BlockAIOCB *aiocb)
2725{
2726 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2727
2728 iocb->ret = -ECANCELED;
2729
2730 if (iocb->aiocb) {
2731 blk_aio_cancel_async(iocb->aiocb);
2732 iocb->aiocb = NULL;
2733 }
2734}
2735
2736static const AIOCBInfo nvme_copy_aiocb_info = {
2737 .aiocb_size = sizeof(NvmeCopyAIOCB),
2738 .cancel_async = nvme_copy_cancel,
2739};
2740
2741static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2742{
2743 NvmeRequest *req = iocb->req;
2744 NvmeNamespace *ns = req->ns;
2745 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2746
2747 if (iocb->idx != iocb->nr) {
2748 req->cqe.result = cpu_to_le32(iocb->idx);
2749 }
2750
2751 qemu_iovec_destroy(&iocb->iov);
2752 g_free(iocb->bounce);
2753
2754 if (iocb->ret < 0) {
2755 block_acct_failed(stats, &iocb->acct.read);
2756 block_acct_failed(stats, &iocb->acct.write);
2757 } else {
2758 block_acct_done(stats, &iocb->acct.read);
2759 block_acct_done(stats, &iocb->acct.write);
2760 }
2761
2762 iocb->common.cb(iocb->common.opaque, iocb->ret);
2763 qemu_aio_unref(iocb);
2764}
2765
2766static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2767
2768static void nvme_copy_source_range_parse_format0(void *ranges, int idx,
2769 uint64_t *slba, uint32_t *nlb,
2770 uint16_t *apptag,
2771 uint16_t *appmask,
2772 uint64_t *reftag)
2773{
2774 NvmeCopySourceRangeFormat0 *_ranges = ranges;
2775
2776 if (slba) {
2777 *slba = le64_to_cpu(_ranges[idx].slba);
2778 }
2779
2780 if (nlb) {
2781 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2782 }
2783
2784 if (apptag) {
2785 *apptag = le16_to_cpu(_ranges[idx].apptag);
2786 }
2787
2788 if (appmask) {
2789 *appmask = le16_to_cpu(_ranges[idx].appmask);
2790 }
2791
2792 if (reftag) {
2793 *reftag = le32_to_cpu(_ranges[idx].reftag);
2794 }
2795}
2796
2797static void nvme_copy_source_range_parse_format1(void *ranges, int idx,
2798 uint64_t *slba, uint32_t *nlb,
2799 uint16_t *apptag,
2800 uint16_t *appmask,
2801 uint64_t *reftag)
2802{
2803 NvmeCopySourceRangeFormat1 *_ranges = ranges;
2804
2805 if (slba) {
2806 *slba = le64_to_cpu(_ranges[idx].slba);
2807 }
2808
2809 if (nlb) {
2810 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2811 }
2812
2813 if (apptag) {
2814 *apptag = le16_to_cpu(_ranges[idx].apptag);
2815 }
2816
2817 if (appmask) {
2818 *appmask = le16_to_cpu(_ranges[idx].appmask);
2819 }
2820
2821 if (reftag) {
2822 *reftag = 0;
2823
2824 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2825 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2826 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2827 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2828 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2829 *reftag |= (uint64_t)_ranges[idx].sr[9];
2830 }
2831}
2832
2833static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2834 uint64_t *slba, uint32_t *nlb,
2835 uint16_t *apptag, uint16_t *appmask,
2836 uint64_t *reftag)
2837{
2838 switch (format) {
2839 case NVME_COPY_FORMAT_0:
2840 nvme_copy_source_range_parse_format0(ranges, idx, slba, nlb, apptag,
2841 appmask, reftag);
2842 break;
2843
2844 case NVME_COPY_FORMAT_1:
2845 nvme_copy_source_range_parse_format1(ranges, idx, slba, nlb, apptag,
2846 appmask, reftag);
2847 break;
2848
2849 default:
2850 abort();
2851 }
2852}
2853
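/* Sum all source range lengths and enforce the Maximum Copy Length (MCL). */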
2854static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2855 NvmeCopyAIOCB *iocb, uint16_t nr)
2856{
2857 uint32_t copy_len = 0;
2858
2859 for (int idx = 0; idx < nr; idx++) {
2860 uint32_t nlb;
2861 nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2862 &nlb, NULL, NULL, NULL);
        /* nvme_copy_source_range_parse() already returns a 1s-based count */
        copy_len += nlb;
2864 }
2865
    if (copy_len > le32_to_cpu(ns->id_ns.mcl)) {
2867 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2868 }
2869
2870 return NVME_SUCCESS;
2871}
2872
2873static void nvme_copy_out_completed_cb(void *opaque, int ret)
2874{
2875 NvmeCopyAIOCB *iocb = opaque;
2876 NvmeRequest *req = iocb->req;
2877 NvmeNamespace *ns = req->ns;
2878 uint32_t nlb;
2879
2880 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2881 &nlb, NULL, NULL, NULL);
2882
2883 if (ret < 0) {
2884 iocb->ret = ret;
2885 goto out;
2886 } else if (iocb->ret < 0) {
2887 goto out;
2888 }
2889
2890 if (ns->params.zoned) {
2891 nvme_advance_zone_wp(ns, iocb->zone, nlb);
2892 }
2893
2894 iocb->idx++;
2895 iocb->slba += nlb;
2896out:
2897 nvme_do_copy(iocb);
2898}
2899
2900static void nvme_copy_out_cb(void *opaque, int ret)
2901{
2902 NvmeCopyAIOCB *iocb = opaque;
2903 NvmeRequest *req = iocb->req;
2904 NvmeNamespace *ns = req->ns;
2905 uint32_t nlb;
2906 size_t mlen;
2907 uint8_t *mbounce;
2908
2909 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2910 goto out;
2911 }
2912
2913 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2914 &nlb, NULL, NULL, NULL);
2915
2916 mlen = nvme_m2b(ns, nlb);
2917 mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2918
2919 qemu_iovec_reset(&iocb->iov);
2920 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2921
2922 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2923 &iocb->iov, 0, nvme_copy_out_completed_cb,
2924 iocb);
2925
2926 return;
2927
2928out:
2929 nvme_copy_out_completed_cb(iocb, ret);
2930}
2931
2932static void nvme_copy_in_completed_cb(void *opaque, int ret)
2933{
2934 NvmeCopyAIOCB *iocb = opaque;
2935 NvmeRequest *req = iocb->req;
2936 NvmeNamespace *ns = req->ns;
2937 uint32_t nlb;
2938 uint64_t slba;
2939 uint16_t apptag, appmask;
2940 uint64_t reftag;
2941 size_t len;
2942 uint16_t status;
2943
2944 if (ret < 0) {
2945 iocb->ret = ret;
2946 goto out;
2947 } else if (iocb->ret < 0) {
2948 goto out;
2949 }
2950
2951 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2952 &nlb, &apptag, &appmask, &reftag);
2953 len = nvme_l2b(ns, nlb);
2954
2955 trace_pci_nvme_copy_out(iocb->slba, nlb);
2956
2957 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2958 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2959
2960 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2961 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2962
2963 size_t mlen = nvme_m2b(ns, nlb);
2964 uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2965
2966 status = nvme_dif_mangle_mdata(ns, mbounce, mlen, slba);
2967 if (status) {
2968 goto invalid;
2969 }
2970 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2971 slba, apptag, appmask, &reftag);
2972 if (status) {
2973 goto invalid;
2974 }
2975
2976 apptag = le16_to_cpu(copy->apptag);
2977 appmask = le16_to_cpu(copy->appmask);
2978
2979 if (prinfow & NVME_PRINFO_PRACT) {
2980 status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2981 if (status) {
2982 goto invalid;
2983 }
2984
2985 nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2986 apptag, &iocb->reftag);
2987 } else {
2988 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2989 prinfow, iocb->slba, apptag, appmask,
2990 &iocb->reftag);
2991 if (status) {
2992 goto invalid;
2993 }
2994 }
2995 }
2996
2997 status = nvme_check_bounds(ns, iocb->slba, nlb);
2998 if (status) {
2999 goto invalid;
3000 }
3001
3002 if (ns->params.zoned) {
3003 status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
3004 if (status) {
3005 goto invalid;
3006 }
3007
3008 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3009 iocb->zone->w_ptr += nlb;
3010 }
3011 }
3012
3013 qemu_iovec_reset(&iocb->iov);
3014 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3015
3016 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
3017 &iocb->iov, 0, nvme_copy_out_cb, iocb);
3018
3019 return;
3020
3021invalid:
3022 req->status = status;
3023 iocb->ret = -1;
3024out:
3025 nvme_do_copy(iocb);
3026}
3027
3028static void nvme_copy_in_cb(void *opaque, int ret)
3029{
3030 NvmeCopyAIOCB *iocb = opaque;
3031 NvmeRequest *req = iocb->req;
3032 NvmeNamespace *ns = req->ns;
3033 uint64_t slba;
3034 uint32_t nlb;
3035
3036 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
3037 goto out;
3038 }
3039
3040 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3041 &nlb, NULL, NULL, NULL);
3042
3043 qemu_iovec_reset(&iocb->iov);
3044 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
3045 nvme_m2b(ns, nlb));
3046
3047 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
3048 &iocb->iov, 0, nvme_copy_in_completed_cb,
3049 iocb);
3050 return;
3051
3052out:
3053 nvme_copy_in_completed_cb(iocb, ret);
3054}
3055
3056static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3057{
3058 NvmeRequest *req = iocb->req;
3059 NvmeNamespace *ns = req->ns;
3060 uint64_t slba;
3061 uint32_t nlb;
3062 size_t len;
3063 uint16_t status;
3064
3065 if (iocb->ret < 0) {
3066 goto done;
3067 }
3068
3069 if (iocb->idx == iocb->nr) {
3070 goto done;
3071 }
3072
3073 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3074 &nlb, NULL, NULL, NULL);
3075 len = nvme_l2b(ns, nlb);
3076
3077 trace_pci_nvme_copy_source_range(slba, nlb);
3078
3079 if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
3080 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3081 goto invalid;
3082 }
3083
3084 status = nvme_check_bounds(ns, slba, nlb);
3085 if (status) {
3086 goto invalid;
3087 }
3088
3089 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3090 status = nvme_check_dulbe(ns, slba, nlb);
3091 if (status) {
3092 goto invalid;
3093 }
3094 }
3095
3096 if (ns->params.zoned) {
3097 status = nvme_check_zone_read(ns, slba, nlb);
3098 if (status) {
3099 goto invalid;
3100 }
3101 }
3102
3103 qemu_iovec_reset(&iocb->iov);
3104 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3105
3106 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
3107 &iocb->iov, 0, nvme_copy_in_cb, iocb);
3108 return;
3109
3110invalid:
3111 req->status = status;
3112 iocb->ret = -1;
3113done:
3114 nvme_copy_done(iocb);
3115}
3116
3117static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3118{
3119 NvmeNamespace *ns = req->ns;
3120 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3121 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3122 nvme_misc_cb, req);
3123 uint16_t nr = copy->nr + 1;
3124 uint8_t format = copy->control[0] & 0xf;
3125 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3126 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3127 size_t len = sizeof(NvmeCopySourceRangeFormat0);
3128
3129 uint16_t status;
3130
3131 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3132
3133 iocb->ranges = NULL;
3134 iocb->zone = NULL;
3135
3136 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
3137 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3138 status = NVME_INVALID_FIELD | NVME_DNR;
3139 goto invalid;
3140 }
3141
    if (!(le16_to_cpu(n->id_ctrl.ocfs) & (1 << format))) {
3143 trace_pci_nvme_err_copy_invalid_format(format);
3144 status = NVME_INVALID_FIELD | NVME_DNR;
3145 goto invalid;
3146 }
3147
3148 if (nr > ns->id_ns.msrc + 1) {
3149 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3150 goto invalid;
3151 }
3152
3153 if ((ns->pif == 0x0 && format != 0x0) ||
3154 (ns->pif != 0x0 && format != 0x1)) {
3155 status = NVME_INVALID_FORMAT | NVME_DNR;
3156 goto invalid;
3157 }
3158
3159 if (ns->pif) {
3160 len = sizeof(NvmeCopySourceRangeFormat1);
3161 }
3162
3163 iocb->format = format;
3164 iocb->ranges = g_malloc_n(nr, len);
3165 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3166 if (status) {
3167 goto invalid;
3168 }
3169
3170 iocb->slba = le64_to_cpu(copy->sdlba);
3171
3172 if (ns->params.zoned) {
3173 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3174 if (!iocb->zone) {
3175 status = NVME_LBA_RANGE | NVME_DNR;
3176 goto invalid;
3177 }
3178
3179 status = nvme_zrm_auto(n, ns, iocb->zone);
3180 if (status) {
3181 goto invalid;
3182 }
3183 }
3184
3185 status = nvme_check_copy_mcl(ns, iocb, nr);
3186 if (status) {
3187 goto invalid;
3188 }
3189
3190 iocb->req = req;
3191 iocb->ret = 0;
3192 iocb->nr = nr;
3193 iocb->idx = 0;
3194 iocb->reftag = le32_to_cpu(copy->reftag);
3195 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3196 iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
3197 ns->lbasz + ns->lbaf.ms);
3198
3199 qemu_iovec_init(&iocb->iov, 1);
3200
3201 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
3202 BLOCK_ACCT_READ);
3203 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
3204 BLOCK_ACCT_WRITE);
3205
3206 req->aiocb = &iocb->common;
3207 nvme_do_copy(iocb);
3208
3209 return NVME_NO_COMPLETE;
3210
3211invalid:
3212 g_free(iocb->ranges);
3213 qemu_aio_unref(iocb);
3214 return status;
3215}
3216
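/*
 * Compare reads the target LBAs into a bounce buffer, fetches the host
 * data in the callbacks above, and fails the command with Compare Failure
 * status on any mismatch.
 */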
3217static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3218{
3219 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3220 NvmeNamespace *ns = req->ns;
3221 BlockBackend *blk = ns->blkconf.blk;
3222 uint64_t slba = le64_to_cpu(rw->slba);
3223 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3224 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3225 size_t data_len = nvme_l2b(ns, nlb);
3226 size_t len = data_len;
3227 int64_t offset = nvme_l2b(ns, slba);
3228 struct nvme_compare_ctx *ctx = NULL;
3229 uint16_t status;
3230
3231 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3232
3233 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3234 return NVME_INVALID_PROT_INFO | NVME_DNR;
3235 }
3236
3237 if (nvme_ns_ext(ns)) {
3238 len += nvme_m2b(ns, nlb);
3239 }
3240
3241 status = nvme_check_mdts(n, len);
3242 if (status) {
3243 return status;
3244 }
3245
3246 status = nvme_check_bounds(ns, slba, nlb);
3247 if (status) {
3248 return status;
3249 }
3250
3251 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3252 status = nvme_check_dulbe(ns, slba, nlb);
3253 if (status) {
3254 return status;
3255 }
3256 }
3257
3258 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3259 if (status) {
3260 return status;
3261 }
3262
3263 ctx = g_new(struct nvme_compare_ctx, 1);
3264 ctx->data.bounce = g_malloc(data_len);
3265
3266 req->opaque = ctx;
3267
3268 qemu_iovec_init(&ctx->data.iov, 1);
3269 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3270
3271 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3272 BLOCK_ACCT_READ);
3273 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3274 nvme_compare_data_cb, req);
3275
3276 return NVME_NO_COMPLETE;
3277}
3278
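/*
 * Flush AIOCB: supports the broadcast NSID by walking all attached
 * namespaces and flushing each backing block device in turn.
 */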
3279typedef struct NvmeFlushAIOCB {
3280 BlockAIOCB common;
3281 BlockAIOCB *aiocb;
3282 NvmeRequest *req;
3283 int ret;
3284
3285 NvmeNamespace *ns;
3286 uint32_t nsid;
3287 bool broadcast;
3288} NvmeFlushAIOCB;
3289
3290static void nvme_flush_cancel(BlockAIOCB *acb)
3291{
3292 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3293
3294 iocb->ret = -ECANCELED;
3295
3296 if (iocb->aiocb) {
3297 blk_aio_cancel_async(iocb->aiocb);
3298 iocb->aiocb = NULL;
3299 }
3300}
3301
3302static const AIOCBInfo nvme_flush_aiocb_info = {
3303 .aiocb_size = sizeof(NvmeFlushAIOCB),
3304 .cancel_async = nvme_flush_cancel,
3305 .get_aio_context = nvme_get_aio_context,
3306};
3307
3308static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3309
3310static void nvme_flush_ns_cb(void *opaque, int ret)
3311{
3312 NvmeFlushAIOCB *iocb = opaque;
3313 NvmeNamespace *ns = iocb->ns;
3314
3315 if (ret < 0) {
3316 iocb->ret = ret;
3317 goto out;
3318 } else if (iocb->ret < 0) {
3319 goto out;
3320 }
3321
3322 if (ns) {
3323 trace_pci_nvme_flush_ns(iocb->nsid);
3324
3325 iocb->ns = NULL;
3326 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3327 return;
3328 }
3329
3330out:
3331 nvme_do_flush(iocb);
3332}
3333
3334static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3335{
3336 NvmeRequest *req = iocb->req;
3337 NvmeCtrl *n = nvme_ctrl(req);
3338 int i;
3339
3340 if (iocb->ret < 0) {
3341 goto done;
3342 }
3343
3344 if (iocb->broadcast) {
3345 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3346 iocb->ns = nvme_ns(n, i);
3347 if (iocb->ns) {
3348 iocb->nsid = i;
3349 break;
3350 }
3351 }
3352 }
3353
3354 if (!iocb->ns) {
3355 goto done;
3356 }
3357
3358 nvme_flush_ns_cb(iocb, 0);
3359 return;
3360
3361done:
3362 iocb->common.cb(iocb->common.opaque, iocb->ret);
3363 qemu_aio_unref(iocb);
3364}
3365
3366static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3367{
3368 NvmeFlushAIOCB *iocb;
3369 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3370 uint16_t status;
3371
3372 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3373
3374 iocb->req = req;
3375 iocb->ret = 0;
3376 iocb->ns = NULL;
3377 iocb->nsid = 0;
3378 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3379
3380 if (!iocb->broadcast) {
3381 if (!nvme_nsid_valid(n, nsid)) {
3382 status = NVME_INVALID_NSID | NVME_DNR;
3383 goto out;
3384 }
3385
3386 iocb->ns = nvme_ns(n, nsid);
3387 if (!iocb->ns) {
3388 status = NVME_INVALID_FIELD | NVME_DNR;
3389 goto out;
3390 }
3391
3392 iocb->nsid = nsid;
3393 }
3394
3395 req->aiocb = &iocb->common;
3396 nvme_do_flush(iocb);
3397
3398 return NVME_NO_COMPLETE;
3399
3400out:
3401 qemu_aio_unref(iocb);
3402
3403 return status;
3404}
3405
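/*
 * Read path: after the MDTS, bounds, zone and DULBE checks, requests on
 * protected namespaces take the DIF-aware path (nvme_dif_rw); otherwise
 * the transfer is mapped and submitted directly to the backing device.
 */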
3406static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3407{
3408 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3409 NvmeNamespace *ns = req->ns;
3410 uint64_t slba = le64_to_cpu(rw->slba);
3411 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3412 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3413 uint64_t data_size = nvme_l2b(ns, nlb);
3414 uint64_t mapped_size = data_size;
3415 uint64_t data_offset;
3416 BlockBackend *blk = ns->blkconf.blk;
3417 uint16_t status;
3418
3419 if (nvme_ns_ext(ns)) {
3420 mapped_size += nvme_m2b(ns, nlb);
3421
3422 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3423 bool pract = prinfo & NVME_PRINFO_PRACT;
3424
3425 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3426 mapped_size = data_size;
3427 }
3428 }
3429 }
3430
3431 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3432
3433 status = nvme_check_mdts(n, mapped_size);
3434 if (status) {
3435 goto invalid;
3436 }
3437
3438 status = nvme_check_bounds(ns, slba, nlb);
3439 if (status) {
3440 goto invalid;
3441 }
3442
3443 if (ns->params.zoned) {
3444 status = nvme_check_zone_read(ns, slba, nlb);
3445 if (status) {
3446 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3447 goto invalid;
3448 }
3449 }
3450
3451 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3452 status = nvme_check_dulbe(ns, slba, nlb);
3453 if (status) {
3454 goto invalid;
3455 }
3456 }
3457
3458 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3459 return nvme_dif_rw(n, req);
3460 }
3461
3462 status = nvme_map_data(n, nlb, req);
3463 if (status) {
3464 goto invalid;
3465 }
3466
3467 data_offset = nvme_l2b(ns, slba);
3468
3469 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3470 BLOCK_ACCT_READ);
3471 nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3472 return NVME_NO_COMPLETE;
3473
3474invalid:
3475 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3476 return status | NVME_DNR;
3477}
3478
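/*
 * Update Flexible Data Placement state for a write: account the host and
 * media bytes written and consume reclaim unit capacity, moving the
 * placement handle to a fresh reclaim unit whenever the current one fills.
 */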
3479static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3480 uint32_t nlb)
3481{
3482 NvmeNamespace *ns = req->ns;
3483 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3484 uint64_t data_size = nvme_l2b(ns, nlb);
3485 uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3486 uint8_t dtype = (dw12 >> 20) & 0xf;
3487 uint16_t pid = le16_to_cpu(rw->dspec);
3488 uint16_t ph, rg, ruhid;
3489 NvmeReclaimUnit *ru;
3490
3491 if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3492 !nvme_parse_pid(ns, pid, &ph, &rg)) {
3493 ph = 0;
3494 rg = 0;
3495 }
3496
3497 ruhid = ns->fdp.phs[ph];
3498 ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3499
3500 nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3501 nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3502
3503 while (nlb) {
3504 if (nlb < ru->ruamw) {
3505 ru->ruamw -= nlb;
3506 break;
3507 }
3508
3509 nlb -= ru->ruamw;
3510 nvme_update_ruh(n, ns, pid);
3511 }
3512}
3513
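/*
 * Common implementation of Write, Write Zeroes (wrz) and Zone Append
 * (append). Zone Append rewrites the SLBA to the current zone write
 * pointer and, depending on the protection type, may remap the initial
 * reference tag.
 */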
3514static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3515 bool wrz)
3516{
3517 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3518 NvmeNamespace *ns = req->ns;
3519 uint64_t slba = le64_to_cpu(rw->slba);
3520 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3521 uint16_t ctrl = le16_to_cpu(rw->control);
3522 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3523 uint64_t data_size = nvme_l2b(ns, nlb);
3524 uint64_t mapped_size = data_size;
3525 uint64_t data_offset;
3526 NvmeZone *zone;
3527 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3528 BlockBackend *blk = ns->blkconf.blk;
3529 uint16_t status;
3530
3531 if (nvme_ns_ext(ns)) {
3532 mapped_size += nvme_m2b(ns, nlb);
3533
3534 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3535 bool pract = prinfo & NVME_PRINFO_PRACT;
3536
3537 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3538 mapped_size -= nvme_m2b(ns, nlb);
3539 }
3540 }
3541 }
3542
3543 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3544 nvme_nsid(ns), nlb, mapped_size, slba);
3545
3546 if (!wrz) {
3547 status = nvme_check_mdts(n, mapped_size);
3548 if (status) {
3549 goto invalid;
3550 }
3551 }
3552
3553 status = nvme_check_bounds(ns, slba, nlb);
3554 if (status) {
3555 goto invalid;
3556 }
3557
3558 if (ns->params.zoned) {
3559 zone = nvme_get_zone_by_slba(ns, slba);
3560 assert(zone);
3561
3562 if (append) {
3563 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3564
3565 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3566 return NVME_INVALID_ZONE_OP | NVME_DNR;
3567 }
3568
3569 if (unlikely(slba != zone->d.zslba)) {
3570 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3571 status = NVME_INVALID_FIELD;
3572 goto invalid;
3573 }
3574
3575 if (n->params.zasl &&
3576 data_size > (uint64_t)n->page_size << n->params.zasl) {
3577 trace_pci_nvme_err_zasl(data_size);
3578 return NVME_INVALID_FIELD | NVME_DNR;
3579 }
3580
3581 slba = zone->w_ptr;
3582 rw->slba = cpu_to_le64(slba);
3583 res->slba = cpu_to_le64(slba);
3584
3585 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3586 case NVME_ID_NS_DPS_TYPE_1:
3587 if (!piremap) {
3588 return NVME_INVALID_PROT_INFO | NVME_DNR;
3589 }
3590
3591
3592
3593 case NVME_ID_NS_DPS_TYPE_2:
3594 if (piremap) {
3595 uint32_t reftag = le32_to_cpu(rw->reftag);
3596 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3597 }
3598
3599 break;
3600
3601 case NVME_ID_NS_DPS_TYPE_3:
3602 if (piremap) {
3603 return NVME_INVALID_PROT_INFO | NVME_DNR;
3604 }
3605
3606 break;
3607 }
3608 }
3609
3610 status = nvme_check_zone_write(ns, zone, slba, nlb);
3611 if (status) {
3612 goto invalid;
3613 }
3614
3615 status = nvme_zrm_auto(n, ns, zone);
3616 if (status) {
3617 goto invalid;
3618 }
3619
3620 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3621 zone->w_ptr += nlb;
3622 }
3623 } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3624 nvme_do_write_fdp(n, req, slba, nlb);
3625 }
3626
3627 data_offset = nvme_l2b(ns, slba);
3628
3629 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3630 return nvme_dif_rw(n, req);
3631 }
3632
3633 if (!wrz) {
3634 status = nvme_map_data(n, nlb, req);
3635 if (status) {
3636 goto invalid;
3637 }
3638
3639 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3640 BLOCK_ACCT_WRITE);
3641 nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3642 } else {
3643 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3644 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3645 req);
3646 }
3647
3648 return NVME_NO_COMPLETE;
3649
3650invalid:
3651 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3652 return status | NVME_DNR;
3653}
3654
3655static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3656{
3657 return nvme_do_write(n, req, false, false);
3658}
3659
3660static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3661{
3662 return nvme_do_write(n, req, false, true);
3663}
3664
3665static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3666{
3667 return nvme_do_write(n, req, true, false);
3668}
3669
3670static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3671 uint64_t *slba, uint32_t *zone_idx)
3672{
3673 uint32_t dw10 = le32_to_cpu(c->cdw10);
3674 uint32_t dw11 = le32_to_cpu(c->cdw11);
3675
3676 if (!ns->params.zoned) {
3677 trace_pci_nvme_err_invalid_opc(c->opcode);
3678 return NVME_INVALID_OPCODE | NVME_DNR;
3679 }
3680
3681 *slba = ((uint64_t)dw11) << 32 | dw10;
3682 if (unlikely(*slba >= ns->id_ns.nsze)) {
3683 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3684 *slba = 0;
3685 return NVME_LBA_RANGE | NVME_DNR;
3686 }
3687
3688 *zone_idx = nvme_zone_idx(ns, *slba);
3689 assert(*zone_idx < ns->num_zones);
3690
3691 return NVME_SUCCESS;
3692}
3693
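/*
 * Zone Management Send actions are dispatched through op_handler_t
 * callbacks; when Select All is set, the processing mask selects which
 * zone states the action applies to.
 */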
3694typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3695 NvmeRequest *);
3696
3697enum NvmeZoneProcessingMask {
3698 NVME_PROC_CURRENT_ZONE = 0,
3699 NVME_PROC_OPENED_ZONES = 1 << 0,
3700 NVME_PROC_CLOSED_ZONES = 1 << 1,
3701 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3702 NVME_PROC_FULL_ZONES = 1 << 3,
3703};
3704
3705static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3706 NvmeZoneState state, NvmeRequest *req)
3707{
3708 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3709 int flags = 0;
3710
3711 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3712 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3713
3714 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3715 return NVME_INVALID_ZONE_OP | NVME_DNR;
3716 }
3717
3718 if (zone->w_ptr % ns->zns.zrwafg) {
3719 return NVME_NOZRWA | NVME_DNR;
3720 }
3721
3722 flags = NVME_ZRM_ZRWA;
3723 }
3724
3725 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3726}
3727
3728static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3729 NvmeZoneState state, NvmeRequest *req)
3730{
3731 return nvme_zrm_close(ns, zone);
3732}
3733
3734static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3735 NvmeZoneState state, NvmeRequest *req)
3736{
3737 return nvme_zrm_finish(ns, zone);
3738}
3739
3740static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3741 NvmeZoneState state, NvmeRequest *req)
3742{
3743 switch (state) {
3744 case NVME_ZONE_STATE_READ_ONLY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
        /* fall through */
3747 case NVME_ZONE_STATE_OFFLINE:
3748 return NVME_SUCCESS;
3749 default:
3750 return NVME_ZONE_INVAL_TRANSITION;
3751 }
3752}
3753
3754static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3755{
3756 uint16_t status;
3757 uint8_t state = nvme_get_zone_state(zone);
3758
3759 if (state == NVME_ZONE_STATE_EMPTY) {
3760 status = nvme_aor_check(ns, 1, 0);
3761 if (status) {
3762 return status;
3763 }
3764 nvme_aor_inc_active(ns);
3765 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3766 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3767 return NVME_SUCCESS;
3768 }
3769
3770 return NVME_ZONE_INVAL_TRANSITION;
3771}
3772
3773static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3774 enum NvmeZoneProcessingMask proc_mask,
3775 op_handler_t op_hndlr, NvmeRequest *req)
3776{
3777 uint16_t status = NVME_SUCCESS;
3778 NvmeZoneState zs = nvme_get_zone_state(zone);
3779 bool proc_zone;
3780
3781 switch (zs) {
3782 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3783 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3784 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3785 break;
3786 case NVME_ZONE_STATE_CLOSED:
3787 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3788 break;
3789 case NVME_ZONE_STATE_READ_ONLY:
3790 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3791 break;
3792 case NVME_ZONE_STATE_FULL:
3793 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3794 break;
3795 default:
3796 proc_zone = false;
3797 }
3798
3799 if (proc_zone) {
3800 status = op_hndlr(ns, zone, zs, req);
3801 }
3802
3803 return status;
3804}
3805
3806static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3807 enum NvmeZoneProcessingMask proc_mask,
3808 op_handler_t op_hndlr, NvmeRequest *req)
3809{
3810 NvmeZone *next;
3811 uint16_t status = NVME_SUCCESS;
3812 int i;
3813
3814 if (!proc_mask) {
3815 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3816 } else {
3817 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3818 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3819 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3820 req);
3821 if (status && status != NVME_NO_COMPLETE) {
3822 goto out;
3823 }
3824 }
3825 }
3826 if (proc_mask & NVME_PROC_OPENED_ZONES) {
3827 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3828 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3829 req);
3830 if (status && status != NVME_NO_COMPLETE) {
3831 goto out;
3832 }
3833 }
3834
3835 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3836 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3837 req);
3838 if (status && status != NVME_NO_COMPLETE) {
3839 goto out;
3840 }
3841 }
3842 }
3843 if (proc_mask & NVME_PROC_FULL_ZONES) {
3844 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3845 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3846 req);
3847 if (status && status != NVME_NO_COMPLETE) {
3848 goto out;
3849 }
3850 }
3851 }
3852
3853 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3854 for (i = 0; i < ns->num_zones; i++, zone++) {
3855 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3856 req);
3857 if (status && status != NVME_NO_COMPLETE) {
3858 goto out;
3859 }
3860 }
3861 }
3862 }
3863
3864out:
3865 return status;
3866}
3867
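/*
 * Zone reset AIOCB: iterates over the selected zones, zeroing the data
 * (and, in the epilogue callback, the metadata) of each zone before
 * resetting its state.
 */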
3868typedef struct NvmeZoneResetAIOCB {
3869 BlockAIOCB common;
3870 BlockAIOCB *aiocb;
3871 NvmeRequest *req;
3872 int ret;
3873
3874 bool all;
3875 int idx;
3876 NvmeZone *zone;
3877} NvmeZoneResetAIOCB;
3878
3879static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3880{
3881 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3882 NvmeRequest *req = iocb->req;
3883 NvmeNamespace *ns = req->ns;
3884
3885 iocb->idx = ns->num_zones;
3886
3887 iocb->ret = -ECANCELED;
3888
3889 if (iocb->aiocb) {
3890 blk_aio_cancel_async(iocb->aiocb);
3891 iocb->aiocb = NULL;
3892 }
3893}
3894
3895static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3896 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3897 .cancel_async = nvme_zone_reset_cancel,
3898};
3899
3900static void nvme_zone_reset_cb(void *opaque, int ret);
3901
3902static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3903{
3904 NvmeZoneResetAIOCB *iocb = opaque;
3905 NvmeRequest *req = iocb->req;
3906 NvmeNamespace *ns = req->ns;
3907 int64_t moff;
3908 int count;
3909
3910 if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
3911 goto out;
3912 }
3913
3914 moff = nvme_moff(ns, iocb->zone->d.zslba);
3915 count = nvme_m2b(ns, ns->zone_size);
3916
3917 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3918 BDRV_REQ_MAY_UNMAP,
3919 nvme_zone_reset_cb, iocb);
3920 return;
3921
3922out:
3923 nvme_zone_reset_cb(iocb, ret);
3924}
3925
3926static void nvme_zone_reset_cb(void *opaque, int ret)
3927{
3928 NvmeZoneResetAIOCB *iocb = opaque;
3929 NvmeRequest *req = iocb->req;
3930 NvmeNamespace *ns = req->ns;
3931
3932 if (iocb->ret < 0) {
3933 goto done;
3934 } else if (ret < 0) {
3935 iocb->ret = ret;
3936 goto done;
3937 }
3938
3939 if (iocb->zone) {
3940 nvme_zrm_reset(ns, iocb->zone);
3941
3942 if (!iocb->all) {
3943 goto done;
3944 }
3945 }
3946
3947 while (iocb->idx < ns->num_zones) {
3948 NvmeZone *zone = &ns->zone_array[iocb->idx++];
3949
3950 switch (nvme_get_zone_state(zone)) {
3951 case NVME_ZONE_STATE_EMPTY:
3952 if (!iocb->all) {
3953 goto done;
3954 }
3955
3956 continue;
3957
3958 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3959 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3960 case NVME_ZONE_STATE_CLOSED:
3961 case NVME_ZONE_STATE_FULL:
3962 iocb->zone = zone;
3963 break;
3964
3965 default:
3966 continue;
3967 }
3968
3969 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3970
3971 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3972 nvme_l2b(ns, zone->d.zslba),
3973 nvme_l2b(ns, ns->zone_size),
3974 BDRV_REQ_MAY_UNMAP,
3975 nvme_zone_reset_epilogue_cb,
3976 iocb);
3977 return;
3978 }
3979
3980done:
3981 iocb->aiocb = NULL;
3982
3983 iocb->common.cb(iocb->common.opaque, iocb->ret);
3984 qemu_aio_unref(iocb);
3985}
3986
3987static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
3988 uint64_t elba, NvmeRequest *req)
3989{
3990 NvmeNamespace *ns = req->ns;
3991 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3992 uint64_t wp = zone->d.wp;
3993 uint32_t nlb = elba - wp + 1;
3994 uint16_t status;
3995
3997 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3998 return NVME_INVALID_ZONE_OP | NVME_DNR;
3999 }
4000
4001 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4002 return NVME_INVALID_FIELD | NVME_DNR;
4003 }
4004
4005 if (elba < wp || elba > wp + ns->zns.zrwas) {
4006 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4007 }
4008
4009 if (nlb % ns->zns.zrwafg) {
4010 return NVME_INVALID_FIELD | NVME_DNR;
4011 }
4012
4013 status = nvme_zrm_auto(n, ns, zone);
4014 if (status) {
4015 return status;
4016 }
4017
4018 zone->w_ptr += nlb;
4019
4020 nvme_advance_zone_wp(ns, zone, nlb);
4021
4022 return NVME_SUCCESS;
4023}
4024
4025static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4026{
4027 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4028 NvmeNamespace *ns = req->ns;
4029 NvmeZone *zone;
4030 NvmeZoneResetAIOCB *iocb;
4031 uint8_t *zd_ext;
4032 uint64_t slba = 0;
4033 uint32_t zone_idx = 0;
4034 uint16_t status;
4035 uint8_t action = cmd->zsa;
4036 bool all;
4037 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4038
4039 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4040
4041 req->status = NVME_SUCCESS;
4042
4043 if (!all) {
4044 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4045 if (status) {
4046 return status;
4047 }
4048 }
4049
4050 zone = &ns->zone_array[zone_idx];
4051 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4052 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4053 return NVME_INVALID_FIELD | NVME_DNR;
4054 }
4055
4056 switch (action) {
4057
4058 case NVME_ZONE_ACTION_OPEN:
4059 if (all) {
4060 proc_mask = NVME_PROC_CLOSED_ZONES;
4061 }
4062 trace_pci_nvme_open_zone(slba, zone_idx, all);
4063 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4064 break;
4065
4066 case NVME_ZONE_ACTION_CLOSE:
4067 if (all) {
4068 proc_mask = NVME_PROC_OPENED_ZONES;
4069 }
4070 trace_pci_nvme_close_zone(slba, zone_idx, all);
4071 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4072 break;
4073
4074 case NVME_ZONE_ACTION_FINISH:
4075 if (all) {
4076 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4077 }
4078 trace_pci_nvme_finish_zone(slba, zone_idx, all);
4079 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4080 break;
4081
4082 case NVME_ZONE_ACTION_RESET:
4083 trace_pci_nvme_reset_zone(slba, zone_idx, all);
4084
4085 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4086 nvme_misc_cb, req);
4087
4088 iocb->req = req;
4089 iocb->ret = 0;
4090 iocb->all = all;
4091 iocb->idx = zone_idx;
4092 iocb->zone = NULL;
4093
4094 req->aiocb = &iocb->common;
4095 nvme_zone_reset_cb(iocb, 0);
4096
4097 return NVME_NO_COMPLETE;
4098
4099 case NVME_ZONE_ACTION_OFFLINE:
4100 if (all) {
4101 proc_mask = NVME_PROC_READ_ONLY_ZONES;
4102 }
4103 trace_pci_nvme_offline_zone(slba, zone_idx, all);
4104 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4105 break;
4106
4107 case NVME_ZONE_ACTION_SET_ZD_EXT:
4108 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4109 if (all || !ns->params.zd_extension_size) {
4110 return NVME_INVALID_FIELD | NVME_DNR;
4111 }
4112 zd_ext = nvme_get_zd_extension(ns, zone_idx);
4113 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4114 if (status) {
4115 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4116 return status;
4117 }
4118
4119 status = nvme_set_zd_ext(ns, zone);
4120 if (status == NVME_SUCCESS) {
4121 trace_pci_nvme_zd_extension_set(zone_idx);
4122 return status;
4123 }
4124 break;
4125
4126 case NVME_ZONE_ACTION_ZRWA_FLUSH:
4127 if (all) {
4128 return NVME_INVALID_FIELD | NVME_DNR;
4129 }
4130
4131 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4132
4133 default:
4134 trace_pci_nvme_err_invalid_mgmt_action(action);
4135 status = NVME_INVALID_FIELD;
4136 }
4137
4138 if (status == NVME_ZONE_INVAL_TRANSITION) {
4139 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4140 zone->d.za);
4141 }
4142 if (status) {
4143 status |= NVME_DNR;
4144 }
4145
4146 return status;
4147}
4148
4149static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4150{
4151 NvmeZoneState zs = nvme_get_zone_state(zl);
4152
4153 switch (zafs) {
4154 case NVME_ZONE_REPORT_ALL:
4155 return true;
4156 case NVME_ZONE_REPORT_EMPTY:
4157 return zs == NVME_ZONE_STATE_EMPTY;
4158 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4159 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4160 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4161 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4162 case NVME_ZONE_REPORT_CLOSED:
4163 return zs == NVME_ZONE_STATE_CLOSED;
4164 case NVME_ZONE_REPORT_FULL:
4165 return zs == NVME_ZONE_STATE_FULL;
4166 case NVME_ZONE_REPORT_READ_ONLY:
4167 return zs == NVME_ZONE_STATE_READ_ONLY;
4168 case NVME_ZONE_REPORT_OFFLINE:
4169 return zs == NVME_ZONE_STATE_OFFLINE;
4170 default:
4171 return false;
4172 }
4173}
4174
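/*
 * Zone Management Receive: build a zone report, optionally including zone
 * descriptor extensions, filtered by the zone state encoded in ZRASF.
 */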
4175static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4176{
4177 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
4178 NvmeNamespace *ns = req->ns;
4179
4180 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4181 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4182 uint32_t zone_idx, zra, zrasf, partial;
4183 uint64_t max_zones, nr_zones = 0;
4184 uint16_t status;
4185 uint64_t slba;
4186 NvmeZoneDescr *z;
4187 NvmeZone *zone;
4188 NvmeZoneReportHeader *header;
4189 void *buf, *buf_p;
4190 size_t zone_entry_sz;
4191 int i;
4192
4193 req->status = NVME_SUCCESS;
4194
4195 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4196 if (status) {
4197 return status;
4198 }
4199
4200 zra = dw13 & 0xff;
4201 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4202 return NVME_INVALID_FIELD | NVME_DNR;
4203 }
4204 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4205 return NVME_INVALID_FIELD | NVME_DNR;
4206 }
4207
4208 zrasf = (dw13 >> 8) & 0xff;
4209 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4210 return NVME_INVALID_FIELD | NVME_DNR;
4211 }
4212
4213 if (data_size < sizeof(NvmeZoneReportHeader)) {
4214 return NVME_INVALID_FIELD | NVME_DNR;
4215 }
4216
4217 status = nvme_check_mdts(n, data_size);
4218 if (status) {
4219 return status;
4220 }
4221
4222 partial = (dw13 >> 16) & 0x01;
4223
4224 zone_entry_sz = sizeof(NvmeZoneDescr);
4225 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4226 zone_entry_sz += ns->params.zd_extension_size;
4227 }
4228
4229 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4230 buf = g_malloc0(data_size);
4231
4232 zone = &ns->zone_array[zone_idx];
4233 for (i = zone_idx; i < ns->num_zones; i++) {
4234 if (partial && nr_zones >= max_zones) {
4235 break;
4236 }
4237 if (nvme_zone_matches_filter(zrasf, zone++)) {
4238 nr_zones++;
4239 }
4240 }
4241 header = buf;
4242 header->nr_zones = cpu_to_le64(nr_zones);
4243
4244 buf_p = buf + sizeof(NvmeZoneReportHeader);
4245 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4246 zone = &ns->zone_array[zone_idx];
4247 if (nvme_zone_matches_filter(zrasf, zone)) {
4248 z = buf_p;
4249 buf_p += sizeof(NvmeZoneDescr);
4250
4251 z->zt = zone->d.zt;
4252 z->zs = zone->d.zs;
4253 z->zcap = cpu_to_le64(zone->d.zcap);
4254 z->zslba = cpu_to_le64(zone->d.zslba);
4255 z->za = zone->d.za;
4256
4257 if (nvme_wp_is_valid(zone)) {
4258 z->wp = cpu_to_le64(zone->d.wp);
4259 } else {
4260 z->wp = cpu_to_le64(~0ULL);
4261 }
4262
4263 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4264 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4265 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4266 ns->params.zd_extension_size);
4267 }
4268 buf_p += ns->params.zd_extension_size;
4269 }
4270
4271 max_zones--;
4272 }
4273 }
4274
4275 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4276
4277 g_free(buf);
4278
4279 return status;
4280}
4281
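/*
 * I/O Management Receive, Reclaim Unit Handle Status: one descriptor is
 * returned per (placement handle, reclaim group) pair of the namespace.
 */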
4282static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4283 size_t len)
4284{
4285 NvmeNamespace *ns = req->ns;
4286 NvmeEnduranceGroup *endgrp;
4287 NvmeRuhStatus *hdr;
4288 NvmeRuhStatusDescr *ruhsd;
4289 unsigned int nruhsd;
4290 uint16_t rg, ph, *ruhid;
4291 size_t trans_len;
4292 g_autofree uint8_t *buf = NULL;
4293
4294 if (!n->subsys) {
4295 return NVME_INVALID_FIELD | NVME_DNR;
4296 }
4297
4298 if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4299 return NVME_INVALID_NSID | NVME_DNR;
4300 }
4301
4302 if (!n->subsys->endgrp.fdp.enabled) {
4303 return NVME_FDP_DISABLED | NVME_DNR;
4304 }
4305
4306 endgrp = ns->endgrp;
4307
4308 nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4309 trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4310 buf = g_malloc(trans_len);
4311
4312 trans_len = MIN(trans_len, len);
4313
4314 hdr = (NvmeRuhStatus *)buf;
4315 ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4316
4317 hdr->nruhsd = cpu_to_le16(nruhsd);
4318
4319 ruhid = ns->fdp.phs;
4320
4321 for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4322 NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4323
4324 for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4325 uint16_t pid = nvme_make_pid(ns, rg, ph);
4326
4327 ruhsd->pid = cpu_to_le16(pid);
4328 ruhsd->ruhid = *ruhid;
4329 ruhsd->earutr = 0;
4330 ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4331 }
4332 }
4333
4334 return nvme_c2h(n, buf, trans_len, req);
4335}
4336
4337static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4338{
4339 NvmeCmd *cmd = &req->cmd;
4340 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4341 uint32_t numd = le32_to_cpu(cmd->cdw11);
4342 uint8_t mo = (cdw10 & 0xff);
4343 size_t len = (numd + 1) << 2;
4344
4345 switch (mo) {
4346 case NVME_IOMR_MO_NOP:
4347 return 0;
4348 case NVME_IOMR_MO_RUH_STATUS:
4349 return nvme_io_mgmt_recv_ruhs(n, req, len);
4350 default:
4351 return NVME_INVALID_FIELD | NVME_DNR;
4352 };
4353}
4354
4355static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4356{
4357 NvmeCmd *cmd = &req->cmd;
4358 NvmeNamespace *ns = req->ns;
4359 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4360 uint16_t ret = NVME_SUCCESS;
4361 uint32_t npid = (cdw10 >> 1) + 1;
4362 unsigned int i = 0;
4363 g_autofree uint16_t *pids = NULL;
4364 uint32_t maxnpid;
4365
4366 if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4367 return NVME_FDP_DISABLED | NVME_DNR;
4368 }
4369
4370 maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4371
4372 if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4373 return NVME_INVALID_FIELD | NVME_DNR;
4374 }
4375
4376 pids = g_new(uint16_t, npid);
4377
4378 ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4379 if (ret) {
4380 return ret;
4381 }
4382
4383 for (; i < npid; i++) {
4384 if (!nvme_update_ruh(n, ns, pids[i])) {
4385 return NVME_INVALID_FIELD | NVME_DNR;
4386 }
4387 }
4388
4389 return ret;
4390}
4391
4392static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4393{
4394 NvmeCmd *cmd = &req->cmd;
4395 uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4396 uint8_t mo = (cdw10 & 0xff);
4397
4398 switch (mo) {
4399 case NVME_IOMS_MO_NOP:
4400 return 0;
4401 case NVME_IOMS_MO_RUH_UPDATE:
4402 return nvme_io_mgmt_send_ruh_update(n, req);
4403 default:
4404 return NVME_INVALID_FIELD | NVME_DNR;
4405 };
4406}
4407
4408static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4409{
4410 NvmeNamespace *ns;
4411 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4412
4413 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4414 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4415
4416 if (!nvme_nsid_valid(n, nsid)) {
4417 return NVME_INVALID_NSID | NVME_DNR;
4418 }
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4440 return nvme_flush(n, req);
4441 }
4442
4443 ns = nvme_ns(n, nsid);
4444 if (unlikely(!ns)) {
4445 return NVME_INVALID_FIELD | NVME_DNR;
4446 }
4447
4448 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4449 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4450 return NVME_INVALID_OPCODE | NVME_DNR;
4451 }
4452
4453 if (ns->status) {
4454 return ns->status;
4455 }
4456
4457 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4458 return NVME_INVALID_FIELD;
4459 }
4460
4461 req->ns = ns;
4462
4463 switch (req->cmd.opcode) {
4464 case NVME_CMD_WRITE_ZEROES:
4465 return nvme_write_zeroes(n, req);
4466 case NVME_CMD_ZONE_APPEND:
4467 return nvme_zone_append(n, req);
4468 case NVME_CMD_WRITE:
4469 return nvme_write(n, req);
4470 case NVME_CMD_READ:
4471 return nvme_read(n, req);
4472 case NVME_CMD_COMPARE:
4473 return nvme_compare(n, req);
4474 case NVME_CMD_DSM:
4475 return nvme_dsm(n, req);
4476 case NVME_CMD_VERIFY:
4477 return nvme_verify(n, req);
4478 case NVME_CMD_COPY:
4479 return nvme_copy(n, req);
4480 case NVME_CMD_ZONE_MGMT_SEND:
4481 return nvme_zone_mgmt_send(n, req);
4482 case NVME_CMD_ZONE_MGMT_RECV:
4483 return nvme_zone_mgmt_recv(n, req);
4484 case NVME_CMD_IO_MGMT_RECV:
4485 return nvme_io_mgmt_recv(n, req);
4486 case NVME_CMD_IO_MGMT_SEND:
4487 return nvme_io_mgmt_send(n, req);
4488 default:
4489 assert(false);
4490 }
4491
4492 return NVME_INVALID_OPCODE | NVME_DNR;
4493}
4494
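/*
 * With shadow doorbells, queue doorbell updates may arrive via ioeventfd
 * instead of MMIO; these notifiers kick completion and submission queue
 * processing.
 */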
4495static void nvme_cq_notifier(EventNotifier *e)
4496{
4497 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4498 NvmeCtrl *n = cq->ctrl;
4499
4500 if (!event_notifier_test_and_clear(e)) {
4501 return;
4502 }
4503
4504 nvme_update_cq_head(cq);
4505
4506 if (cq->tail == cq->head) {
4507 if (cq->irq_enabled) {
4508 n->cq_pending--;
4509 }
4510
4511 nvme_irq_deassert(n, cq);
4512 }
4513
4514 qemu_bh_schedule(cq->bh);
4515}
4516
4517static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4518{
4519 NvmeCtrl *n = cq->ctrl;
4520 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4521 int ret;
4522
4523 ret = event_notifier_init(&cq->notifier, 0);
4524 if (ret < 0) {
4525 return ret;
4526 }
4527
4528 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4529 memory_region_add_eventfd(&n->iomem,
4530 0x1000 + offset, 4, false, 0, &cq->notifier);
4531
4532 return 0;
4533}
4534
4535static void nvme_sq_notifier(EventNotifier *e)
4536{
4537 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4538
4539 if (!event_notifier_test_and_clear(e)) {
4540 return;
4541 }
4542
4543 nvme_process_sq(sq);
4544}
4545
4546static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4547{
4548 NvmeCtrl *n = sq->ctrl;
4549 uint16_t offset = sq->sqid << 3;
4550 int ret;
4551
4552 ret = event_notifier_init(&sq->notifier, 0);
4553 if (ret < 0) {
4554 return ret;
4555 }
4556
4557 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4558 memory_region_add_eventfd(&n->iomem,
4559 0x1000 + offset, 4, false, 0, &sq->notifier);
4560
4561 return 0;
4562}
4563
4564static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4565{
4566 uint16_t offset = sq->sqid << 3;
4567
4568 n->sq[sq->sqid] = NULL;
4569 qemu_bh_delete(sq->bh);
4570 if (sq->ioeventfd_enabled) {
4571 memory_region_del_eventfd(&n->iomem,
4572 0x1000 + offset, 4, false, 0, &sq->notifier);
4573 event_notifier_set_handler(&sq->notifier, NULL);
4574 event_notifier_cleanup(&sq->notifier);
4575 }
4576 g_free(sq->io_req);
4577 if (sq->sqid) {
4578 g_free(sq);
4579 }
4580}
4581
4582static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4583{
4584 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4585 NvmeRequest *r, *next;
4586 NvmeSQueue *sq;
4587 NvmeCQueue *cq;
4588 uint16_t qid = le16_to_cpu(c->qid);
4589
4590 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4591 trace_pci_nvme_err_invalid_del_sq(qid);
4592 return NVME_INVALID_QID | NVME_DNR;
4593 }
4594
4595 trace_pci_nvme_del_sq(qid);
4596
4597 sq = n->sq[qid];
4598 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4599 r = QTAILQ_FIRST(&sq->out_req_list);
4600 assert(r->aiocb);
4601 blk_aio_cancel(r->aiocb);
4602 }
4603
4604 assert(QTAILQ_EMPTY(&sq->out_req_list));
4605
4606 if (!nvme_check_cqid(n, sq->cqid)) {
4607 cq = n->cq[sq->cqid];
4608 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4609
4610 nvme_post_cqes(cq);
4611 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4612 if (r->sq == sq) {
4613 QTAILQ_REMOVE(&cq->req_list, r, entry);
4614 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4615 }
4616 }
4617 }
4618
4619 nvme_free_sq(sq, n);
4620 return NVME_SUCCESS;
4621}
4622
static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t sqid, uint16_t cqid, uint16_t size)
{
    int i;
    NvmeCQueue *cq;

    sq->ctrl = n;
    sq->dma_addr = dma_addr;
    sq->sqid = sqid;
    sq->size = size;
    sq->cqid = cqid;
    sq->head = sq->tail = 0;
    sq->io_req = g_new0(NvmeRequest, sq->size);

    QTAILQ_INIT(&sq->req_list);
    QTAILQ_INIT(&sq->out_req_list);
    for (i = 0; i < sq->size; i++) {
        sq->io_req[i].sq = sq;
        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
    }

    sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
                                 &DEVICE(sq->ctrl)->mem_reentrancy_guard);

    if (n->dbbuf_enabled) {
        sq->db_addr = n->dbbuf_dbs + (sqid << 3);
        sq->ei_addr = n->dbbuf_eis + (sqid << 3);

        if (n->params.ioeventfd && sq->sqid != 0) {
            if (!nvme_init_sq_ioeventfd(sq)) {
                sq->ioeventfd_enabled = true;
            }
        }
    }

    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
    n->sq[sqid] = sq;
}

static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeSQueue *sq;
    NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;

    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t sqid = le16_to_cpu(c->sqid);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->sq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);

    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
        trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
        trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
        trace_pci_nvme_err_invalid_create_sq_size(qsize);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (unlikely(prp1 & (n->page_size - 1))) {
        trace_pci_nvme_err_invalid_create_sq_addr(prp1);
        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
    }
    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
        trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    sq = g_malloc0(sizeof(*sq));
    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
    return NVME_SUCCESS;
}

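/*
 * SMART / Information log pages are synthesized from QEMU's block layer
 * accounting: bytes and ops per direction are accumulated across the
 * namespaces' BlockBackends and converted to the units the log pages
 * expect.
 */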
struct nvme_stats {
    uint64_t units_read;
    uint64_t units_written;
    uint64_t read_commands;
    uint64_t write_commands;
};

static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
{
    BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);

    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
    stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
    stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
}

static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    struct nvme_stats stats = { 0 };
    NvmeSmartLog smart = { 0 };
    uint32_t trans_len;
    NvmeNamespace *ns;
    time_t current_ms;
    uint64_t u_read, u_written;

    if (off >= sizeof(smart)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nsid != 0xffffffff) {
        ns = nvme_ns(n, nsid);
        if (!ns) {
            return NVME_INVALID_NSID | NVME_DNR;
        }
        nvme_set_blk_stats(ns, &stats);
    } else {
        int i;

        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }
            nvme_set_blk_stats(ns, &stats);
        }
    }

    trans_len = MIN(sizeof(smart) - off, buf_len);
    smart.critical_warning = n->smart_critical_warning;

    /* data units are reported in thousands of 512-byte units, rounded up */
    u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
    u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);

    smart.data_units_read[0] = cpu_to_le64(u_read);
    smart.data_units_written[0] = cpu_to_le64(u_written);
    smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
    smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);

    smart.temperature = cpu_to_le16(n->temperature);

    if ((n->temperature >= n->features.temp_thresh_hi) ||
        (n->temperature <= n->features.temp_thresh_low)) {
        smart.critical_warning |= NVME_SMART_TEMPERATURE;
    }

    current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    smart.power_on_hours[0] =
        cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_SMART);
    }

    return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
}

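/*
 * The Endurance Group Information log page is only implemented for the
 * single endurance group (id 1) of the subsystem; statistics are
 * aggregated over all namespaces in the subsystem.
 */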
static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                 uint64_t off, NvmeRequest *req)
{
    uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
    uint16_t endgrpid = (dw11 >> 16) & 0xffff;
    struct nvme_stats stats = {};
    NvmeEndGrpLog info = {};
    int i;

    if (!n->subsys || endgrpid != 0x1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (off >= sizeof(info)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
        if (!ns) {
            continue;
        }

        nvme_set_blk_stats(ns, &stats);
    }

    info.data_units_read[0] =
        cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
    info.data_units_written[0] =
        cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
    info.media_units_written[0] =
        cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));

    info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
    info.host_write_commands[0] = cpu_to_le64(stats.write_commands);

    buf_len = MIN(sizeof(info) - off, buf_len);

    return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
}

static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
                                 NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeFwSlotInfoLog fw_log = {
        .afi = 0x1,
    };

    if (off >= sizeof(fw_log)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
    trans_len = MIN(sizeof(fw_log) - off, buf_len);

    return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
}

static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeErrorLog errlog;

    if (off >= sizeof(errlog)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
    }

    /* the device does not record error log entries; return a zeroed log */
    memset(&errlog, 0x0, sizeof(errlog));
    trans_len = MIN(sizeof(errlog) - off, buf_len);

    return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
}

static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                    uint64_t off, NvmeRequest *req)
{
    uint32_t nslist[1024];
    uint32_t trans_len;
    int i = 0;
    uint32_t nsid;

    if (off >= sizeof(nslist)) {
        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    memset(nslist, 0x0, sizeof(nslist));
    trans_len = MIN(sizeof(nslist) - off, buf_len);

    while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
            NVME_CHANGED_NSID_SIZE) {
        /*
         * If more than 1024 namespaces have changed, the first entry of the
         * log page is set to 0xffffffff and the remaining entries to zero,
         * as required by the specification.
         */
        if (i == ARRAY_SIZE(nslist)) {
            memset(nslist, 0x0, sizeof(nslist));
            nslist[0] = 0xffffffff;
            break;
        }

        nslist[i++] = nsid;
        clear_bit(nsid, n->changed_nsids);
    }

    /*
     * Clear all remaining bits in case the loop was left early because more
     * than 1024 namespaces changed.
     */
    if (nslist[0] == 0xffffffff) {
        bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
    }

    return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
}

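/*
 * The Commands Supported and Effects log always reports the admin
 * command set; which I/O command set is reported depends on the command
 * set selected in CC.CSS and, in the all-supported-sets mode, on the CSI
 * given in the command.
 */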
static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
                                 uint64_t off, NvmeRequest *req)
{
    NvmeEffectsLog log = {};
    const uint32_t *src_iocs = NULL;
    uint32_t trans_len;

    if (off >= sizeof(log)) {
        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
    case NVME_CC_CSS_NVM:
        src_iocs = nvme_cse_iocs_nvm;
        /* fall through */
    case NVME_CC_CSS_ADMIN_ONLY:
        break;
    case NVME_CC_CSS_CSI:
        switch (csi) {
        case NVME_CSI_NVM:
            src_iocs = nvme_cse_iocs_nvm;
            break;
        case NVME_CSI_ZONED:
            src_iocs = nvme_cse_iocs_zoned;
            break;
        }
    }

    memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));

    if (src_iocs) {
        memcpy(log.iocs, src_iocs, sizeof(log.iocs));
    }

    trans_len = MIN(sizeof(log) - off, buf_len);

    return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
}

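/*
 * Flexible Data Placement (FDP) log pages. The FDP Configurations log
 * consists of a header followed by one configuration descriptor, which
 * itself carries one reclaim unit handle descriptor per RUH; each
 * descriptor is padded to an 8-byte boundary.
 */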
static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
{
    size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
                       + vss;
    return ROUND_UP(entry_siz, 8);
}

static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
                               uint64_t off, NvmeRequest *req)
{
    uint32_t log_size, trans_len;
    g_autofree uint8_t *buf = NULL;
    NvmeFdpDescrHdr *hdr;
    NvmeRuhDescr *ruhd;
    NvmeEnduranceGroup *endgrp;
    NvmeFdpConfsHdr *log;
    size_t nruh, fdp_descr_size;
    int i;

    if (endgrpid != 1 || !n->subsys) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    endgrp = &n->subsys->endgrp;

    if (endgrp->fdp.enabled) {
        nruh = endgrp->fdp.nruh;
    } else {
        nruh = 1;
    }

    fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
    log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;

    if (off >= log_size) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    trans_len = MIN(log_size - off, buf_len);

    buf = g_malloc0(log_size);
    log = (NvmeFdpConfsHdr *)buf;
    hdr = (NvmeFdpDescrHdr *)(log + 1);
    ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));

    log->num_confs = cpu_to_le16(0);
    log->size = cpu_to_le32(log_size);

    hdr->descr_size = cpu_to_le16(fdp_descr_size);
    if (endgrp->fdp.enabled) {
        hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
        hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
        hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
        hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
        hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
        hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
        hdr->runs = cpu_to_le64(endgrp->fdp.runs);

        for (i = 0; i < nruh; i++) {
            ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
            ruhd++;
        }
    } else {
        /* FDP is not enabled; report a single default configuration */
        hdr->nrg = cpu_to_le16(1);
        hdr->nruh = cpu_to_le16(1);
        hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
        hdr->nnss = cpu_to_le32(1);
        hdr->runs = cpu_to_le64(96 * MiB);

        ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
    }

    return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
}

static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
                                   uint32_t dw10, uint32_t dw12,
                                   uint32_t buf_len, uint64_t off,
                                   NvmeRequest *req)
{
    NvmeRuHandle *ruh;
    NvmeRuhuLog *hdr;
    NvmeRuhuDescr *ruhud;
    NvmeEnduranceGroup *endgrp;
    g_autofree uint8_t *buf = NULL;
    uint32_t log_size, trans_len;
    uint16_t i;

    if (endgrpid != 1 || !n->subsys) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    endgrp = &n->subsys->endgrp;

    if (!endgrp->fdp.enabled) {
        return NVME_FDP_DISABLED | NVME_DNR;
    }

    log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);

    if (off >= log_size) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    trans_len = MIN(log_size - off, buf_len);

    buf = g_malloc0(log_size);
    hdr = (NvmeRuhuLog *)buf;
    ruhud = (NvmeRuhuDescr *)(hdr + 1);

    ruh = endgrp->fdp.ruhs;
    hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);

    for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
        ruhud->ruha = ruh->ruha;
    }

    return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
}

static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
                               uint64_t off, NvmeRequest *req)
{
    NvmeEnduranceGroup *endgrp;
    NvmeFdpStatsLog log = {};
    uint32_t trans_len;

    if (off >= sizeof(NvmeFdpStatsLog)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (endgrpid != 1 || !n->subsys) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!n->subsys->endgrp.fdp.enabled) {
        return NVME_FDP_DISABLED | NVME_DNR;
    }

    endgrp = &n->subsys->endgrp;

    trans_len = MIN(sizeof(log) - off, buf_len);

    /* the spec values are 128 bit; only the lower 64 bits are used here */
    log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
    log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
    log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);

    return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
}

static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
                                uint32_t buf_len, uint64_t off,
                                NvmeRequest *req)
{
    NvmeEnduranceGroup *endgrp;
    NvmeCmd *cmd = &req->cmd;
    bool host_events = (cmd->cdw10 >> 8) & 0x1;
    uint32_t log_size, trans_len;
    NvmeFdpEventBuffer *ebuf;
    g_autofree NvmeFdpEventsLog *elog = NULL;
    NvmeFdpEvent *event;

    if (endgrpid != 1 || !n->subsys) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    endgrp = &n->subsys->endgrp;

    if (!endgrp->fdp.enabled) {
        return NVME_FDP_DISABLED | NVME_DNR;
    }

    if (host_events) {
        ebuf = &endgrp->fdp.host_events;
    } else {
        ebuf = &endgrp->fdp.ctrl_events;
    }

    log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);

    if (off >= log_size) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    trans_len = MIN(log_size - off, buf_len);
    elog = g_malloc0(log_size);
    elog->num_events = cpu_to_le32(ebuf->nelems);
    event = (NvmeFdpEvent *)(elog + 1);

    if (ebuf->nelems && ebuf->start == ebuf->next) {
        unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
        /* the event ring buffer is full and has wrapped; copy in two chunks */
        memcpy(event, &ebuf->events[ebuf->start],
               sizeof(NvmeFdpEvent) * nelems);
        memcpy(event + nelems, ebuf->events,
               sizeof(NvmeFdpEvent) * ebuf->next);
    } else if (ebuf->start < ebuf->next) {
        memcpy(event, &ebuf->events[ebuf->start],
               sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
    }

    return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
}

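/*
 * Get Log Page: the number of dwords to transfer (NUMD) is a zeroes-based
 * 32-bit value split across CDW10 (NUMDL) and CDW11 (NUMDU), and the log
 * page offset is a 64-bit byte offset split across CDW12 (LPOL) and
 * CDW13 (LPOU); the offset must be dword aligned.
 */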
static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;

    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t dw12 = le32_to_cpu(cmd->cdw12);
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint8_t lid = dw10 & 0xff;
    uint8_t lsp = (dw10 >> 8) & 0xf;
    uint8_t rae = (dw10 >> 15) & 0x1;
    uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
    uint32_t numdl, numdu, lspi;
    uint64_t off, lpol, lpou;
    size_t len;
    uint16_t status;

    numdl = (dw10 >> 16);
    numdu = (dw11 & 0xffff);
    lspi = (dw11 >> 16);
    lpol = dw12;
    lpou = dw13;

    len = (((numdu << 16) | numdl) + 1) << 2;
    off = (lpou << 32ULL) | lpol;

    if (off & 0x3) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;
    }

    switch (lid) {
    case NVME_LOG_ERROR_INFO:
        return nvme_error_info(n, rae, len, off, req);
    case NVME_LOG_SMART_INFO:
        return nvme_smart_info(n, rae, len, off, req);
    case NVME_LOG_FW_SLOT_INFO:
        return nvme_fw_log_info(n, len, off, req);
    case NVME_LOG_CHANGED_NSLIST:
        return nvme_changed_nslist(n, rae, len, off, req);
    case NVME_LOG_CMD_EFFECTS:
        return nvme_cmd_effects(n, csi, len, off, req);
    case NVME_LOG_ENDGRP:
        return nvme_endgrp_info(n, rae, len, off, req);
    case NVME_LOG_FDP_CONFS:
        return nvme_fdp_confs(n, lspi, len, off, req);
    case NVME_LOG_FDP_RUH_USAGE:
        return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
    case NVME_LOG_FDP_STATS:
        return nvme_fdp_stats(n, lspi, len, off, req);
    case NVME_LOG_FDP_EVENTS:
        return nvme_fdp_events(n, lspi, len, off, req);
    default:
        trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
        return NVME_INVALID_FIELD | NVME_DNR;
    }
}

static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
    PCIDevice *pci = PCI_DEVICE(n);
    uint16_t offset = (cq->cqid << 3) + (1 << 2);

    n->cq[cq->cqid] = NULL;
    qemu_bh_delete(cq->bh);
    if (cq->ioeventfd_enabled) {
        memory_region_del_eventfd(&n->iomem,
                                  0x1000 + offset, 4, false, 0, &cq->notifier);
        event_notifier_set_handler(&cq->notifier, NULL);
        event_notifier_cleanup(&cq->notifier);
    }
    if (msix_enabled(pci)) {
        msix_vector_unuse(pci, cq->vector);
    }
    if (cq->cqid) {
        /* only I/O queues are allocated dynamically; the admin CQ is
         * embedded in NvmeCtrl */
        g_free(cq);
    }
}

static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
        trace_pci_nvme_err_invalid_del_cq_cqid(qid);
        return NVME_INVALID_CQID | NVME_DNR;
    }

    cq = n->cq[qid];
    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
        trace_pci_nvme_err_invalid_del_cq_notempty(qid);
        return NVME_INVALID_QUEUE_DEL;
    }

    if (cq->irq_enabled && cq->tail != cq->head) {
        n->cq_pending--;
    }

    nvme_irq_deassert(n, cq);
    trace_pci_nvme_del_cq(qid);
    nvme_free_cq(cq, n);
    return NVME_SUCCESS;
}

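/*
 * A newly created completion queue starts with phase tag 1: the guest
 * initializes the queue memory to zero, so the controller posts the
 * first round of completions with the phase bit set and toggles it on
 * every wrap.
 */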
static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t cqid, uint16_t vector, uint16_t size,
                         uint16_t irq_enabled)
{
    PCIDevice *pci = PCI_DEVICE(n);

    if (msix_enabled(pci)) {
        msix_vector_use(pci, vector);
    }
    cq->ctrl = n;
    cq->cqid = cqid;
    cq->size = size;
    cq->dma_addr = dma_addr;
    cq->phase = 1;
    cq->irq_enabled = irq_enabled;
    cq->vector = vector;
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
    if (n->dbbuf_enabled) {
        cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
        cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);

        if (n->params.ioeventfd && cqid != 0) {
            if (!nvme_init_cq_ioeventfd(cq)) {
                cq->ioeventfd_enabled = true;
            }
        }
    }
    n->cq[cqid] = cq;
    cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
                                 &DEVICE(cq->ctrl)->mem_reentrancy_guard);
}

static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCQueue *cq;
    NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t vector = le16_to_cpu(c->irq_vector);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->cq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);
    uint32_t cc = ldl_le_p(&n->bar.cc);
    uint8_t iocqes = NVME_CC_IOCQES(cc);
    uint8_t iosqes = NVME_CC_IOSQES(cc);

    trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
                             NVME_CQ_FLAGS_IEN(qflags) != 0);

    if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
        trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }

    if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
        trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
        trace_pci_nvme_err_invalid_create_cq_size(qsize);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (unlikely(prp1 & (n->page_size - 1))) {
        trace_pci_nvme_err_invalid_create_cq_addr(prp1);
        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
    }
    if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
        trace_pci_nvme_err_invalid_create_cq_vector(vector);
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (unlikely(vector >= n->conf_msix_qsize)) {
        trace_pci_nvme_err_invalid_create_cq_vector(vector);
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
        trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    cq = g_malloc0(sizeof(*cq));
    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
                 NVME_CQ_FLAGS_IEN(qflags));

    /*
     * It is only required to set qs_created when creating a completion
     * queue; creating a submission queue without a matching completion
     * queue will fail.
     */
    n->qs_created = true;
    return NVME_SUCCESS;
}

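/*
 * Identify command handlers. The CNS value in the command selects which
 * data structure is returned; helpers that can be queried for both
 * active and present (allocated) namespaces take an 'active' flag and
 * return a zeroed structure when the namespace does not exist.
 */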
static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};

    return nvme_c2h(n, id, sizeof(id), req);
}

static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_identify_ctrl();

    return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
}

static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
    NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;

    trace_pci_nvme_identify_ctrl_csi(c->csi);

    switch (c->csi) {
    case NVME_CSI_NVM:
        id_nvm->vsl = n->params.vsl;
        id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
        break;

    case NVME_CSI_ZONED:
        ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
        break;

    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return nvme_c2h(n, id, sizeof(id), req);
}

static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);

    trace_pci_nvme_identify_ns(nsid);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        if (!active) {
            ns = nvme_subsys_ns(n->subsys, nsid);
            if (!ns) {
                return nvme_rpt_empty_id_struct(n, req);
            }
        } else {
            return nvme_rpt_empty_id_struct(n, req);
        }
    }

    if (active || ns->csi == NVME_CSI_NVM) {
        return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
    }

    return NVME_INVALID_CMD_SET | NVME_DNR;
}

static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
                                        bool attached)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint16_t min_id = le16_to_cpu(c->ctrlid);
    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
    uint16_t *ids = &list[1];
    NvmeNamespace *ns;
    NvmeCtrl *ctrl;
    int cntlid, nr_ids = 0;

    trace_pci_nvme_identify_ctrl_list(c->cns, min_id);

    if (!n->subsys) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (attached) {
        if (nsid == NVME_NSID_BROADCAST) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ns = nvme_subsys_ns(n->subsys, nsid);
        if (!ns) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
        ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
        if (!ctrl) {
            continue;
        }

        if (attached && !nvme_ns(ctrl, nsid)) {
            continue;
        }

        ids[nr_ids++] = cntlid;
    }

    list[0] = nr_ids;

    return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
}

static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));

    return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
                    sizeof(NvmePriCtrlCap), req);
}

static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
    uint16_t min_id = le16_to_cpu(c->ctrlid);
    uint8_t num_sec_ctrl = n->sec_ctrl_list.numcntl;
    NvmeSecCtrlList list = {0};
    uint8_t i;

    for (i = 0; i < num_sec_ctrl; i++) {
        if (n->sec_ctrl_list.sec[i].scid >= min_id) {
            list.numcntl = num_sec_ctrl - i;
            memcpy(&list.sec, n->sec_ctrl_list.sec + i,
                   list.numcntl * sizeof(NvmeSecCtrlEntry));
            break;
        }
    }

    trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);

    return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
}

static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
                                     bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);

    trace_pci_nvme_identify_ns_csi(nsid, c->csi);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        if (!active) {
            ns = nvme_subsys_ns(n->subsys, nsid);
            if (!ns) {
                return nvme_rpt_empty_id_struct(n, req);
            }
        } else {
            return nvme_rpt_empty_id_struct(n, req);
        }
    }

    if (c->csi == NVME_CSI_NVM) {
        return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
                        req);
    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
        return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
                        req);
    }

    return NVME_INVALID_FIELD | NVME_DNR;
}

static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
                                     bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t min_nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);
    uint32_t *list_ptr = (uint32_t *)list;
    int i, j = 0;

    trace_pci_nvme_identify_nslist(min_nsid);

    /*
     * Both 0xffffffff (NVME_NSID_BROADCAST) and 0xfffffffe are invalid values
     * since the Namespace List must only contain namespace ids strictly
     * greater than the NSID given in the command.
     */
    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            if (!active) {
                ns = nvme_subsys_ns(n->subsys, i);
                if (!ns) {
                    continue;
                }
            } else {
                continue;
            }
        }
        if (ns->params.nsid <= min_nsid) {
            continue;
        }
        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
        if (j == data_len / sizeof(uint32_t)) {
            break;
        }
    }

    return nvme_c2h(n, list, data_len, req);
}

static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
                                         bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t min_nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);
    uint32_t *list_ptr = (uint32_t *)list;
    int i, j = 0;

    trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);

    /* same invalid range for the starting NSID as in nvme_identify_nslist() */
    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            if (!active) {
                ns = nvme_subsys_ns(n->subsys, i);
                if (!ns) {
                    continue;
                }
            } else {
                continue;
            }
        }
        if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
            continue;
        }
        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
        if (j == data_len / sizeof(uint32_t)) {
            break;
        }
    }

    return nvme_c2h(n, list, data_len, req);
}

static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    uint8_t *pos = list;
    struct {
        NvmeIdNsDescr hdr;
        uint8_t v[NVME_NIDL_UUID];
    } QEMU_PACKED uuid = {};
    struct {
        NvmeIdNsDescr hdr;
        uint64_t v;
    } QEMU_PACKED eui64 = {};
    struct {
        NvmeIdNsDescr hdr;
        uint8_t v;
    } QEMU_PACKED csi = {};

    trace_pci_nvme_identify_ns_descr_list(nsid);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!qemu_uuid_is_null(&ns->params.uuid)) {
        uuid.hdr.nidt = NVME_NIDT_UUID;
        uuid.hdr.nidl = NVME_NIDL_UUID;
        memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
        memcpy(pos, &uuid, sizeof(uuid));
        pos += sizeof(uuid);
    }

    if (ns->params.eui64) {
        eui64.hdr.nidt = NVME_NIDT_EUI64;
        eui64.hdr.nidl = NVME_NIDL_EUI64;
        eui64.v = cpu_to_be64(ns->params.eui64);
        memcpy(pos, &eui64, sizeof(eui64));
        pos += sizeof(eui64);
    }

    csi.hdr.nidt = NVME_NIDT_CSI;
    csi.hdr.nidl = NVME_NIDL_CSI;
    csi.v = ns->csi;
    memcpy(pos, &csi, sizeof(csi));
    pos += sizeof(csi);

    return nvme_c2h(n, list, sizeof(list), req);
}

static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);

    trace_pci_nvme_identify_cmd_set();

    NVME_SET_CSI(*list, NVME_CSI_NVM);
    NVME_SET_CSI(*list, NVME_CSI_ZONED);

    return nvme_c2h(n, list, data_len, req);
}

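/*
 * Top-level Identify dispatch: CNS selects the returned data structure.
 * The "active" variants report namespaces attached to this controller,
 * while the "present" variants also cover namespaces that exist in the
 * subsystem but are not currently attached.
 */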
static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;

    trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
                            c->csi);

    switch (c->cns) {
    case NVME_ID_CNS_NS:
        return nvme_identify_ns(n, req, true);
    case NVME_ID_CNS_NS_PRESENT:
        return nvme_identify_ns(n, req, false);
    case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
        return nvme_identify_ctrl_list(n, req, true);
    case NVME_ID_CNS_CTRL_LIST:
        return nvme_identify_ctrl_list(n, req, false);
    case NVME_ID_CNS_PRIMARY_CTRL_CAP:
        return nvme_identify_pri_ctrl_cap(n, req);
    case NVME_ID_CNS_SECONDARY_CTRL_LIST:
        return nvme_identify_sec_ctrl_list(n, req);
    case NVME_ID_CNS_CS_NS:
        return nvme_identify_ns_csi(n, req, true);
    case NVME_ID_CNS_CS_NS_PRESENT:
        return nvme_identify_ns_csi(n, req, false);
    case NVME_ID_CNS_CTRL:
        return nvme_identify_ctrl(n, req);
    case NVME_ID_CNS_CS_CTRL:
        return nvme_identify_ctrl_csi(n, req);
    case NVME_ID_CNS_NS_ACTIVE_LIST:
        return nvme_identify_nslist(n, req, true);
    case NVME_ID_CNS_NS_PRESENT_LIST:
        return nvme_identify_nslist(n, req, false);
    case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
        return nvme_identify_nslist_csi(n, req, true);
    case NVME_ID_CNS_CS_NS_PRESENT_LIST:
        return nvme_identify_nslist_csi(n, req, false);
    case NVME_ID_CNS_NS_DESCR_LIST:
        return nvme_identify_ns_descr_list(n, req);
    case NVME_ID_CNS_IO_COMMAND_SET:
        return nvme_identify_cmd_set(n, req);
    default:
        trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
        return NVME_INVALID_FIELD | NVME_DNR;
    }
}

static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
{
    uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;

    /* bit 0 of the result set to 1 indicates that the command was not
     * aborted */
    req->cqe.result = 1;
    if (nvme_check_sqid(n, sqid)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
{
    trace_pci_nvme_setfeat_timestamp(ts);

    n->host_timestamp = le64_to_cpu(ts);
    n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
}

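/*
 * The reported timestamp is the host-provided value plus the number of
 * milliseconds of virtual clock time that have elapsed since it was
 * set; the origin field distinguishes a host-set timestamp from one
 * counted from reset.
 */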
static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
{
    uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;

    union nvme_timestamp {
        struct {
            uint64_t timestamp:48;
            uint64_t sync:1;
            uint64_t origin:3;
            uint64_t rsvd1:12;
        };
        uint64_t all;
    };

    union nvme_timestamp ts;
    ts.all = 0;
    ts.timestamp = n->host_timestamp + elapsed_time;

    /* if the host timestamp is non-zero, set the timestamp origin */
    ts.origin = n->host_timestamp ? 0x01 : 0x00;

    trace_pci_nvme_getfeat_timestamp(ts.all);

    return cpu_to_le64(ts.all);
}

static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
    uint64_t timestamp = nvme_get_timestamp(n);

    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
}

static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
                                uint32_t *result)
{
    *result = 0;

    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
    *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);

    return NVME_SUCCESS;
}

static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
                                            NvmeRequest *req, uint32_t *result)
{
    NvmeCmd *cmd = &req->cmd;
    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
    uint16_t ph = cdw11 & 0xffff;
    uint8_t noet = (cdw11 >> 16) & 0xff;
    uint16_t ruhid, ret;
    uint32_t nentries = 0;
    uint8_t s_events_ndx = 0;
    size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
    g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
    NvmeRuHandle *ruh;
    NvmeFdpEventDescr *s_event;

    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
        return NVME_FDP_DISABLED | NVME_DNR;
    }

    if (!nvme_ph_valid(ns, ph)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    ruhid = ns->fdp.phs[ph];
    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];

    assert(ruh);

    if (unlikely(noet == 0)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
        uint8_t shift = nvme_fdp_evf_shifts[event_type];
        if (!shift && event_type) {
            /*
             * only the first event type (0) may legitimately have a zero
             * shift; for any other type a zero shift means the event is
             * unsupported
             */
            continue;
        }

        nentries++;

        s_event = &s_events[s_events_ndx];
        s_event->evt = event_type;
        s_event->evta = (ruh->event_filter >> shift) & 0x1;

        /* stop once all requested descriptors have been filled in */
        if ((++s_events_ndx) == noet) {
            break;
        }
    }

    ret = nvme_c2h(n, (uint8_t *)s_events, s_events_siz, req);
    if (ret) {
        return ret;
    }

    *result = nentries;
    return NVME_SUCCESS;
}

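/*
 * Get Features: the SEL field in CDW10 selects whether the current,
 * default, saved or capability value of a feature is returned. No
 * features are saveable by this controller, so the saved selector falls
 * through to the defaults.
 */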
static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t nsid = le32_to_cpu(cmd->nsid);
    uint32_t result;
    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
    NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
    uint16_t iv;
    NvmeNamespace *ns;
    int i;
    uint16_t endgrpid = 0, ret = NVME_SUCCESS;

    static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
        [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
    };

    trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);

    if (!nvme_feature_support[fid]) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
        if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
            /*
             * The Reservation Notification Mask and Reservation Persistence
             * features require a status code of Invalid Field in Command when
             * the NSID is 0xffffffff. Since the device does not support those
             * features, Invalid Namespace or Format can always be returned
             * here, as for all other namespace-specific features.
             */
            return NVME_INVALID_NSID | NVME_DNR;
        }

        if (!nvme_ns(n, nsid)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    switch (sel) {
    case NVME_GETFEAT_SELECT_CURRENT:
        break;
    case NVME_GETFEAT_SELECT_SAVED:
        /* no features are saveable by the controller; fall through */
    case NVME_GETFEAT_SELECT_DEFAULT:
        goto defaults;
    case NVME_GETFEAT_SELECT_CAP:
        result = nvme_feature_cap[fid];
        goto out;
    }

    switch (fid) {
    case NVME_TEMPERATURE_THRESHOLD:
        result = 0;

        /*
         * The controller only implements the Composite Temperature sensor,
         * so return 0 for all other sensors.
         */
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
            goto out;
        }

        switch (NVME_TEMP_THSEL(dw11)) {
        case NVME_TEMP_THSEL_OVER:
            result = n->features.temp_thresh_hi;
            goto out;
        case NVME_TEMP_THSEL_UNDER:
            result = n->features.temp_thresh_low;
            goto out;
        }

        return NVME_INVALID_FIELD | NVME_DNR;
    case NVME_ERROR_RECOVERY:
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        ns = nvme_ns(n, nsid);
        if (unlikely(!ns)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        result = ns->features.err_rec;
        goto out;
    case NVME_VOLATILE_WRITE_CACHE:
        result = 0;
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }

            result = blk_enable_write_cache(ns->blkconf.blk);
            if (result) {
                break;
            }
        }
        trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
        goto out;
    case NVME_ASYNCHRONOUS_EVENT_CONF:
        result = n->features.async_config;
        goto out;
    case NVME_TIMESTAMP:
        return nvme_get_feature_timestamp(n, req);
    case NVME_HOST_BEHAVIOR_SUPPORT:
        return nvme_c2h(n, (uint8_t *)&n->features.hbs,
                        sizeof(n->features.hbs), req);
    case NVME_FDP_MODE:
        endgrpid = dw11 & 0xff;

        if (endgrpid != 0x1) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ret = nvme_get_feature_fdp(n, endgrpid, &result);
        if (ret) {
            return ret;
        }
        goto out;
    case NVME_FDP_EVENTS:
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        ns = nvme_ns(n, nsid);
        if (unlikely(!ns)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ret = nvme_get_feature_fdp_events(n, ns, req, &result);
        if (ret) {
            return ret;
        }
        goto out;
    default:
        break;
    }

defaults:
    switch (fid) {
    case NVME_TEMPERATURE_THRESHOLD:
        result = 0;

        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
            break;
        }

        if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
            result = NVME_TEMPERATURE_WARNING;
        }

        break;
    case NVME_NUMBER_OF_QUEUES:
        result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
        trace_pci_nvme_getfeat_numq(result);
        break;
    case NVME_INTERRUPT_VECTOR_CONF:
        iv = dw11 & 0xffff;
        if (iv >= n->conf_ioqpairs + 1) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        result = iv;
        if (iv == n->admin_cq.vector) {
            result |= NVME_INTVC_NOCOALESCING;
        }
        break;
    case NVME_FDP_MODE:
        endgrpid = dw11 & 0xff;

        if (endgrpid != 0x1) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ret = nvme_get_feature_fdp(n, endgrpid, &result);
        if (ret) {
            return ret;
        }
        goto out;
    default:
        result = nvme_feature_default[fid];
        break;
    }

out:
    req->cqe.result = cpu_to_le32(result);
    return ret;
}

static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
    uint16_t ret;
    uint64_t timestamp;

    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
    if (ret) {
        return ret;
    }

    nvme_set_timestamp(n, timestamp);

    return NVME_SUCCESS;
}

static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
                                            NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;
    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
    uint16_t ph = cdw11 & 0xffff;
    uint8_t noet = (cdw11 >> 16) & 0xff;
    uint16_t ret, ruhid;
    uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
    uint8_t event_mask = 0;
    unsigned int i;
    g_autofree uint8_t *events = g_malloc0(noet);
    NvmeRuHandle *ruh = NULL;

    assert(ns);

    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
        return NVME_FDP_DISABLED | NVME_DNR;
    }

    if (!nvme_ph_valid(ns, ph)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    ruhid = ns->fdp.phs[ph];
    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];

    ret = nvme_h2c(n, events, noet, req);
    if (ret) {
        return ret;
    }

    for (i = 0; i < noet; i++) {
        event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
    }

    if (enable) {
        ruh->event_filter |= event_mask;
    } else {
        ruh->event_filter = ruh->event_filter & ~event_mask;
    }

    return NVME_SUCCESS;
}

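/*
 * Set Features: a feature must be marked changeable in nvme_feature_cap
 * to be set at all, and the save bit is rejected since no feature is
 * saveable. Namespace-specific features accept the broadcast NSID, in
 * which case they are applied to every attached namespace.
 */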
static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = NULL;

    NvmeCmd *cmd = &req->cmd;
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t nsid = le32_to_cpu(cmd->nsid);
    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
    uint8_t save = NVME_SETFEAT_SAVE(dw10);
    uint16_t status;
    int i;

    trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);

    if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
        return NVME_FID_NOT_SAVEABLE | NVME_DNR;
    }

    if (!nvme_feature_support[fid]) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
        if (nsid != NVME_NSID_BROADCAST) {
            if (!nvme_nsid_valid(n, nsid)) {
                return NVME_INVALID_NSID | NVME_DNR;
            }

            ns = nvme_ns(n, nsid);
            if (unlikely(!ns)) {
                return NVME_INVALID_FIELD | NVME_DNR;
            }
        }
    } else if (nsid && nsid != NVME_NSID_BROADCAST) {
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
    }

    if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
    }

    switch (fid) {
    case NVME_TEMPERATURE_THRESHOLD:
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
            break;
        }

        switch (NVME_TEMP_THSEL(dw11)) {
        case NVME_TEMP_THSEL_OVER:
            n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
            break;
        case NVME_TEMP_THSEL_UNDER:
            n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
            break;
        default:
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        if ((n->temperature >= n->features.temp_thresh_hi) ||
            (n->temperature <= n->features.temp_thresh_low)) {
            nvme_smart_event(n, NVME_SMART_TEMPERATURE);
        }

        break;
    case NVME_ERROR_RECOVERY:
        if (nsid == NVME_NSID_BROADCAST) {
            for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
                ns = nvme_ns(n, i);

                if (!ns) {
                    continue;
                }

                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
                    ns->features.err_rec = dw11;
                }
            }

            break;
        }

        assert(ns);
        if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
            ns->features.err_rec = dw11;
        }
        break;
    case NVME_VOLATILE_WRITE_CACHE:
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }

            if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
                blk_flush(ns->blkconf.blk);
            }

            blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
        }

        break;

    case NVME_NUMBER_OF_QUEUES:
        if (n->qs_created) {
            return NVME_CMD_SEQ_ERROR | NVME_DNR;
        }

        /*
         * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for
         * NCQR and NSQR.
         */
        if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
                                    ((dw11 >> 16) & 0xffff) + 1,
                                    n->conf_ioqpairs,
                                    n->conf_ioqpairs);
        req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
                                      ((n->conf_ioqpairs - 1) << 16));
        break;
    case NVME_ASYNCHRONOUS_EVENT_CONF:
        n->features.async_config = dw11;
        break;
    case NVME_TIMESTAMP:
        return nvme_set_feature_timestamp(n, req);
    case NVME_HOST_BEHAVIOR_SUPPORT:
        status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
                          sizeof(n->features.hbs), req);
        if (status) {
            return status;
        }

        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);

            if (!ns) {
                continue;
            }

            ns->id_ns.nlbaf = ns->nlbaf - 1;
            if (!n->features.hbs.lbafee) {
                ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
            }
        }

        return status;
    case NVME_COMMAND_SET_PROFILE:
        if (dw11 & 0x1ff) {
            trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
            return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
        }
        break;
    case NVME_FDP_MODE:
        /* spec: abort with cmd seq err if there's one or more NS' in endgrp */
        return NVME_CMD_SEQ_ERROR | NVME_DNR;
    case NVME_FDP_EVENTS:
        return nvme_set_feature_fdp_events(n, ns, req);
    default:
        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
    }
    return NVME_SUCCESS;
}

static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_aer(nvme_cid(req));

    /* n->params.aerl is a zeroes-based value */
    if (n->outstanding_aers > n->params.aerl) {
        trace_pci_nvme_aer_aerl_exceeded();
        return NVME_AER_LIMIT_EXCEEDED;
    }

    n->aer_reqs[n->outstanding_aers] = req;
    n->outstanding_aers++;

    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }

    return NVME_NO_COMPLETE;
}

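/*
 * The DSM Range Size Limit (DMRSL) is expressed in logical blocks, so it
 * is recomputed as the minimum over all attached namespaces of the block
 * layer's maximum request size divided by the namespace's block size.
 */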
static void nvme_update_dmrsl(NvmeCtrl *n)
{
    int nsid;

    for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
        NvmeNamespace *ns = nvme_ns(n, nsid);
        if (!ns) {
            continue;
        }

        n->dmrsl = MIN_NON_ZERO(n->dmrsl,
                                BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
    }
}

static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
{
    uint32_t cc = ldl_le_p(&n->bar.cc);

    ns->iocs = nvme_cse_iocs_none;
    switch (ns->csi) {
    case NVME_CSI_NVM:
        if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
            ns->iocs = nvme_cse_iocs_nvm;
        }
        break;
    case NVME_CSI_ZONED:
        if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
            ns->iocs = nvme_cse_iocs_zoned;
        } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
            ns->iocs = nvme_cse_iocs_nvm;
        }
        break;
    }
}

static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    NvmeCtrl *ctrl;
    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint8_t sel = dw10 & 0xf;
    uint16_t *nr_ids = &list[0];
    uint16_t *ids = &list[1];
    uint16_t ret;
    int i;

    trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);

    if (!nvme_nsid_valid(n, nsid)) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_subsys_ns(n->subsys, nsid);
    if (!ns) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
    if (ret) {
        return ret;
    }

    if (!*nr_ids) {
        return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
    }

    *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
    for (i = 0; i < *nr_ids; i++) {
        ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
        if (!ctrl) {
            return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
        }

        switch (sel) {
        case NVME_NS_ATTACHMENT_ATTACH:
            if (nvme_ns(ctrl, nsid)) {
                return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
            }

            if (ns->attached && !ns->params.shared) {
                return NVME_NS_PRIVATE | NVME_DNR;
            }

            nvme_attach_ns(ctrl, ns);
            nvme_select_iocs_ns(ctrl, ns);

            break;

        case NVME_NS_ATTACHMENT_DETACH:
            if (!nvme_ns(ctrl, nsid)) {
                return NVME_NS_NOT_ATTACHED | NVME_DNR;
            }

            ctrl->namespaces[nsid] = NULL;
            ns->attached--;

            nvme_update_dmrsl(ctrl);

            break;

        default:
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        /*
         * Add the namespace id to the changed namespace id list for event
         * clearing via the Get Log Page command.
         */
        if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
            nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
                               NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
                               NVME_LOG_CHANGED_NSLIST);
        }
    }

    return NVME_SUCCESS;
}

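/*
 * Format NVM is implemented as a cancellable AIO state machine: the
 * NvmeFormatAIOCB below tracks the namespace being formatted and the
 * current offset, and nvme_do_format()/nvme_format_ns_cb() re-arm each
 * other until every selected namespace has been zeroed and its LBA
 * format fields updated.
 */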
typedef struct NvmeFormatAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    int ret;

    NvmeNamespace *ns;
    uint32_t nsid;
    bool broadcast;
    int64_t offset;

    uint8_t lbaf;
    uint8_t mset;
    uint8_t pi;
    uint8_t pil;
} NvmeFormatAIOCB;

static void nvme_format_cancel(BlockAIOCB *aiocb)
{
    NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
        iocb->aiocb = NULL;
    }
}

static const AIOCBInfo nvme_format_aiocb_info = {
    .aiocb_size = sizeof(NvmeFormatAIOCB),
    .cancel_async = nvme_format_cancel,
    .get_aio_context = nvme_get_aio_context,
};

static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
                            uint8_t pi, uint8_t pil)
{
    uint8_t lbafl = lbaf & 0xf;
    uint8_t lbafu = lbaf >> 4;

    trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);

    ns->id_ns.dps = (pil << 3) | pi;
    ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;

    nvme_ns_init_format(ns);
}

static void nvme_do_format(NvmeFormatAIOCB *iocb);

static void nvme_format_ns_cb(void *opaque, int ret)
{
    NvmeFormatAIOCB *iocb = opaque;
    NvmeNamespace *ns = iocb->ns;
    int bytes;

    if (iocb->ret < 0) {
        goto done;
    } else if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

    assert(ns);

    /* zero the namespace in chunks of at most BDRV_REQUEST_MAX_BYTES */
    if (iocb->offset < ns->size) {
        bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);

        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
                                            bytes, BDRV_REQ_MAY_UNMAP,
                                            nvme_format_ns_cb, iocb);

        iocb->offset += bytes;
        return;
    }

    nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
    ns->status = 0x0;
    iocb->ns = NULL;
    iocb->offset = 0;

done:
    nvme_do_format(iocb);
}

static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
{
    if (ns->params.zoned) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (lbaf > ns->id_ns.nlbaf) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static void nvme_do_format(NvmeFormatAIOCB *iocb)
{
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint8_t lbaf = dw10 & 0xf;
    uint8_t pi = (dw10 >> 5) & 0x7;
    uint16_t status;
    int i;

    if (iocb->ret < 0) {
        goto done;
    }

    if (iocb->broadcast) {
        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
            iocb->ns = nvme_ns(n, i);
            if (iocb->ns) {
                iocb->nsid = i;
                break;
            }
        }
    }

    if (!iocb->ns) {
        goto done;
    }

    status = nvme_format_check(iocb->ns, lbaf, pi);
    if (status) {
        req->status = status;
        goto done;
    }

    iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
    nvme_format_ns_cb(iocb, 0);
    return;

done:
    iocb->common.cb(iocb->common.opaque, iocb->ret);
    qemu_aio_unref(iocb);
}

static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeFormatAIOCB *iocb;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint8_t lbaf = dw10 & 0xf;
    uint8_t mset = (dw10 >> 4) & 0x1;
    uint8_t pi = (dw10 >> 5) & 0x7;
    uint8_t pil = (dw10 >> 8) & 0x1;
    uint8_t lbafu = (dw10 >> 12) & 0x3;
    uint16_t status;

    iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);

    iocb->req = req;
    iocb->ret = 0;
    iocb->ns = NULL;
    iocb->nsid = 0;
    iocb->lbaf = lbaf;
    iocb->mset = mset;
    iocb->pi = pi;
    iocb->pil = pil;
    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
    iocb->offset = 0;

    if (n->features.hbs.lbafee) {
        iocb->lbaf |= lbafu << 4;
    }

    if (!iocb->broadcast) {
        if (!nvme_nsid_valid(n, nsid)) {
            status = NVME_INVALID_NSID | NVME_DNR;
            goto out;
        }

        iocb->ns = nvme_ns(n, nsid);
        if (!iocb->ns) {
            status = NVME_INVALID_FIELD | NVME_DNR;
            goto out;
        }
    }

    req->aiocb = &iocb->common;
    nvme_do_format(iocb);

    return NVME_NO_COMPLETE;

out:
    qemu_aio_unref(iocb);

    return status;
}

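/*
 * SR-IOV Virtualization Management: flexible queue and interrupt
 * resources are moved between the primary controller and its secondary
 * (VF) controllers. A secondary controller must be offline while its
 * resources are modified and is reset when it is onlined or offlined.
 */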
static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
                                  int *num_prim, int *num_sec)
{
    *num_total = le32_to_cpu(rt ?
                             n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
    *num_prim = le16_to_cpu(rt ?
                            n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
    *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
}

static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
                                             uint16_t cntlid, uint8_t rt,
                                             int nr)
{
    int num_total, num_prim, num_sec;

    if (cntlid != n->cntlid) {
        return NVME_INVALID_CTRL_ID | NVME_DNR;
    }

    nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);

    if (nr > num_total) {
        return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
    }

    if (nr > num_total - num_sec) {
        return NVME_INVALID_RESOURCE_ID | NVME_DNR;
    }

    if (rt) {
        n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
    } else {
        n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
    }

    req->cqe.result = cpu_to_le32(nr);
    return req->status;
}

static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
                                 uint8_t rt, int nr)
{
    int prev_nr, prev_total;

    if (rt) {
        prev_nr = le16_to_cpu(sctrl->nvi);
        prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
        sctrl->nvi = cpu_to_le16(nr);
        n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
    } else {
        prev_nr = le16_to_cpu(sctrl->nvq);
        prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
        sctrl->nvq = cpu_to_le16(nr);
        n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
    }
}

static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
                                            uint16_t cntlid, uint8_t rt, int nr)
{
    int num_total, num_prim, num_sec, num_free, diff, limit;
    NvmeSecCtrlEntry *sctrl;

    sctrl = nvme_sctrl_for_cntlid(n, cntlid);
    if (!sctrl) {
        return NVME_INVALID_CTRL_ID | NVME_DNR;
    }

    if (sctrl->scs) {
        return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
    }

    limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
    if (nr > limit) {
        return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
    }

    nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
    num_free = num_total - num_prim - num_sec;
    diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);

    if (diff > num_free) {
        return NVME_INVALID_RESOURCE_ID | NVME_DNR;
    }

    nvme_update_virt_res(n, sctrl, rt, nr);
    req->cqe.result = cpu_to_le32(nr);

    return req->status;
}

6744static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
6745{
6746 PCIDevice *pci = PCI_DEVICE(n);
6747 NvmeCtrl *sn = NULL;
6748 NvmeSecCtrlEntry *sctrl;
6749 int vf_index;
6750
6751 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6752 if (!sctrl) {
6753 return NVME_INVALID_CTRL_ID | NVME_DNR;
6754 }
6755
6756 if (!pci_is_vf(pci)) {
6757 vf_index = le16_to_cpu(sctrl->vfn) - 1;
6758 sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
6759 }
6760
6761 if (online) {
6762 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
6763 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
6764 }
6765
6766 if (!sctrl->scs) {
6767 sctrl->scs = 0x1;
6768 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
6769 }
6770 } else {
6771 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
6772 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
6773
6774 if (sctrl->scs) {
6775 sctrl->scs = 0x0;
6776 if (sn) {
6777 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
6778 }
6779 }
6780 }
6781
6782 return NVME_SUCCESS;
6783}
6784
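/*
 * Virtualization Management command; dw10 carries ACT in bits 3:0, RT in
 * bits 10:8 and CNTLID in bits 31:16, dw11 carries NR in bits 15:0.
 */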
6785static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
6786{
6787 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6788 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
6789 uint8_t act = dw10 & 0xf;
6790 uint8_t rt = (dw10 >> 8) & 0x7;
6791 uint16_t cntlid = (dw10 >> 16) & 0xffff;
6792 int nr = dw11 & 0xffff;
6793
6794 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
6795
6796 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
6797 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6798 }
6799
6800 switch (act) {
6801 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
6802 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
6803 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
6804 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
6805 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
6806 return nvme_virt_set_state(n, cntlid, true);
6807 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
6808 return nvme_virt_set_state(n, cntlid, false);
6809 default:
6810 return NVME_INVALID_FIELD | NVME_DNR;
6811 }
6812}
6813
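/*
 * Doorbell Buffer Config: PRP1 points to the shadow doorbell buffer and
 * PRP2 to the eventidx buffer; both must be page aligned. All pre-existing
 * queues are retargeted to the shadow buffers in the loop below.
 */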
6814static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
6815{
6816 PCIDevice *pci = PCI_DEVICE(n);
6817 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
6818 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
6819 int i;
6820
    /* the doorbell and eventidx buffers must be page aligned */
6822 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
6823 return NVME_INVALID_FIELD | NVME_DNR;
6824 }
6825
    /* save the shadow buffer base addresses for use at queue creation */
6827 n->dbbuf_dbs = dbs_addr;
6828 n->dbbuf_eis = eis_addr;
6829 n->dbbuf_enabled = true;
6830
6831 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
6832 NvmeSQueue *sq = n->sq[i];
6833 NvmeCQueue *cq = n->cq[i];
6834
6835 if (sq) {
            /*
             * CAP.DSTRD is 0, so the doorbell stride is 4 bytes and each
             * queue pair occupies an 8-byte slot in both buffers: the
             * submission queue entry at (i << 3), the completion queue
             * entry at (i << 3) + (1 << 2).
             */
6841 sq->db_addr = dbs_addr + (i << 3);
6842 sq->ei_addr = eis_addr + (i << 3);
6843 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
6844
6845 if (n->params.ioeventfd && sq->sqid != 0) {
6846 if (!nvme_init_sq_ioeventfd(sq)) {
6847 sq->ioeventfd_enabled = true;
6848 }
6849 }
6850 }
6851
6852 if (cq) {
            /* the completion queue entry sits in the upper half of the slot */
6854 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
6855 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
6856 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
6857
6858 if (n->params.ioeventfd && cq->cqid != 0) {
6859 if (!nvme_init_cq_ioeventfd(cq)) {
6860 cq->ioeventfd_enabled = true;
6861 }
6862 }
6863 }
6864 }
6865
6866 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
6867
6868 return NVME_SUCCESS;
6869}
6870
6871static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
6872{
6873 return NVME_INVALID_FIELD | NVME_DNR;
6874}
6875
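/*
 * Directive Receive: only the Identify directive's Return Parameters
 * operation is implemented. dw10 holds NUMD, a 0's based dword count, so
 * the transfer length is capped at MIN(sizeof(id), (NUMD + 1) * 4) bytes.
 */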
6876static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
6877{
6878 NvmeNamespace *ns;
6879 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6880 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
6881 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6882 uint8_t doper, dtype;
6883 uint32_t numd, trans_len;
6884 NvmeDirectiveIdentify id = {
6885 .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
6886 .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
6887 };
6888
6889 numd = dw10 + 1;
6890 doper = dw11 & 0xff;
6891 dtype = (dw11 >> 8) & 0xff;
6892
6893 trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
6894
6895 if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
6896 doper != NVME_DIRECTIVE_RETURN_PARAMS) {
6897 return NVME_INVALID_FIELD | NVME_DNR;
6898 }
6899
6900 ns = nvme_ns(n, nsid);
6901 if (!ns) {
6902 return NVME_INVALID_FIELD | NVME_DNR;
6903 }
6904
6905 switch (dtype) {
6906 case NVME_DIRECTIVE_IDENTIFY:
6907 switch (doper) {
6908 case NVME_DIRECTIVE_RETURN_PARAMS:
6909 if (ns->endgrp && ns->endgrp->fdp.enabled) {
6910 id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
6911 id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
6912 id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
6913 }
6914
6915 return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
6916
6917 default:
6918 return NVME_INVALID_FIELD | NVME_DNR;
6919 }
6920
6921 default:
6922 return NVME_INVALID_FIELD;
6923 }
6924}
6925
6926static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
6927{
6928 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
6929 nvme_adm_opc_str(req->cmd.opcode));
6930
6931 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
6932 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
6933 return NVME_INVALID_OPCODE | NVME_DNR;
6934 }
6935
    /* SGLs shall not be used for Admin commands in NVMe over PCIe */
6937 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
6938 return NVME_INVALID_FIELD | NVME_DNR;
6939 }
6940
6941 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
6942 return NVME_INVALID_FIELD;
6943 }
6944
6945 switch (req->cmd.opcode) {
6946 case NVME_ADM_CMD_DELETE_SQ:
6947 return nvme_del_sq(n, req);
6948 case NVME_ADM_CMD_CREATE_SQ:
6949 return nvme_create_sq(n, req);
6950 case NVME_ADM_CMD_GET_LOG_PAGE:
6951 return nvme_get_log(n, req);
6952 case NVME_ADM_CMD_DELETE_CQ:
6953 return nvme_del_cq(n, req);
6954 case NVME_ADM_CMD_CREATE_CQ:
6955 return nvme_create_cq(n, req);
6956 case NVME_ADM_CMD_IDENTIFY:
6957 return nvme_identify(n, req);
6958 case NVME_ADM_CMD_ABORT:
6959 return nvme_abort(n, req);
6960 case NVME_ADM_CMD_SET_FEATURES:
6961 return nvme_set_feature(n, req);
6962 case NVME_ADM_CMD_GET_FEATURES:
6963 return nvme_get_feature(n, req);
6964 case NVME_ADM_CMD_ASYNC_EV_REQ:
6965 return nvme_aer(n, req);
6966 case NVME_ADM_CMD_NS_ATTACHMENT:
6967 return nvme_ns_attachment(n, req);
6968 case NVME_ADM_CMD_VIRT_MNGMT:
6969 return nvme_virt_mngmt(n, req);
6970 case NVME_ADM_CMD_DBBUF_CONFIG:
6971 return nvme_dbbuf_config(n, req);
6972 case NVME_ADM_CMD_FORMAT_NVM:
6973 return nvme_format(n, req);
6974 case NVME_ADM_CMD_DIRECTIVE_SEND:
6975 return nvme_directive_send(n, req);
6976 case NVME_ADM_CMD_DIRECTIVE_RECV:
6977 return nvme_directive_receive(n, req);
6978 default:
6979 assert(false);
6980 }
6981
6982 return NVME_INVALID_OPCODE | NVME_DNR;
6983}
6984
6985static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
6986{
6987 trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
6988
6989 stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
6990 MEMTXATTRS_UNSPECIFIED);
6991}
6992
6993static void nvme_update_sq_tail(NvmeSQueue *sq)
6994{
6995 ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
6996 MEMTXATTRS_UNSPECIFIED);
6997
6998 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
6999}
7000
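/*
 * Bottom half that drains a submission queue: read each command at
 * sq->dma_addr + (head << NVME_SQES), dispatch to the admin or I/O handler,
 * and post a completion unless the handler returned NVME_NO_COMPLETE. With
 * shadow doorbells enabled, the eventidx is published and the tail re-read
 * from the shadow buffer on every iteration.
 */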
7001static void nvme_process_sq(void *opaque)
7002{
7003 NvmeSQueue *sq = opaque;
7004 NvmeCtrl *n = sq->ctrl;
7005 NvmeCQueue *cq = n->cq[sq->cqid];
7006
7007 uint16_t status;
7008 hwaddr addr;
7009 NvmeCmd cmd;
7010 NvmeRequest *req;
7011
7012 if (n->dbbuf_enabled) {
7013 nvme_update_sq_tail(sq);
7014 }
7015
7016 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7017 addr = sq->dma_addr + (sq->head << NVME_SQES);
7018 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7019 trace_pci_nvme_err_addr_read(addr);
7020 trace_pci_nvme_err_cfs();
7021 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7022 break;
7023 }
7024 nvme_inc_sq_head(sq);
7025
7026 req = QTAILQ_FIRST(&sq->req_list);
7027 QTAILQ_REMOVE(&sq->req_list, req, entry);
7028 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7029 nvme_req_clear(req);
7030 req->cqe.cid = cmd.cid;
7031 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7032
7033 status = sq->sqid ? nvme_io_cmd(n, req) :
7034 nvme_admin_cmd(n, req);
7035 if (status != NVME_NO_COMPLETE) {
7036 req->status = status;
7037 nvme_enqueue_req_completion(cq, req);
7038 }
7039
7040 if (n->dbbuf_enabled) {
7041 nvme_update_sq_eventidx(sq);
7042 nvme_update_sq_tail(sq);
7043 }
7044 }
7045}
7046
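/*
 * Keep the MSI-X Table Size field (0's based) in config space in sync with
 * the number of interrupt vectors currently available to this function.
 */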
7047static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7048{
7049 uint8_t *config;
7050
7051 if (!msix_present(pci_dev)) {
7052 return;
7053 }
7054
7055 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7056
7057 config = pci_dev->config + pci_dev->msix_cap;
7058 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7059 table_size - 1);
7060}
7061
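/*
 * Latch the flexible resource allocations: a VF adopts the counts assigned
 * to its secondary controller entry, while the PF applies any allocation
 * staged by a prior Virtualization Management command.
 */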
7062static void nvme_activate_virt_res(NvmeCtrl *n)
7063{
7064 PCIDevice *pci_dev = PCI_DEVICE(n);
7065 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7066 NvmeSecCtrlEntry *sctrl;
7067
    /* derive queue and interrupt limits from the assigned resources */
7069 if (pci_is_vf(pci_dev)) {
7070 sctrl = nvme_sctrl(n);
7071 cap->vqprt = sctrl->nvq;
7072 cap->viprt = sctrl->nvi;
7073 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7074 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7075 } else {
7076 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7077 cap->virfap = n->next_pri_ctrl_cap.virfap;
7078 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7079 le16_to_cpu(cap->vqrfap) - 1;
7080 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7081 le16_to_cpu(cap->virfap);
7082 }
7083}
7084
7085static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7086{
7087 PCIDevice *pci_dev = PCI_DEVICE(n);
7088 NvmeSecCtrlEntry *sctrl;
7089 NvmeNamespace *ns;
7090 int i;
7091
7092 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7093 ns = nvme_ns(n, i);
7094 if (!ns) {
7095 continue;
7096 }
7097
7098 nvme_ns_drain(ns);
7099 }
7100
7101 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7102 if (n->sq[i] != NULL) {
7103 nvme_free_sq(n->sq[i], n);
7104 }
7105 }
7106 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7107 if (n->cq[i] != NULL) {
7108 nvme_free_cq(n->cq[i], n);
7109 }
7110 }
7111
7112 while (!QTAILQ_EMPTY(&n->aer_queue)) {
7113 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7114 QTAILQ_REMOVE(&n->aer_queue, event, entry);
7115 g_free(event);
7116 }
7117
7118 if (n->params.sriov_max_vfs) {
7119 if (!pci_is_vf(pci_dev)) {
7120 for (i = 0; i < n->sec_ctrl_list.numcntl; i++) {
7121 sctrl = &n->sec_ctrl_list.sec[i];
7122 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7123 }
7124
7125 if (rst != NVME_RESET_CONTROLLER) {
7126 pcie_sriov_pf_disable_vfs(pci_dev);
7127 }
7128 }
7129
7130 if (rst != NVME_RESET_CONTROLLER) {
7131 nvme_activate_virt_res(n);
7132 }
7133 }
7134
7135 n->aer_queued = 0;
7136 n->aer_mask = 0;
7137 n->outstanding_aers = 0;
7138 n->qs_created = false;
7139
7140 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7141
7142 if (pci_is_vf(pci_dev)) {
7143 sctrl = nvme_sctrl(n);
7144
7145 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7146 } else {
7147 stl_le_p(&n->bar.csts, 0);
7148 }
7149
7150 stl_le_p(&n->bar.intms, 0);
7151 stl_le_p(&n->bar.intmc, 0);
7152 stl_le_p(&n->bar.cc, 0);
7153
7154 n->dbbuf_dbs = 0;
7155 n->dbbuf_eis = 0;
7156 n->dbbuf_enabled = false;
7157}
7158
7159static void nvme_ctrl_shutdown(NvmeCtrl *n)
7160{
7161 NvmeNamespace *ns;
7162 int i;
7163
7164 if (n->pmr.dev) {
7165 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7166 }
7167
7168 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7169 ns = nvme_ns(n, i);
7170 if (!ns) {
7171 continue;
7172 }
7173
7174 nvme_ns_shutdown(ns);
7175 }
7176}
7177
7178static void nvme_select_iocs(NvmeCtrl *n)
7179{
7180 NvmeNamespace *ns;
7181 int i;
7182
7183 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7184 ns = nvme_ns(n, i);
7185 if (!ns) {
7186 continue;
7187 }
7188
7189 nvme_select_iocs_ns(n, ns);
7190 }
7191}
7192
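/*
 * Validate CC, AQA, ASQ and ACQ against CAP and, if everything checks out,
 * derive the page size and bring up the admin queue pair.
 */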
7193static int nvme_start_ctrl(NvmeCtrl *n)
7194{
7195 uint64_t cap = ldq_le_p(&n->bar.cap);
7196 uint32_t cc = ldl_le_p(&n->bar.cc);
7197 uint32_t aqa = ldl_le_p(&n->bar.aqa);
7198 uint64_t asq = ldq_le_p(&n->bar.asq);
7199 uint64_t acq = ldq_le_p(&n->bar.acq);
7200 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7201 uint32_t page_size = 1 << page_bits;
7202 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7203
7204 if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7205 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7206 le16_to_cpu(sctrl->nvq));
7207 return -1;
7208 }
7209 if (unlikely(n->cq[0])) {
7210 trace_pci_nvme_err_startfail_cq();
7211 return -1;
7212 }
7213 if (unlikely(n->sq[0])) {
7214 trace_pci_nvme_err_startfail_sq();
7215 return -1;
7216 }
7217 if (unlikely(asq & (page_size - 1))) {
7218 trace_pci_nvme_err_startfail_asq_misaligned(asq);
7219 return -1;
7220 }
7221 if (unlikely(acq & (page_size - 1))) {
7222 trace_pci_nvme_err_startfail_acq_misaligned(acq);
7223 return -1;
7224 }
7225 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7226 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7227 return -1;
7228 }
7229 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7230 trace_pci_nvme_err_startfail_page_too_small(
7231 NVME_CC_MPS(cc),
7232 NVME_CAP_MPSMIN(cap));
7233 return -1;
7234 }
    if (unlikely(NVME_CC_MPS(cc) > NVME_CAP_MPSMAX(cap))) {
7237 trace_pci_nvme_err_startfail_page_too_large(
7238 NVME_CC_MPS(cc),
7239 NVME_CAP_MPSMAX(cap));
7240 return -1;
7241 }
7242 if (unlikely(!NVME_AQA_ASQS(aqa))) {
7243 trace_pci_nvme_err_startfail_asqent_sz_zero();
7244 return -1;
7245 }
7246 if (unlikely(!NVME_AQA_ACQS(aqa))) {
7247 trace_pci_nvme_err_startfail_acqent_sz_zero();
7248 return -1;
7249 }
7250
7251 n->page_bits = page_bits;
7252 n->page_size = page_size;
7253 n->max_prp_ents = n->page_size / sizeof(uint64_t);
7254 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7255 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7256
7257 nvme_set_timestamp(n, 0ULL);
7258
7259 nvme_select_iocs(n);
7260
7261 return 0;
7262}
7263
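/*
 * Program the CMBLOC and CMBSZ registers. An SZU of 2 selects 1 MiB size
 * units, so SZ can be set to cmb_size_mb directly.
 */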
7264static void nvme_cmb_enable_regs(NvmeCtrl *n)
7265{
7266 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7267 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7268
7269 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7270 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7271 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7272 stl_le_p(&n->bar.cmbloc, cmbloc);
7273
7274 NVME_CMBSZ_SET_SQS(cmbsz, 1);
7275 NVME_CMBSZ_SET_CQS(cmbsz, 0);
7276 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7277 NVME_CMBSZ_SET_RDS(cmbsz, 1);
7278 NVME_CMBSZ_SET_WDS(cmbsz, 1);
7279 NVME_CMBSZ_SET_SZU(cmbsz, 2);
7280 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7281 stl_le_p(&n->bar.cmbsz, cmbsz);
7282}
7283
7284static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7285 unsigned size)
7286{
7287 PCIDevice *pci = PCI_DEVICE(n);
7288 uint64_t cap = ldq_le_p(&n->bar.cap);
7289 uint32_t cc = ldl_le_p(&n->bar.cc);
7290 uint32_t intms = ldl_le_p(&n->bar.intms);
7291 uint32_t csts = ldl_le_p(&n->bar.csts);
7292 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7293
7294 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7295 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7296 "MMIO write not 32-bit aligned,"
7297 " offset=0x%"PRIx64"", offset);
        /* should be ignored, fall through for now */
7299 }
7300
7301 if (unlikely(size < sizeof(uint32_t))) {
7302 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7303 "MMIO write smaller than 32-bits,"
7304 " offset=0x%"PRIx64", size=%u",
7305 offset, size);
        /* should be ignored, fall through for now */
7307 }
7308
7309 switch (offset) {
7310 case NVME_REG_INTMS:
7311 if (unlikely(msix_enabled(pci))) {
7312 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7313 "undefined access to interrupt mask set"
7314 " when MSI-X is enabled");
            /* should be ignored, fall through for now */
7316 }
7317 intms |= data;
7318 stl_le_p(&n->bar.intms, intms);
7319 n->bar.intmc = n->bar.intms;
7320 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7321 nvme_irq_check(n);
7322 break;
7323 case NVME_REG_INTMC:
7324 if (unlikely(msix_enabled(pci))) {
7325 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7326 "undefined access to interrupt mask clr"
7327 " when MSI-X is enabled");
            /* should be ignored, fall through for now */
7329 }
7330 intms &= ~data;
7331 stl_le_p(&n->bar.intms, intms);
7332 n->bar.intmc = n->bar.intms;
7333 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7334 nvme_irq_check(n);
7335 break;
7336 case NVME_REG_CC:
7337 stl_le_p(&n->bar.cc, data);
7338
7339 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7340
7341 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7342 trace_pci_nvme_mmio_shutdown_set();
7343 nvme_ctrl_shutdown(n);
7344 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7345 csts |= NVME_CSTS_SHST_COMPLETE;
7346 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7347 trace_pci_nvme_mmio_shutdown_cleared();
7348 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7349 }
7350
7351 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7352 if (unlikely(nvme_start_ctrl(n))) {
7353 trace_pci_nvme_err_startfail();
7354 csts = NVME_CSTS_FAILED;
7355 } else {
7356 trace_pci_nvme_mmio_start_success();
7357 csts = NVME_CSTS_READY;
7358 }
7359 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7360 trace_pci_nvme_mmio_stopped();
7361 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
            /* nvme_ctrl_reset() already updated CSTS; skip the store below */
7363 break;
7364 }
7365
7366 stl_le_p(&n->bar.csts, csts);
7367
7368 break;
7369 case NVME_REG_CSTS:
7370 if (data & (1 << 4)) {
7371 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7372 "attempted to W1C CSTS.NSSRO"
7373 " but CAP.NSSRS is zero (not supported)");
7374 } else if (data != 0) {
7375 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7376 "attempted to set a read only bit"
7377 " of controller status");
7378 }
7379 break;
7380 case NVME_REG_NSSR:
7381 if (data == 0x4e564d65) {
7382 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7383 } else {
            /* writes of any other value have no effect, per the spec */
7385 return;
7386 }
7387 break;
7388 case NVME_REG_AQA:
7389 stl_le_p(&n->bar.aqa, data);
7390 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7391 break;
7392 case NVME_REG_ASQ:
7393 stn_le_p(&n->bar.asq, size, data);
7394 trace_pci_nvme_mmio_asqaddr(data);
7395 break;
7396 case NVME_REG_ASQ + 4:
7397 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7398 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7399 break;
7400 case NVME_REG_ACQ:
7401 trace_pci_nvme_mmio_acqaddr(data);
7402 stn_le_p(&n->bar.acq, size, data);
7403 break;
7404 case NVME_REG_ACQ + 4:
7405 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7406 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7407 break;
7408 case NVME_REG_CMBLOC:
7409 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7410 "invalid write to reserved CMBLOC"
7411 " when CMBSZ is zero, ignored");
7412 return;
7413 case NVME_REG_CMBSZ:
7414 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7415 "invalid write to read only CMBSZ, ignored");
7416 return;
7417 case NVME_REG_CMBMSC:
7418 if (!NVME_CAP_CMBS(cap)) {
7419 return;
7420 }
7421
7422 stn_le_p(&n->bar.cmbmsc, size, data);
7423 n->cmb.cmse = false;
7424
7425 if (NVME_CMBMSC_CRE(data)) {
7426 nvme_cmb_enable_regs(n);
7427
7428 if (NVME_CMBMSC_CMSE(data)) {
7429 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7430 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7431 if (cba + int128_get64(n->cmb.mem.size) < cba) {
7432 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7433 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7434 stl_le_p(&n->bar.cmbsts, cmbsts);
7435 return;
7436 }
7437
7438 n->cmb.cba = cba;
7439 n->cmb.cmse = true;
7440 }
7441 } else {
7442 n->bar.cmbsz = 0;
7443 n->bar.cmbloc = 0;
7444 }
7445
7446 return;
7447 case NVME_REG_CMBMSC + 4:
7448 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7449 return;
7450
7451 case NVME_REG_PMRCAP:
7452 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7453 "invalid write to PMRCAP register, ignored");
7454 return;
7455 case NVME_REG_PMRCTL:
7456 if (!NVME_CAP_PMRS(cap)) {
7457 return;
7458 }
7459
7460 stl_le_p(&n->bar.pmrctl, data);
7461 if (NVME_PMRCTL_EN(data)) {
7462 memory_region_set_enabled(&n->pmr.dev->mr, true);
7463 pmrsts = 0;
7464 } else {
7465 memory_region_set_enabled(&n->pmr.dev->mr, false);
7466 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7467 n->pmr.cmse = false;
7468 }
7469 stl_le_p(&n->bar.pmrsts, pmrsts);
7470 return;
7471 case NVME_REG_PMRSTS:
7472 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7473 "invalid write to PMRSTS register, ignored");
7474 return;
7475 case NVME_REG_PMREBS:
7476 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7477 "invalid write to PMREBS register, ignored");
7478 return;
7479 case NVME_REG_PMRSWTP:
7480 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7481 "invalid write to PMRSWTP register, ignored");
7482 return;
7483 case NVME_REG_PMRMSCL:
7484 if (!NVME_CAP_PMRS(cap)) {
7485 return;
7486 }
7487
7488 stl_le_p(&n->bar.pmrmscl, data);
7489 n->pmr.cmse = false;
7490
7491 if (NVME_PMRMSCL_CMSE(data)) {
7492 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
7493 hwaddr cba = pmrmscu << 32 |
7494 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
7495 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
7496 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
7497 stl_le_p(&n->bar.pmrsts, pmrsts);
7498 return;
7499 }
7500
7501 n->pmr.cmse = true;
7502 n->pmr.cba = cba;
7503 }
7504
7505 return;
7506 case NVME_REG_PMRMSCU:
7507 if (!NVME_CAP_PMRS(cap)) {
7508 return;
7509 }
7510
7511 stl_le_p(&n->bar.pmrmscu, data);
7512 return;
7513 default:
7514 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
7515 "invalid MMIO write,"
7516 " offset=0x%"PRIx64", data=%"PRIx64"",
7517 offset, data);
7518 break;
7519 }
7520}
7521
7522static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
7523{
7524 NvmeCtrl *n = (NvmeCtrl *)opaque;
7525 uint8_t *ptr = (uint8_t *)&n->bar;
7526
7527 trace_pci_nvme_mmio_read(addr, size);
7528
7529 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
7530 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
7531 "MMIO read not 32-bit aligned,"
7532 " offset=0x%"PRIx64"", addr);
        /* should RAZ, fall through for now */
7534 } else if (unlikely(size < sizeof(uint32_t))) {
7535 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
7536 "MMIO read smaller than 32-bits,"
7537 " offset=0x%"PRIx64"", addr);
        /* should RAZ, fall through for now */
7539 }
7540
7541 if (addr > sizeof(n->bar) - size) {
7542 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
7543 "MMIO read beyond last register,"
7544 " offset=0x%"PRIx64", returning 0", addr);
7545
7546 return 0;
7547 }
7548
7549 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7550 addr != NVME_REG_CSTS) {
7551 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7552 return 0;
7553 }
7554
    /*
     * When PMRWBM bit 1 is set, a read of PMRSTS must ensure that prior
     * writes to the persistent memory region have reached persistent media,
     * so sync the backing memory before completing the read.
     */
7560 if (addr == NVME_REG_PMRSTS &&
7561 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
7562 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7563 }
7564
7565 return ldn_le_p(ptr + addr, size);
7566}
7567
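/*
 * Doorbell writes land at BAR0 offset 0x1000 and up. With a doorbell
 * stride of 4 bytes, the SQ tail doorbell for queue qid sits at
 * 0x1000 + (qid << 3) and the CQ head doorbell at 0x1000 + (qid << 3) + 4.
 */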
7568static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
7569{
7570 PCIDevice *pci = PCI_DEVICE(n);
7571 uint32_t qid;
7572
7573 if (unlikely(addr & ((1 << 2) - 1))) {
7574 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
7575 "doorbell write not 32-bit aligned,"
7576 " offset=0x%"PRIx64", ignoring", addr);
7577 return;
7578 }
7579
7580 if (((addr - 0x1000) >> 2) & 1) {
        /* completion queue doorbell write */

7583 uint16_t new_head = val & 0xffff;
7584 int start_sqs;
7585 NvmeCQueue *cq;
7586
7587 qid = (addr - (0x1000 + (1 << 2))) >> 3;
7588 if (unlikely(nvme_check_cqid(n, qid))) {
7589 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
7590 "completion queue doorbell write"
7591 " for nonexistent queue,"
7592 " sqid=%"PRIu32", ignoring", qid);
            /*
             * NVM Express v1.3d, Section 4.1: if host software writes an
             * invalid value to the Submission Queue Tail Doorbell or
             * Completion Queue Head Doorbell register and an Asynchronous
             * Event Request command is outstanding, an asynchronous event is
             * posted to the Admin Completion Queue with a status code of
             * Invalid Doorbell Write Value.
             *
             * The spec also defines an "Invalid Doorbell Register" status
             * code, but does not say when to use it; it is used here, in the
             * same fashion, for writes to doorbells of nonexistent queues.
             */
7607 if (n->outstanding_aers) {
7608 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7609 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
7610 NVME_LOG_ERROR_INFO);
7611 }
7612
7613 return;
7614 }
7615
7616 cq = n->cq[qid];
7617 if (unlikely(new_head >= cq->size)) {
7618 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
7619 "completion queue doorbell write value"
7620 " beyond queue size, sqid=%"PRIu32","
7621 " new_head=%"PRIu16", ignoring",
7622 qid, new_head);
7623
7624 if (n->outstanding_aers) {
7625 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7626 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
7627 NVME_LOG_ERROR_INFO);
7628 }
7629
7630 return;
7631 }
7632
7633 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
7634
7635 start_sqs = nvme_cq_full(cq) ? 1 : 0;
7636 cq->head = new_head;
7637 if (!qid && n->dbbuf_enabled) {
7638 stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7639 }
7640 if (start_sqs) {
7641 NvmeSQueue *sq;
7642 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
7643 qemu_bh_schedule(sq->bh);
7644 }
7645 qemu_bh_schedule(cq->bh);
7646 }
7647
7648 if (cq->tail == cq->head) {
7649 if (cq->irq_enabled) {
7650 n->cq_pending--;
7651 }
7652
7653 nvme_irq_deassert(n, cq);
7654 }
7655 } else {
        /* submission queue doorbell write */

7658 uint16_t new_tail = val & 0xffff;
7659 NvmeSQueue *sq;
7660
7661 qid = (addr - 0x1000) >> 3;
7662 if (unlikely(nvme_check_sqid(n, qid))) {
7663 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
7664 "submission queue doorbell write"
7665 " for nonexistent queue,"
7666 " sqid=%"PRIu32", ignoring", qid);
7667
7668 if (n->outstanding_aers) {
7669 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7670 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
7671 NVME_LOG_ERROR_INFO);
7672 }
7673
7674 return;
7675 }
7676
7677 sq = n->sq[qid];
7678 if (unlikely(new_tail >= sq->size)) {
7679 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
7680 "submission queue doorbell write value"
7681 " beyond queue size, sqid=%"PRIu32","
7682 " new_tail=%"PRIu16", ignoring",
7683 qid, new_tail);
7684
7685 if (n->outstanding_aers) {
7686 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
7687 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
7688 NVME_LOG_ERROR_INFO);
7689 }
7690
7691 return;
7692 }
7693
7694 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
7695
7696 sq->tail = new_tail;
7697 if (!qid && n->dbbuf_enabled) {
            /*
             * The spec requires the host to also update the controller's
             * corresponding doorbell register to match the value written to
             * the shadow doorbell buffer entry.
             *
             * Since this context is a VM trap anyway, enforce that
             * requirement from the device side in case the host is
             * misbehaving: various drivers, including some that run on
             * Linux, do not keep the shadow entry for the admin queue up to
             * date, so it cannot be trusted for an appropriate sq tail and
             * is instead synced from this trapped doorbell write.
             */
7711 stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7712 }
7713
7714 qemu_bh_schedule(sq->bh);
7715 }
7716}
7717
7718static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
7719 unsigned size)
7720{
7721 NvmeCtrl *n = (NvmeCtrl *)opaque;
7722
7723 trace_pci_nvme_mmio_write(addr, data, size);
7724
7725 if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7726 addr != NVME_REG_CSTS) {
7727 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7728 return;
7729 }
7730
7731 if (addr < sizeof(n->bar)) {
7732 nvme_write_bar(n, addr, data, size);
7733 } else {
7734 nvme_process_db(n, addr, data);
7735 }
7736}
7737
7738static const MemoryRegionOps nvme_mmio_ops = {
7739 .read = nvme_mmio_read,
7740 .write = nvme_mmio_write,
7741 .endianness = DEVICE_LITTLE_ENDIAN,
7742 .impl = {
7743 .min_access_size = 2,
7744 .max_access_size = 8,
7745 },
7746};
7747
7748static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
7749 unsigned size)
7750{
7751 NvmeCtrl *n = (NvmeCtrl *)opaque;
7752 stn_le_p(&n->cmb.buf[addr], size, data);
7753}
7754
7755static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
7756{
7757 NvmeCtrl *n = (NvmeCtrl *)opaque;
7758 return ldn_le_p(&n->cmb.buf[addr], size);
7759}
7760
7761static const MemoryRegionOps nvme_cmb_ops = {
7762 .read = nvme_cmb_read,
7763 .write = nvme_cmb_write,
7764 .endianness = DEVICE_LITTLE_ENDIAN,
7765 .impl = {
7766 .min_access_size = 1,
7767 .max_access_size = 8,
7768 },
7769};
7770
7771static bool nvme_check_params(NvmeCtrl *n, Error **errp)
7772{
7773 NvmeParams *params = &n->params;
7774
7775 if (params->num_queues) {
7776 warn_report("num_queues is deprecated; please use max_ioqpairs "
7777 "instead");
7778
7779 params->max_ioqpairs = params->num_queues - 1;
7780 }
7781
7782 if (n->namespace.blkconf.blk && n->subsys) {
7783 error_setg(errp, "subsystem support is unavailable with legacy "
7784 "namespace ('drive' property)");
7785 return false;
7786 }
7787
7788 if (params->max_ioqpairs < 1 ||
7789 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
7790 error_setg(errp, "max_ioqpairs must be between 1 and %d",
7791 NVME_MAX_IOQPAIRS);
7792 return false;
7793 }
7794
7795 if (params->msix_qsize < 1 ||
7796 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
7797 error_setg(errp, "msix_qsize must be between 1 and %d",
7798 PCI_MSIX_FLAGS_QSIZE + 1);
7799 return false;
7800 }
7801
7802 if (!params->serial) {
7803 error_setg(errp, "serial property not set");
7804 return false;
7805 }
7806
7807 if (n->pmr.dev) {
7808 if (host_memory_backend_is_mapped(n->pmr.dev)) {
7809 error_setg(errp, "can't use already busy memdev: %s",
7810 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
7811 return false;
7812 }
7813
7814 if (!is_power_of_2(n->pmr.dev->size)) {
            error_setg(errp, "pmr backend size must be a power of 2");
7816 return false;
7817 }
7818
7819 host_memory_backend_set_mapped(n->pmr.dev, true);
7820 }
7821
7822 if (n->params.zasl > n->params.mdts) {
7823 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
7824 "than or equal to mdts (Maximum Data Transfer Size)");
7825 return false;
7826 }
7827
7828 if (!n->params.vsl) {
7829 error_setg(errp, "vsl must be non-zero");
7830 return false;
7831 }
7832
7833 if (params->sriov_max_vfs) {
7834 if (!n->subsys) {
7835 error_setg(errp, "subsystem is required for the use of SR-IOV");
7836 return false;
7837 }
7838
7839 if (params->sriov_max_vfs > NVME_MAX_VFS) {
7840 error_setg(errp, "sriov_max_vfs must be between 0 and %d",
7841 NVME_MAX_VFS);
7842 return false;
7843 }
7844
7845 if (params->cmb_size_mb) {
7846 error_setg(errp, "CMB is not supported with SR-IOV");
7847 return false;
7848 }
7849
7850 if (n->pmr.dev) {
7851 error_setg(errp, "PMR is not supported with SR-IOV");
7852 return false;
7853 }
7854
7855 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
7856 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
7857 " must be set for the use of SR-IOV");
7858 return false;
7859 }
7860
7861 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
7862 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
7863 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
7864 return false;
7865 }
7866
7867 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
7868 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
7869 " greater than or equal to 2");
7870 return false;
7871 }
7872
7873 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
7874 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
7875 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
7876 return false;
7877 }
7878
7879 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
7880 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
7881 " greater than or equal to 1");
7882 return false;
7883 }
7884
7885 if (params->sriov_max_vi_per_vf &&
7886 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
7887 error_setg(errp, "sriov_max_vi_per_vf must meet:"
7888 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
7889 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
7890 return false;
7891 }
7892
7893 if (params->sriov_max_vq_per_vf &&
7894 (params->sriov_max_vq_per_vf < 2 ||
7895 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
7896 error_setg(errp, "sriov_max_vq_per_vf must meet:"
7897 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
7898 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
7899 return false;
7900 }
7901 }
7902
7903 return true;
7904}
7905
7906static void nvme_init_state(NvmeCtrl *n)
7907{
7908 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7909 NvmeSecCtrlList *list = &n->sec_ctrl_list;
7910 NvmeSecCtrlEntry *sctrl;
7911 PCIDevice *pci = PCI_DEVICE(n);
7912 uint8_t max_vfs;
7913 int i;
7914
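    /*
     * A VF takes its limits from the secondary controller entry that the
     * PF keeps for it; the PF uses the configured parameters and exposes
     * one secondary controller entry per supported VF.
     */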
7915 if (pci_is_vf(pci)) {
7916 sctrl = nvme_sctrl(n);
7917 max_vfs = 0;
7918 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7919 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7920 } else {
7921 max_vfs = n->params.sriov_max_vfs;
7922 n->conf_ioqpairs = n->params.max_ioqpairs;
7923 n->conf_msix_qsize = n->params.msix_qsize;
7924 }
7925
7926 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
7927 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
7928 n->temperature = NVME_TEMPERATURE;
7929 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
7930 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
7931 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
7932 QTAILQ_INIT(&n->aer_queue);
7933
7934 list->numcntl = cpu_to_le16(max_vfs);
7935 for (i = 0; i < max_vfs; i++) {
7936 sctrl = &list->sec[i];
7937 sctrl->pcid = cpu_to_le16(n->cntlid);
7938 sctrl->vfn = cpu_to_le16(i + 1);
7939 }
7940
7941 cap->cntlid = cpu_to_le16(n->cntlid);
7942 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
7943
7944 if (pci_is_vf(pci)) {
7945 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
7946 } else {
7947 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
7948 n->params.sriov_vq_flexible);
7949 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
7950 cap->vqrfap = cap->vqfrt;
7951 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
7952 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
7953 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
7954 cap->vqfrt / MAX(max_vfs, 1);
7955 }
7956
7957 if (pci_is_vf(pci)) {
7958 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
7959 } else {
7960 cap->viprt = cpu_to_le16(n->params.msix_qsize -
7961 n->params.sriov_vi_flexible);
7962 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
7963 cap->virfap = cap->vifrt;
7964 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
7965 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
7966 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
7967 cap->vifrt / MAX(max_vfs, 1);
7968 }
7969}
7970
7971static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
7972{
7973 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
7974 uint64_t cap = ldq_le_p(&n->bar.cap);
7975
7976 n->cmb.buf = g_malloc0(cmb_size);
7977 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
7978 "nvme-cmb", cmb_size);
7979 pci_register_bar(pci_dev, NVME_CMB_BIR,
7980 PCI_BASE_ADDRESS_SPACE_MEMORY |
7981 PCI_BASE_ADDRESS_MEM_TYPE_64 |
7982 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
7983
7984 NVME_CAP_SET_CMBS(cap, 1);
7985 stq_le_p(&n->bar.cap, cap);
7986
7987 if (n->params.legacy_cmb) {
7988 nvme_cmb_enable_regs(n);
7989 n->cmb.cmse = true;
7990 }
7991}
7992
7993static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
7994{
7995 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
7996
7997 NVME_PMRCAP_SET_RDS(pmrcap, 1);
7998 NVME_PMRCAP_SET_WDS(pmrcap, 1);
7999 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8000
8001 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8002 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8003 stl_le_p(&n->bar.pmrcap, pmrcap);
8004
8005 pci_register_bar(pci_dev, NVME_PMR_BIR,
8006 PCI_BASE_ADDRESS_SPACE_MEMORY |
8007 PCI_BASE_ADDRESS_MEM_TYPE_64 |
8008 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8009
8010 memory_region_set_enabled(&n->pmr.dev->mr, false);
8011}
8012
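/*
 * Compute the BAR size needed for the register set (NvmeBar plus two
 * doorbells per queue) followed by the MSI-X table and PBA, each aligned to
 * 4 KiB, with the result rounded up to the power of two a BAR requires.
 */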
8013static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs,
8014 unsigned *msix_table_offset,
8015 unsigned *msix_pba_offset)
8016{
8017 uint64_t bar_size, msix_table_size, msix_pba_size;
8018
8019 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8020 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8021
8022 if (msix_table_offset) {
8023 *msix_table_offset = bar_size;
8024 }
8025
8026 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8027 bar_size += msix_table_size;
8028 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8029
8030 if (msix_pba_offset) {
8031 *msix_pba_offset = bar_size;
8032 }
8033
8034 msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8;
8035 bar_size += msix_pba_size;
8036
8037 bar_size = pow2ceil(bar_size);
8038 return bar_size;
8039}
8040
8041static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
8042{
8043 uint16_t vf_dev_id = n->params.use_intel_id ?
8044 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8045 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8046 uint64_t bar_size = nvme_bar_size(le16_to_cpu(cap->vqfrsm),
8047 le16_to_cpu(cap->vifrsm),
8048 NULL, NULL);
8049
8050 pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8051 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8052 NVME_VF_OFFSET, NVME_VF_STRIDE);
8053
8054 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8055 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8056}
8057
8058static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8059{
8060 Error *err = NULL;
8061 int ret;
8062
8063 ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
8064 PCI_PM_SIZEOF, &err);
8065 if (err) {
8066 error_report_err(err);
8067 return ret;
8068 }
8069
8070 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8071 PCI_PM_CAP_VER_1_2);
8072 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8073 PCI_PM_CTRL_NO_SOFT_RESET);
8074 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8075 PCI_PM_CTRL_STATE_MASK);
8076
8077 return 0;
8078}
8079
8080static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8081{
8082 ERRP_GUARD();
8083 uint8_t *pci_conf = pci_dev->config;
8084 uint64_t bar_size;
8085 unsigned msix_table_offset, msix_pba_offset;
8086 int ret;
8087
8088 pci_conf[PCI_INTERRUPT_PIN] = 1;
8089 pci_config_set_prog_interface(pci_conf, 0x2);
8090
8091 if (n->params.use_intel_id) {
8092 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8093 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8094 } else {
8095 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8096 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8097 }
8098
8099 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8100 nvme_add_pm_capability(pci_dev, 0x60);
8101 pcie_endpoint_cap_init(pci_dev, 0x80);
8102 pcie_cap_flr_init(pci_dev);
8103 if (n->params.sriov_max_vfs) {
8104 pcie_ari_init(pci_dev, 0x100);
8105 }
8106
    /* add one to max_ioqpairs to account for the admin queue pair */
8108 bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize,
8109 &msix_table_offset, &msix_pba_offset);
8110
8111 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8112 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8113 msix_table_offset);
8114 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8115
8116 if (pci_is_vf(pci_dev)) {
8117 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8118 } else {
8119 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8120 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8121 }
8122 ret = msix_init(pci_dev, n->params.msix_qsize,
8123 &n->bar0, 0, msix_table_offset,
8124 &n->bar0, 0, msix_pba_offset, 0, errp);
8125 if (ret == -ENOTSUP) {
        /* report that msix is not supported, but do not error out */
8127 warn_report_err(*errp);
8128 *errp = NULL;
8129 } else if (ret < 0) {
        /* propagate the error to the caller */
8131 return false;
8132 }
8133
8134 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8135
8136 if (n->params.cmb_size_mb) {
8137 nvme_init_cmb(n, pci_dev);
8138 }
8139
8140 if (n->pmr.dev) {
8141 nvme_init_pmr(n, pci_dev);
8142 }
8143
8144 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8145 nvme_init_sriov(n, pci_dev, 0x120);
8146 }
8147
8148 return true;
8149}
8150
8151static void nvme_init_subnqn(NvmeCtrl *n)
8152{
8153 NvmeSubsystem *subsys = n->subsys;
8154 NvmeIdCtrl *id = &n->id_ctrl;
8155
8156 if (!subsys) {
8157 snprintf((char *)id->subnqn, sizeof(id->subnqn),
8158 "nqn.2019-08.org.qemu:%s", n->params.serial);
8159 } else {
        pstrcpy((char *)id->subnqn, sizeof(id->subnqn),
                (char *)subsys->subnqn);
8161 }
8162}
8163
8164static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8165{
8166 NvmeIdCtrl *id = &n->id_ctrl;
8167 uint8_t *pci_conf = pci_dev->config;
8168 uint64_t cap = ldq_le_p(&n->bar.cap);
8169 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8170 uint32_t ctratt;
8171
8172 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8173 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8174 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8175 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8176 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8177
8178 id->cntlid = cpu_to_le16(n->cntlid);
8179
8180 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8181 ctratt = NVME_CTRATT_ELBAS;
8182
8183 id->rab = 6;
8184
8185 if (n->params.use_intel_id) {
8186 id->ieee[0] = 0xb3;
8187 id->ieee[1] = 0x02;
8188 id->ieee[2] = 0x00;
8189 } else {
8190 id->ieee[0] = 0x00;
8191 id->ieee[1] = 0x54;
8192 id->ieee[2] = 0x52;
8193 }
8194
8195 id->mdts = n->params.mdts;
8196 id->ver = cpu_to_le32(NVME_SPEC_VER);
8197 id->oacs =
8198 cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
8199 NVME_OACS_DIRECTIVES);
8200 id->cntrltype = 0x1;
8201
    /*
     * ACL is a 0's based value, so 3 advertises support for up to four
     * concurrently outstanding Abort commands; AERL below is likewise 0's
     * based.
     */
8213 id->acl = 3;
8214 id->aerl = n->params.aerl;
8215 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8216 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8217
    /* recommended default value (~70 degrees Celsius) */
8219 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8220 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8221
8222 id->sqes = (NVME_SQES << 4) | NVME_SQES;
8223 id->cqes = (NVME_CQES << 4) | NVME_CQES;
8224 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8225 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8226 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8227 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
8228
    /*
     * Advertise a present volatile write cache and support for the
     * broadcast NSID in the Flush command.
     */
8236 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8237
8238 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1);
8239 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
8240
8241 nvme_init_subnqn(n);
8242
8243 id->psd[0].mp = cpu_to_le16(0x9c4);
8244 id->psd[0].enlat = cpu_to_le32(0x10);
8245 id->psd[0].exlat = cpu_to_le32(0x4);
8246
8247 if (n->subsys) {
8248 id->cmic |= NVME_CMIC_MULTI_CTRL;
8249 ctratt |= NVME_CTRATT_ENDGRPS;
8250
8251 id->endgidmax = cpu_to_le16(0x1);
8252
8253 if (n->subsys->endgrp.fdp.enabled) {
8254 ctratt |= NVME_CTRATT_FDPS;
8255 }
8256 }
8257
8258 id->ctratt = cpu_to_le32(ctratt);
8259
8260 NVME_CAP_SET_MQES(cap, 0x7ff);
8261 NVME_CAP_SET_CQR(cap, 1);
8262 NVME_CAP_SET_TO(cap, 0xf);
8263 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
8264 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
8265 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
8266 NVME_CAP_SET_MPSMAX(cap, 4);
8267 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8268 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8269 stq_le_p(&n->bar.cap, cap);
8270
8271 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8272 n->bar.intmc = n->bar.intms = 0;
8273
8274 if (pci_is_vf(pci_dev) && !sctrl->scs) {
8275 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8276 }
8277}
8278
8279static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8280{
8281 int cntlid;
8282
8283 if (!n->subsys) {
8284 return 0;
8285 }
8286
8287 cntlid = nvme_subsys_register_ctrl(n, errp);
8288 if (cntlid < 0) {
8289 return -1;
8290 }
8291
8292 n->cntlid = cntlid;
8293
8294 return 0;
8295}
8296
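/*
 * Attach a namespace to the controller and fold its per-namespace limit
 * into dmrsl, the controller's size limit for a single Dataset Management
 * range.
 */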
8297void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8298{
8299 uint32_t nsid = ns->params.nsid;
8300 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8301
8302 n->namespaces[nsid] = ns;
8303 ns->attached++;
8304
8305 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
8306 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
8307}
8308
8309static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8310{
8311 NvmeCtrl *n = NVME(pci_dev);
8312 DeviceState *dev = DEVICE(pci_dev);
8313 NvmeNamespace *ns;
8314 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8315
8316 if (pci_is_vf(pci_dev)) {
        /*
         * VFs derive all their settings from the parent PF. The PF outlives
         * its VFs, so sharing the params (including the serial string) is
         * safe.
         */
8321 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8322 n->subsys = pn->subsys;
8323 }
8324
8325 if (!nvme_check_params(n, errp)) {
8326 return;
8327 }
8328
8329 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8330
8331 if (nvme_init_subsys(n, errp)) {
8332 return;
8333 }
8334 nvme_init_state(n);
8335 if (!nvme_init_pci(n, pci_dev, errp)) {
8336 return;
8337 }
8338 nvme_init_ctrl(n, pci_dev);
8339
    /* set up a namespace if the legacy 'drive' property was given */
8341 if (n->namespace.blkconf.blk) {
8342 ns = &n->namespace;
8343 ns->params.nsid = 1;
8344
8345 if (nvme_ns_setup(ns, errp)) {
8346 return;
8347 }
8348
8349 nvme_attach_ns(n, ns);
8350 }
8351}
8352
8353static void nvme_exit(PCIDevice *pci_dev)
8354{
8355 NvmeCtrl *n = NVME(pci_dev);
8356 NvmeNamespace *ns;
8357 int i;
8358
8359 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8360
8361 if (n->subsys) {
8362 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
8363 ns = nvme_ns(n, i);
8364 if (ns) {
8365 ns->attached--;
8366 }
8367 }
8368
8369 nvme_subsys_unregister_ctrl(n->subsys, n);
8370 }
8371
8372 g_free(n->cq);
8373 g_free(n->sq);
8374 g_free(n->aer_reqs);
8375
8376 if (n->params.cmb_size_mb) {
8377 g_free(n->cmb.buf);
8378 }
8379
8380 if (n->pmr.dev) {
8381 host_memory_backend_set_mapped(n->pmr.dev, false);
8382 }
8383
8384 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8385 pcie_sriov_pf_exit(pci_dev);
8386 }
8387
8388 msix_uninit(pci_dev, &n->bar0, &n->bar0);
8389 memory_region_del_subregion(&n->bar0, &n->iomem);
8390}
8391
8392static Property nvme_props[] = {
8393 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
8394 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
8395 HostMemoryBackend *),
8396 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
8397 NvmeSubsystem *),
8398 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
8399 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
8400 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
8401 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
8402 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
8403 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
8404 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
8405 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
8406 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
8407 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
8408 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
8409 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
8410 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
8411 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
8412 params.auto_transition_zones, true),
8413 DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
8414 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
8415 params.sriov_vq_flexible, 0),
8416 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
8417 params.sriov_vi_flexible, 0),
8418 DEFINE_PROP_UINT8("sriov_max_vi_per_vf", NvmeCtrl,
8419 params.sriov_max_vi_per_vf, 0),
8420 DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
8421 params.sriov_max_vq_per_vf, 0),
8422 DEFINE_PROP_END_OF_LIST(),
8423};
8424
8425static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
8426 void *opaque, Error **errp)
8427{
8428 NvmeCtrl *n = NVME(obj);
8429 uint8_t value = n->smart_critical_warning;
8430
8431 visit_type_uint8(v, name, &value, errp);
8432}
8433
8434static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
8435 void *opaque, Error **errp)
8436{
8437 NvmeCtrl *n = NVME(obj);
8438 uint8_t value, old_value, cap = 0, index, event;
8439
8440 if (!visit_type_uint8(v, name, &value, errp)) {
8441 return;
8442 }
8443
8444 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
8445 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
8446 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
8447 cap |= NVME_SMART_PMR_UNRELIABLE;
8448 }
8449
8450 if ((value & cap) != value) {
8451 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
8452 value & ~cap);
8453 return;
8454 }
8455
8456 old_value = n->smart_critical_warning;
8457 n->smart_critical_warning = value;
8458
    /* only inject new bits of the smart critical warning */
8460 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
8461 event = 1 << index;
        if (value & ~old_value & event) {
            nvme_smart_event(n, event);
        }
8464 }
8465}
8466
8467static void nvme_pci_reset(DeviceState *qdev)
8468{
8469 PCIDevice *pci_dev = PCI_DEVICE(qdev);
8470 NvmeCtrl *n = NVME(pci_dev);
8471
8472 trace_pci_nvme_pci_reset();
8473 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8474}
8475
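/*
 * Intercept writes to the SR-IOV control register: when VF Enable is
 * cleared, offline every secondary controller so its flexible resources
 * return to the free pool.
 */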
8476static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address,
8477 uint32_t val, int len)
8478{
8479 NvmeCtrl *n = NVME(dev);
8480 NvmeSecCtrlEntry *sctrl;
8481 uint16_t sriov_cap = dev->exp.sriov_cap;
8482 uint32_t off = address - sriov_cap;
8483 int i, num_vfs;
8484
8485 if (!sriov_cap) {
8486 return;
8487 }
8488
8489 if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
8490 if (!(val & PCI_SRIOV_CTRL_VFE)) {
8491 num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
8492 for (i = 0; i < num_vfs; i++) {
8493 sctrl = &n->sec_ctrl_list.sec[i];
8494 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
8495 }
8496 }
8497 }
8498}
8499
8500static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
8501 uint32_t val, int len)
8502{
8503 nvme_sriov_pre_write_ctrl(dev, address, val, len);
8504 pci_default_write_config(dev, address, val, len);
8505 pcie_cap_flr_write_config(dev, address, val, len);
8506}
8507
8508static const VMStateDescription nvme_vmstate = {
8509 .name = "nvme",
8510 .unmigratable = 1,
8511};
8512
8513static void nvme_class_init(ObjectClass *oc, void *data)
8514{
8515 DeviceClass *dc = DEVICE_CLASS(oc);
8516 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
8517
8518 pc->realize = nvme_realize;
8519 pc->config_write = nvme_pci_write_config;
8520 pc->exit = nvme_exit;
8521 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
8522 pc->revision = 2;
8523
8524 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
8525 dc->desc = "Non-Volatile Memory Express";
8526 device_class_set_props(dc, nvme_props);
8527 dc->vmsd = &nvme_vmstate;
8528 dc->reset = nvme_pci_reset;
8529}
8530
8531static void nvme_instance_init(Object *obj)
8532{
8533 NvmeCtrl *n = NVME(obj);
8534
8535 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
8536 "bootindex", "/namespace@1,0",
8537 DEVICE(obj));
8538
8539 object_property_add(obj, "smart_critical_warning", "uint8",
8540 nvme_get_smart_warning,
8541 nvme_set_smart_warning, NULL, NULL);
8542}
8543
8544static const TypeInfo nvme_info = {
8545 .name = TYPE_NVME,
8546 .parent = TYPE_PCI_DEVICE,
8547 .instance_size = sizeof(NvmeCtrl),
8548 .instance_init = nvme_instance_init,
8549 .class_init = nvme_class_init,
8550 .interfaces = (InterfaceInfo[]) {
8551 { INTERFACE_PCIE_DEVICE },
8552 { }
8553 },
8554};
8555
8556static const TypeInfo nvme_bus_info = {
8557 .name = TYPE_NVME_BUS,
8558 .parent = TYPE_BUS,
8559 .instance_size = sizeof(NvmeBus),
8560};
8561
8562static void nvme_register_types(void)
8563{
8564 type_register_static(&nvme_info);
8565 type_register_static(&nvme_bus_info);
8566}
8567
8568type_init(nvme_register_types)
8569