/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
#include "migration/vmstate.h"

#include "nvme.h"
#include "dif.h"
#include "trace.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)
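
/*
 * For illustration (event name only an example): a call such as
 *
 *     NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
 *                    "number of mappings exceed %d", IOV_MAX);
 *
 * emits the trace event and a LOG_GUEST_ERROR log line tagged with the
 * calling function, so guest-triggerable misbehavior is visible both to
 * tracing and to the error log.
 */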

static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION] = true,
    [NVME_POWER_MANAGEMENT] = true,
    [NVME_TEMPERATURE_THRESHOLD] = true,
    [NVME_ERROR_RECOVERY] = true,
    [NVME_VOLATILE_WRITE_CACHE] = true,
    [NVME_NUMBER_OF_QUEUES] = true,
    [NVME_INTERRUPT_COALESCING] = true,
    [NVME_INTERRUPT_VECTOR_CONF] = true,
    [NVME_WRITE_ATOMICITY] = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
    [NVME_TIMESTAMP] = true,
    [NVME_HOST_BEHAVIOR_SUPPORT] = true,
    [NVME_COMMAND_SET_PROFILE] = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
    [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
    [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
};

static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
};

static void nvme_process_sq(void *opaque);

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);
}

static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
{
    if (QTAILQ_IN_USE(zone, entry)) {
        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_CLOSED:
            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_FULL:
            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
            /* fall through */
        default:
            ;
        }
    }

    nvme_set_zone_state(zone, state);

    switch (state) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_CLOSED:
        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_FULL:
        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
        /* fall through */
    case NVME_ZONE_STATE_READ_ONLY:
        break;
    default:
        zone->d.za = 0;
    }
}

static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
                                         uint32_t opn, uint32_t zrwa)
{
    if (ns->params.max_active_zones != 0 &&
        ns->nr_active_zones + act > ns->params.max_active_zones) {
        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    }

    if (ns->params.max_open_zones != 0 &&
        ns->nr_open_zones + opn > ns->params.max_open_zones) {
        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    }

    if (zrwa > ns->zns.numzrwa) {
        return NVME_NOZRWA | NVME_DNR;
    }

    return NVME_SUCCESS;
}
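
/*
 * Resource accounting sketch (values hypothetical): "active" zones are the
 * implicitly open, explicitly open and closed zones; "open" zones are the
 * subset in either open state. With max_active_zones = 4 and
 * max_open_zones = 2, opening a third zone while two are open fails with
 * NVME_ZONE_TOO_MANY_OPEN, while merely activating another (e.g. closing a
 * partially written zone) only fails once four zones are already active.
 */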

static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
{
    return nvme_zns_check_resources(ns, act, opn, 0);
}

static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    if (!n->cmb.cmse) {
        return false;
    }

    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    hi = lo + int128_get64(n->cmb.mem.size);

    return addr >= lo && addr < hi;
}

static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    return &n->cmb.buf[addr - base];
}

static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi;

    if (!n->pmr.cmse) {
        return false;
    }

    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);

    return addr >= n->pmr.cba && addr < hi;
}

static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
{
    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}

static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    /*
     * The purpose of this check is to guard against invalid "local" access
     * to the iomem (i.e. the controller registers), so checking against the
     * range covered by the BAR 0 memory region is sufficient.
     */
    lo = n->bar0.addr;
    hi = lo + int128_get64(n->bar0.size);

    return addr >= lo && addr < hi;
}

static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
        return 0;
    }

    return pci_dma_read(&n->parent_obj, addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(&n->parent_obj, addr, buf, size);
}

static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid &&
        (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}
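
/*
 * Phase tag example (ignoring flow control): for a CQ of size 4 with an
 * initial phase of 1, the first four completions are written at tails
 * 0..3 with P=1; the tail then wraps to 0 and the phase flips, so the
 * fifth entry is written with P=0. The host detects new entries purely by
 * observing the phase bit change, without reading a tail register.
 */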

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_irq_check(NvmeCtrl *n)
{
    uint32_t intms = ldl_le_p(&n->bar.intms);

    if (msix_enabled(&(n->parent_obj))) {
        return;
    }
    if (~intms & n->irq_status) {
        pci_irq_assert(&n->parent_obj);
    } else {
        pci_irq_deassert(&n->parent_obj);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            trace_pci_nvme_irq_msix(cq->vector);
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            trace_pci_nvme_irq_pin();
            assert(cq->vector < 32);
            n->irq_status |= 1 << cq->vector;
            nvme_irq_check(n);
        }
    } else {
        trace_pci_nvme_irq_masked();
    }
}

static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            return;
        } else {
            assert(cq->vector < 32);
            if (!n->cq_pending) {
                n->irq_status &= ~(1 << cq->vector);
            }
            nvme_irq_check(n);
        }
    }
}

static void nvme_req_clear(NvmeRequest *req)
{
    req->ns = NULL;
    req->opaque = NULL;
    req->aiocb = NULL;
    memset(&req->cqe, 0x0, sizeof(req->cqe));
    req->status = NVME_SUCCESS;
}

static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
    if (dma) {
        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
        sg->flags = NVME_SG_DMA;
    } else {
        qemu_iovec_init(&sg->iov, 0);
    }

    sg->flags |= NVME_SG_ALLOC;
}

static inline void nvme_sg_unmap(NvmeSg *sg)
{
    if (!(sg->flags & NVME_SG_ALLOC)) {
        return;
    }

    if (sg->flags & NVME_SG_DMA) {
        qemu_sglist_destroy(&sg->qsg);
    } else {
        qemu_iovec_destroy(&sg->iov);
    }

    memset(sg, 0x0, sizeof(*sg));
}

/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
{
    NvmeSg *dst = data;
    uint32_t trans_len, count = ns->lbasz;
    uint64_t offset = 0;
    bool dma = sg->flags & NVME_SG_DMA;
    size_t sge_len;
    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    int sg_idx = 0;

    assert(sg->flags & NVME_SG_ALLOC);

    while (sg_len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        trans_len = MIN(sg_len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dst) {
            if (dma) {
                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
                                trans_len);
            } else {
                qemu_iovec_add(&dst->iov,
                               sg->iov.iov[sg_idx].iov_base + offset,
                               trans_len);
            }
        }

        sg_len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            dst = (dst == data) ? mdata : data;
            count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
        }
    }
}
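
/*
 * Worked example (values hypothetical): with lbasz = 512 and lbaf.ms = 8,
 * a two-block extended LBA transfer arrives as 512+8+512+8 bytes in `sg`.
 * The loop above emits 512-byte ranges into `data` and 8-byte ranges into
 * `mdata`, alternating via the `dst` swap each time `count` reaches zero,
 * independently of how the guest happened to split its SGEs.
 */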

static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr_cmb(addr, len);

    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
{
    bool cmb = false, pmr = false;

    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr(addr, len);

    if (nvme_addr_is_iomem(n, addr)) {
        return NVME_DATA_TRAS_ERROR;
    }

    if (nvme_addr_is_cmb(n, addr)) {
        cmb = true;
    } else if (nvme_addr_is_pmr(n, addr)) {
        pmr = true;
    }

    if (cmb || pmr) {
        if (sg->flags & NVME_SG_DMA) {
            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
        }

        if (sg->iov.niov + 1 > IOV_MAX) {
            goto max_mappings_exceeded;
        }

        if (cmb) {
            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
        } else {
            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
        }
    }

    if (!(sg->flags & NVME_SG_DMA)) {
        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    }

    if (sg->qsg.nsg + 1 > IOV_MAX) {
        goto max_mappings_exceeded;
    }

    qemu_sglist_add(&sg->qsg, addr, len);

    return NVME_SUCCESS;

max_mappings_exceeded:
    NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
                   "number of mappings exceed 1024");
    return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
}

static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
{
    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
}

static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
                             uint64_t prp2, uint32_t len)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;
    uint16_t status;
    int ret;

    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));

    status = nvme_map_addr(n, sg, prp1, trans_len);
    if (status) {
        goto unmap;
    }

    len -= trans_len;
    if (len) {
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset; the number of entries that fit in that first list
             * page is derived from this offset.
             */
            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
            if (ret) {
                trace_pci_nvme_err_addr_read(prp2);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == nents - 1 && len > n->page_size) {
                    if (unlikely(prp_ent & (n->page_size - 1))) {
                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    nents = MIN(nents, n->max_prp_ents);
                    prp_trans = nents * sizeof(uint64_t);
                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
                                         prp_trans);
                    if (ret) {
                        trace_pci_nvme_err_addr_read(prp_ent);
                        status = NVME_DATA_TRAS_ERROR;
                        goto unmap;
                    }
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (unlikely(prp_ent & (n->page_size - 1))) {
                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                status = nvme_map_addr(n, sg, prp_ent, trans_len);
                if (status) {
                    goto unmap;
                }

                len -= trans_len;
                i++;
            }
        } else {
            if (unlikely(prp2 & (n->page_size - 1))) {
                trace_pci_nvme_err_invalid_prp2_align(prp2);
                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                goto unmap;
            }
            status = nvme_map_addr(n, sg, prp2, len);
            if (status) {
                goto unmap;
            }
        }
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}
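
/*
 * PRP walk example (page_size = 4096, addresses hypothetical): a 12 KiB
 * transfer with PRP1 = 0x1000800 maps 2 KiB up to the page boundary; the
 * remaining 10 KiB exceeds one page, so PRP2 is treated as a pointer to a
 * PRP list whose entries each map one full, page-aligned page. Had 4 KiB
 * or less remained, PRP2 itself would have mapped it directly.
 */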

static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                  NvmeSglDescriptor *segment, uint64_t nsgld,
                                  size_t *len, NvmeCmd *cmd)
{
    dma_addr_t addr, trans_len;
    uint32_t dlen;
    uint16_t status;

    for (int i = 0; i < nsgld; i++) {
        uint8_t type = NVME_SGL_TYPE(segment[i].type);

        switch (type) {
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            if (cmd->opcode == NVME_CMD_WRITE) {
                continue;
            }
            /* fall through */
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
            break;
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
        default:
            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
        }

        dlen = le32_to_cpu(segment[i].len);

        if (!dlen) {
            continue;
        }

        if (*len == 0) {
            /*
             * All data has been mapped, but the SGL still contains
             * descriptors with a non-zero length. This is only allowed if
             * the controller advertises SGL Descriptor Excess Length
             * support in the SGLS field.
             */
            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
                break;
            }

            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        trans_len = MIN(*len, dlen);

        if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
            goto next;
        }

        addr = le64_to_cpu(segment[i].addr);

        if (UINT64_MAX - addr < dlen) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        status = nvme_map_addr(n, sg, addr, trans_len);
        if (status) {
            return status;
        }

next:
        *len -= trans_len;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
                             size_t len, NvmeCmd *cmd)
{
    /*
     * Read the segment in chunks of 256 descriptors (one 4 KiB page) to
     * avoid dynamically allocating a potentially huge SGL. The spec allows
     * the SGL to be larger than the command transfer size, so it is not
     * bounded by MDTS.
     */
    const int SEG_CHUNK_SIZE = 256;

    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
    uint64_t nsgld;
    uint32_t seg_len;
    uint16_t status;
    hwaddr addr;
    int ret;

    sgld = &sgl;
    addr = le64_to_cpu(sgl.addr);

    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));

    /*
     * If the entire transfer can be described with a single data block, it
     * can be mapped directly.
     */
    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
        if (status) {
            goto unmap;
        }

        goto out;
    }

    for (;;) {
        switch (NVME_SGL_TYPE(sgld->type)) {
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            break;
        default:
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        seg_len = le32_to_cpu(sgld->len);

        /* the length of a segment must be a non-zero multiple of 16 */
        if ((!seg_len || seg_len & 0xf) &&
            (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        if (UINT64_MAX - addr < seg_len) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        nsgld = seg_len / sizeof(NvmeSglDescriptor);

        while (nsgld > SEG_CHUNK_SIZE) {
            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
                trace_pci_nvme_err_addr_read(addr);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }

            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
                                       &len, cmd);
            if (status) {
                goto unmap;
            }

            nsgld -= SEG_CHUNK_SIZE;
            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
        }

        ret = nvme_addr_read(n, addr, segment,
                             nsgld * sizeof(NvmeSglDescriptor));
        if (ret) {
            trace_pci_nvme_err_addr_read(addr);
            status = NVME_DATA_TRAS_ERROR;
            goto unmap;
        }

        last_sgld = &segment[nsgld - 1];

        /*
         * If the last descriptor is a Data Block or Bit Bucket, the segment
         * chain ends here; map the remainder and stop.
         */
        switch (NVME_SGL_TYPE(last_sgld->type)) {
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
            if (status) {
                goto unmap;
            }

            goto out;

        default:
            break;
        }

        /*
         * The last descriptor points to the next segment, which is only
         * valid if the current segment is not a Last Segment.
         */
        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        sgld = last_sgld;
        addr = le64_to_cpu(sgld->addr);

        /*
         * Map the remaining descriptors of this segment; the last one has
         * already been consumed as the pointer to the next segment.
         */
        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
        if (status) {
            goto unmap;
        }
    }

out:
    /* if there is any residual length, the SGL was too short */
    if (len) {
        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        goto unmap;
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}
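
/*
 * Sizing example (counts hypothetical): one 4096-byte segment holds
 * 4096 / sizeof(NvmeSglDescriptor) = 256 descriptors, so a guest chaining
 * 1000 data blocks in a single segment is consumed in four reads by the
 * chunked loop above, bounding on-stack usage at 4 KiB per read.
 */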

uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd)
{
    uint64_t prp1, prp2;

    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
    case NVME_PSDT_PRP:
        prp1 = le64_to_cpu(cmd->dptr.prp1);
        prp2 = le64_to_cpu(cmd->dptr.prp2);

        return nvme_map_prp(n, sg, prp1, prp2, len);
    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
    case NVME_PSDT_SGL_MPTR_SGL:
        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
    default:
        return NVME_INVALID_FIELD;
    }
}

static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
{
    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    hwaddr mptr = le64_to_cpu(cmd->mptr);
    uint16_t status;

    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
        NvmeSglDescriptor sgl;

        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
            return NVME_DATA_TRAS_ERROR;
        }

        status = nvme_map_sgl(n, sg, sgl, len, cmd);
        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
        }

        return status;
    }

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
    status = nvme_map_addr(n, sg, mptr, len);
    if (status) {
        nvme_sg_unmap(sg);
    }

    return status;
}

static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
    size_t len = nvme_l2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns) &&
        !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
        NvmeSg sg;

        len += nvme_m2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, &req->sg, NULL);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
}
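
/*
 * Note: for extended LBA formats the data pointer covers data and metadata
 * interleaved, so the full extent is mapped and then split. The exception
 * is PRACT=1 with a metadata size exactly equal to the protection
 * information tuple size: the controller generates/strips the PI itself,
 * so the host buffer contains data only and no split is needed.
 */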

static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    size_t len = nvme_m2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_l2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, NULL, &req->sg);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
{
    hwaddr addr;
    uint32_t trans_len, count = bytes;
    bool dma = sg->flags & NVME_SG_DMA;
    int64_t sge_len;
    int sg_idx = 0;
    int ret;

    assert(sg->flags & NVME_SG_ALLOC);

    while (len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        if (sge_len - offset < 0) {
            offset -= sge_len;
            sg_idx++;
            continue;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
            continue;
        }

        trans_len = MIN(len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dma) {
            addr = sg->qsg.sg[sg_idx].base + offset;
        } else {
            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
        }

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            ret = nvme_addr_read(n, addr, ptr, trans_len);
        } else {
            ret = nvme_addr_write(n, addr, ptr, trans_len);
        }

        if (ret) {
            return NVME_DATA_TRAS_ERROR;
        }

        ptr += trans_len;
        len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            count = bytes;
            offset += skip_bytes;
        }
    }

    return NVME_SUCCESS;
}
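
/*
 * Caller sketch: nvme_bounce_data invokes this with (bytes = lbasz,
 * skip_bytes = lbaf.ms, offset = 0) to gather only the data portions of an
 * extended LBA stream, while nvme_bounce_mdata uses (bytes = lbaf.ms,
 * skip_bytes = lbasz, offset = lbasz) to gather only the metadata portions.
 */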

static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
                        NvmeTxDirection dir)
{
    assert(sg->flags & NVME_SG_ALLOC);

    if (sg->flags & NVME_SG_DMA) {
        const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
        dma_addr_t residual;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
        } else {
            dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
        }

        if (unlikely(residual)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    } else {
        size_t bytes;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
        } else {
            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
        }

        if (unlikely(bytes != len)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
}

static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
}

uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);

    if (nvme_ns_ext(ns) &&
        !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
                                   ns->lbaf.ms, 0, dir);
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
                                   ns->lbasz, ns->lbasz, dir);
    }

    nvme_sg_unmap(&req->sg);

    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                  cb, req);
    } else {
        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
                                  BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                   cb, req);
    } else {
        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;
    bool pending = cq->head != cq->tail;
    int ret;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
                            sizeof(req->cqe));
        if (ret) {
            trace_pci_nvme_err_addr_write(addr);
            trace_pci_nvme_err_cfs();
            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
            break;
        }
        QTAILQ_REMOVE(&cq->req_list, req, entry);
        nvme_inc_cq_tail(cq);
        nvme_sg_unmap(&req->sg);
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    if (cq->tail != cq->head) {
        if (cq->irq_enabled && !pending) {
            n->cq_pending++;
        }

        nvme_irq_assert(n, cq);
    }
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
                                          le32_to_cpu(req->cqe.result),
                                          le32_to_cpu(req->cqe.dw1),
                                          req->status);

    if (req->status) {
        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
                                      req->status, req->cmd.opcode);
    }

    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
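
/*
 * Note on the 500 ns timer: completions are not posted inline but deferred
 * to the CQ timer, so several requests finishing close together are posted
 * by a single nvme_post_cqes() pass with one interrupt assertion. This is
 * a batching heuristic, not a spec requirement.
 */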

static void nvme_process_aers(void *opaque)
{
    NvmeCtrl *n = opaque;
    NvmeAsyncEvent *event, *next;

    trace_pci_nvme_process_aers(n->aer_queued);

    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
        NvmeRequest *req;
        NvmeAerResult *result;

        /* can't post cqe if there is nothing to complete */
        if (!n->outstanding_aers) {
            trace_pci_nvme_no_outstanding_aers();
            break;
        }

        /* ignore if masked (cqe posted, but event not cleared) */
        if (n->aer_mask & (1 << event->result.event_type)) {
            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
            continue;
        }

        QTAILQ_REMOVE(&n->aer_queue, event, entry);
        n->aer_queued--;

        n->aer_mask |= 1 << event->result.event_type;
        n->outstanding_aers--;

        req = n->aer_reqs[n->outstanding_aers];

        result = (NvmeAerResult *) &req->cqe.result;
        result->event_type = event->result.event_type;
        result->event_info = event->result.event_info;
        result->log_page = event->result.log_page;
        g_free(event);

        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
                                    result->log_page);

        nvme_enqueue_req_completion(&n->admin_cq, req);
    }
}

static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
                               uint8_t event_info, uint8_t log_page)
{
    NvmeAsyncEvent *event;

    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);

    if (n->aer_queued == n->params.aer_max_queued) {
        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
        return;
    }

    event = g_new(NvmeAsyncEvent, 1);
    event->result = (NvmeAerResult) {
        .event_type = event_type,
        .event_info = event_info,
        .log_page = log_page,
    };

    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
    n->aer_queued++;

    nvme_process_aers(n);
}

static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
{
    uint8_t aer_info;

    /* do not report the event if it is not enabled in the async config */
    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
        return;
    }

    switch (event) {
    case NVME_SMART_SPARE:
        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
        break;
    case NVME_SMART_TEMPERATURE:
        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
        break;
    case NVME_SMART_RELIABILITY:
    case NVME_SMART_MEDIA_READ_ONLY:
    case NVME_SMART_FAILED_VOLATILE_MEDIA:
    case NVME_SMART_PMR_UNRELIABLE:
        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
        break;
    default:
        return;
    }

    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
}

static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
{
    n->aer_mask &= ~(1 << event_type);
    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }
}

static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
{
    uint8_t mdts = n->params.mdts;

    if (mdts && len > n->page_size << mdts) {
        trace_pci_nvme_err_mdts(len);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
                                         uint32_t nlb)
{
    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);

    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
        return NVME_LBA_RANGE | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb, int flags)
{
    BlockDriverState *bs = blk_bs(ns->blkconf.blk);

    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    int ret;

    /*
     * Check the allocation status of the requested range piecewise; return
     * 1 as soon as a subrange does not carry all of the requested flags,
     * 0 if the entire range does, and a negative errno on failure.
     */
    do {
        bytes -= pnum;

        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        trace_pci_nvme_block_status(offset, bytes, pnum, ret,
                                    !!(ret & BDRV_BLOCK_ZERO));

        if (!(ret & flags)) {
            return 1;
        }

        offset += pnum;
    } while (pnum != bytes);

    return 0;
}

static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb)
{
    int ret;
    Error *err = NULL;

    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
    if (ret) {
        if (ret < 0) {
            error_setg_errno(&err, -ret, "unable to get block status");
            error_report_err(err);

            return NVME_INTERNAL_DEV_ERROR;
        }

        return NVME_DULB;
    }

    return NVME_SUCCESS;
}

static void nvme_aio_err(NvmeRequest *req, int ret)
{
    uint16_t status = NVME_SUCCESS;
    Error *local_err = NULL;

    switch (req->cmd.opcode) {
    case NVME_CMD_READ:
        status = NVME_UNRECOVERED_READ;
        break;
    case NVME_CMD_FLUSH:
    case NVME_CMD_WRITE:
    case NVME_CMD_WRITE_ZEROES:
    case NVME_CMD_ZONE_APPEND:
        status = NVME_WRITE_FAULT;
        break;
    default:
        status = NVME_INTERNAL_DEV_ERROR;
        break;
    }

    trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);

    error_setg_errno(&local_err, -ret, "aio failed");
    error_report_err(local_err);

    /*
     * Set the command status code to the first encountered error, but
     * allow a subsequent Internal Device Error to trump it.
     */
    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
        return;
    }

    req->status = status;
}

static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
{
    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
                                    slba / ns->zone_size;
}
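
/*
 * Example (geometry hypothetical): a zone of 256 MiB in 4 KiB LBAs spans
 * 65536 blocks, a power of two, so zone_size_log2 = 16 and the index is a
 * cheap shift; for non-power-of-two zone sizes the division fallback is
 * used instead.
 */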

static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
{
    uint32_t zone_idx = nvme_zone_idx(ns, slba);

    if (zone_idx >= ns->num_zones) {
        return NULL;
    }

    return &ns->zone_array[zone_idx];
}

static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
{
    uint64_t zslba = zone->d.zslba;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_FULL:
        trace_pci_nvme_err_zone_is_full(zslba);
        return NVME_ZONE_FULL;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zslba);
        return NVME_ZONE_OFFLINE;
    case NVME_ZONE_STATE_READ_ONLY:
        trace_pci_nvme_err_zone_is_read_only(zslba);
        return NVME_ZONE_READ_ONLY;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
                                      uint64_t slba, uint32_t nlb)
{
    uint64_t zcap = nvme_zone_wr_boundary(zone);
    uint16_t status;

    status = nvme_check_zone_state_for_write(zone);
    if (status) {
        return status;
    }

    if (zone->d.za & NVME_ZA_ZRWA_VALID) {
        uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;

        if (slba < zone->w_ptr || slba + nlb > ezrwa) {
            trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
            return NVME_ZONE_INVALID_WRITE;
        }
    } else {
        if (unlikely(slba != zone->w_ptr)) {
            trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
                                               zone->w_ptr);
            return NVME_ZONE_INVALID_WRITE;
        }
    }

    if (unlikely((slba + nlb) > zcap)) {
        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
        return NVME_ZONE_BOUNDARY_ERROR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_FULL:
    case NVME_ZONE_STATE_CLOSED:
    case NVME_ZONE_STATE_READ_ONLY:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
        return NVME_ZONE_OFFLINE;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
                                     uint32_t nlb)
{
    NvmeZone *zone;
    uint64_t bndry, end;
    uint16_t status;

    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    bndry = nvme_zone_rd_boundary(ns, zone);
    end = slba + nlb;

    status = nvme_check_zone_state_for_read(zone);
    if (status) {
        ;
    } else if (unlikely(end > bndry)) {
        if (!ns->params.cross_zone_read) {
            status = NVME_ZONE_BOUNDARY_ERROR;
        } else {
            /*
             * Read across zone boundary; look at the next zone(s) and check
             * that they are all in a readable state.
             */
            do {
                zone++;
                status = nvme_check_zone_state_for_read(zone);
                if (status) {
                    break;
                }
            } while (end > nvme_zone_rd_boundary(ns, zone));
        }
    }

    return status;
}

static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_FULL:
        return NVME_SUCCESS;

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);

        if (zone->d.za & NVME_ZA_ZRWA_VALID) {
            zone->d.za &= ~NVME_ZA_ZRWA_VALID;
            if (ns->params.numzrwa) {
                ns->zns.numzrwa++;
            }
        }

        /* fall through */
    case NVME_ZONE_STATE_EMPTY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);

        if (zone->d.za & NVME_ZA_ZRWA_VALID) {
            if (ns->params.numzrwa) {
                ns->zns.numzrwa++;
            }
        }

        /* fall through */
    case NVME_ZONE_STATE_FULL:
        zone->w_ptr = zone->d.zslba;
        zone->d.wp = zone->w_ptr;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        /* fall through */
    case NVME_ZONE_STATE_EMPTY:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}
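
/*
 * Informal zone state machine: EMPTY -> (IMPLICITLY|EXPLICITLY) OPEN ->
 * CLOSED -> FULL, with reset returning any of these states to EMPTY and
 * finish forcing FULL. The nvme_zrm_* helpers above fall through the case
 * labels deliberately so that each transition keeps the per-namespace
 * active/open/ZRWA resource counters consistent.
 */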

static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
{
    NvmeZone *zone;

    if (ns->params.max_open_zones &&
        ns->nr_open_zones == ns->params.max_open_zones) {
        zone = QTAILQ_FIRST(&ns->imp_open_zones);
        if (zone) {
            /*
             * Automatically close this implicitly open zone.
             */
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            nvme_zrm_close(ns, zone);
        }
    }
}

enum {
    NVME_ZRM_AUTO = 1 << 0,
    NVME_ZRM_ZRWA = 1 << 1,
};

static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
                                    NvmeZone *zone, int flags)
{
    int act = 0;
    uint16_t status;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
        act = 1;

        /* fall through */

    case NVME_ZONE_STATE_CLOSED:
        if (n->params.auto_transition_zones) {
            nvme_zrm_auto_transition_zone(ns);
        }
        status = nvme_zns_check_resources(ns, act, 1,
                                          (flags & NVME_ZRM_ZRWA) ? 1 : 0);
        if (status) {
            return status;
        }

        if (act) {
            nvme_aor_inc_active(ns);
        }

        nvme_aor_inc_open(ns);

        if (flags & NVME_ZRM_AUTO) {
            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
            return NVME_SUCCESS;
        }

        /* fall through */

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        if (flags & NVME_ZRM_AUTO) {
            return NVME_SUCCESS;
        }

        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);

        /* fall through */

    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        if (flags & NVME_ZRM_ZRWA) {
            ns->zns.numzrwa--;

            zone->d.za |= NVME_ZA_ZRWA_VALID;
        }

        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
                                     NvmeZone *zone)
{
    return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
}

static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
                                 uint32_t nlb)
{
    zone->d.wp += nlb;

    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
        nvme_zrm_finish(ns, zone);
    }
}

static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
                                           uint32_t nlbc)
{
    uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);

    nlbc = nzrwafgs * ns->zns.zrwafg;

    trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);

    zone->w_ptr += nlbc;

    nvme_advance_zone_wp(ns, zone, nlbc);
}
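
/*
 * ZRWA flush example (values hypothetical): with zrwafg = 8 LBAs, a write
 * ending 10 LBAs past the current implicit flush boundary rounds up to
 * nzrwafgs = DIV_ROUND_UP(10, 8) = 2 commit granules, advancing both
 * w_ptr and the visible write pointer by 16 LBAs.
 */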

static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeZone *zone;
    uint64_t slba;
    uint32_t nlb;

    slba = le64_to_cpu(rw->slba);
    nlb = le16_to_cpu(rw->nlb) + 1;
    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    if (zone->d.za & NVME_ZA_ZRWA_VALID) {
        uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
        uint64_t elba = slba + nlb - 1;

        if (elba > ezrwa) {
            nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
        }

        return;
    }

    nvme_advance_zone_wp(ns, zone, nlb);
}

static inline bool nvme_is_write(NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;

    return rw->opcode == NVME_CMD_WRITE ||
           rw->opcode == NVME_CMD_ZONE_APPEND ||
           rw->opcode == NVME_CMD_WRITE_ZEROES;
}

static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
{
    return qemu_get_aio_context();
}

static void nvme_misc_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;

    trace_pci_nvme_misc_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

void nvme_rw_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    if (ns->params.zoned && nvme_is_write(req)) {
        nvme_finalize_zoned_write(ns, req);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    if (ns->lbaf.ms) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
        uint64_t offset = nvme_moff(ns, slba);

        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
            size_t mlen = nvme_m2b(ns, nlb);

            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
                                               BDRV_REQ_MAY_UNMAP,
                                               nvme_rw_complete_cb, req);
            return;
        }

        if (nvme_ns_ext(ns) || req->cmd.mptr) {
            uint16_t status;

            nvme_sg_unmap(&req->sg);
            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
            if (status) {
                ret = -EFAULT;
                goto out;
            }

            if (req->cmd.opcode == NVME_CMD_READ) {
                return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
            }

            return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
        }
    }

out:
    nvme_rw_complete_cb(req, ret);
}

static void nvme_verify_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint64_t reftag = le32_to_cpu(rw->reftag);
    uint64_t cdw3 = le32_to_cpu(rw->cdw3);
    uint16_t status;

    reftag |= cdw3 << 32;

    trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(stats, acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
                                       ctx->mdata.iov.size, slba);
        if (status) {
            req->status = status;
            goto out;
        }

        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                     ctx->mdata.bounce, ctx->mdata.iov.size,
                                     prinfo, slba, apptag, appmask, &reftag);
    }

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_mdata_in_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t mlen = nvme_m2b(ns, nlb);
    uint64_t offset = nvme_moff(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_reset(&ctx->mdata.iov);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                nvme_verify_cb, ctx);
    return;

out:
    nvme_verify_cb(ctx, ret);
}

struct nvme_compare_ctx {
    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } mdata;
};

static void nvme_compare_mdata_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint64_t reftag = le32_to_cpu(rw->reftag);
    uint64_t cdw3 = le32_to_cpu(rw->cdw3);
    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    uint16_t status = NVME_SUCCESS;

    reftag |= cdw3 << 32;

    trace_pci_nvme_compare_mdata_cb(nvme_cid(req));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->mdata.iov.size);

    status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
                               NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        uint64_t slba = le64_to_cpu(rw->slba);
        uint8_t *bufp;
        uint8_t *mbufp = ctx->mdata.bounce;
        uint8_t *end = mbufp + ctx->mdata.iov.size;
        int16_t pil = 0;

        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
                                slba, apptag, appmask, &reftag);
        if (status) {
            req->status = status;
            goto out;
        }

        /*
         * When formatted with protection information, do not compare the
         * DIF tuple.
         */
        if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
            pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
        }

        for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms,
             mbufp += ns->lbaf.ms) {
            if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
                req->status = NVME_CMP_FAILURE;
                goto out;
            }
        }

        goto out;
    }

    if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_compare_data_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    uint16_t status;

    trace_pci_nvme_compare_data_cb(nvme_cid(req));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->data.iov.size);

    status = nvme_bounce_data(n, buf, ctx->data.iov.size,
                              NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

    if (ns->lbaf.ms) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
        size_t mlen = nvme_m2b(ns, nlb);
        uint64_t offset = nvme_moff(ns, slba);

        ctx->mdata.bounce = g_malloc(mlen);

        qemu_iovec_init(&ctx->mdata.iov, 1);
        qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

        req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                    nvme_compare_mdata_cb, req);
        return;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);
    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

typedef struct NvmeDSMAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    NvmeDsmRange *range;
    unsigned int nr;
    unsigned int idx;
} NvmeDSMAIOCB;

static void nvme_dsm_cancel(BlockAIOCB *aiocb)
{
    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);

    /* break nvme_dsm_cb (see below) */
    iocb->idx = iocb->nr;

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
        iocb->aiocb = NULL;
    } else {
        /*
         * We only get here if nvme_dsm_cancel() was already called or the
         * command ran to completion and nvme_dsm_bh is scheduled to run.
         */
        assert(iocb->idx == iocb->nr);
    }
}

static const AIOCBInfo nvme_dsm_aiocb_info = {
    .aiocb_size = sizeof(NvmeDSMAIOCB),
    .cancel_async = nvme_dsm_cancel,
};
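
/*
 * Pattern note: DSM is implemented as a cancellable, self-continuing AIOCB
 * chain. Each discard completion re-enters nvme_dsm_cb for the next range;
 * cancellation simply fast-forwards idx to nr so the chain drains, and the
 * final completion is deferred to a bottom half (nvme_dsm_bh).
 */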
2325
2326static void nvme_dsm_bh(void *opaque)
2327{
2328 NvmeDSMAIOCB *iocb = opaque;
2329
2330 iocb->common.cb(iocb->common.opaque, iocb->ret);
2331
2332 qemu_bh_delete(iocb->bh);
2333 iocb->bh = NULL;
2334 qemu_aio_unref(iocb);
2335}
2336
2337static void nvme_dsm_cb(void *opaque, int ret);
2338
2339static void nvme_dsm_md_cb(void *opaque, int ret)
2340{
2341 NvmeDSMAIOCB *iocb = opaque;
2342 NvmeRequest *req = iocb->req;
2343 NvmeNamespace *ns = req->ns;
2344 NvmeDsmRange *range;
2345 uint64_t slba;
2346 uint32_t nlb;
2347
2348 if (ret < 0) {
2349 iocb->ret = ret;
2350 goto done;
2351 }
2352
2353 if (!ns->lbaf.ms) {
2354 nvme_dsm_cb(iocb, 0);
2355 return;
2356 }
2357
2358 range = &iocb->range[iocb->idx - 1];
2359 slba = le64_to_cpu(range->slba);
2360 nlb = le32_to_cpu(range->nlb);
2361
2362
2363
2364
2365
2366
2367 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2368 if (ret) {
2369 if (ret < 0) {
2370 iocb->ret = ret;
2371 goto done;
2372 }
2373
2374 nvme_dsm_cb(iocb, 0);
2375 }
2376
2377 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2378 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2379 nvme_dsm_cb, iocb);
2380 return;
2381
2382done:
2383 iocb->aiocb = NULL;
2384 qemu_bh_schedule(iocb->bh);
2385}
2386
2387static void nvme_dsm_cb(void *opaque, int ret)
2388{
2389 NvmeDSMAIOCB *iocb = opaque;
2390 NvmeRequest *req = iocb->req;
2391 NvmeCtrl *n = nvme_ctrl(req);
2392 NvmeNamespace *ns = req->ns;
2393 NvmeDsmRange *range;
2394 uint64_t slba;
2395 uint32_t nlb;
2396
2397 if (ret < 0) {
2398 iocb->ret = ret;
2399 goto done;
2400 }
2401
2402next:
2403 if (iocb->idx == iocb->nr) {
2404 goto done;
2405 }
2406
2407 range = &iocb->range[iocb->idx++];
2408 slba = le64_to_cpu(range->slba);
2409 nlb = le32_to_cpu(range->nlb);
2410
2411 trace_pci_nvme_dsm_deallocate(slba, nlb);
2412
2413 if (nlb > n->dmrsl) {
2414 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2415 goto next;
2416 }
2417
2418 if (nvme_check_bounds(ns, slba, nlb)) {
2419 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2420 ns->id_ns.nsze);
2421 goto next;
2422 }
2423
2424 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2425 nvme_l2b(ns, nlb),
2426 nvme_dsm_md_cb, iocb);
2427 return;
2428
2429done:
2430 iocb->aiocb = NULL;
2431 qemu_bh_schedule(iocb->bh);
2432}
2433
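/*
 * Dataset Management only acts on the Attribute - Deallocate (AD) bit;
 * other attributes are advisory and ignored. Deallocation itself is driven
 * by the nvme_dsm_cb()/nvme_dsm_md_cb() callback chain above.
 */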
2434static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2435{
2436 NvmeNamespace *ns = req->ns;
2437 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2438 uint32_t attr = le32_to_cpu(dsm->attributes);
2439 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2440 uint16_t status = NVME_SUCCESS;
2441
2442 trace_pci_nvme_dsm(nr, attr);
2443
2444 if (attr & NVME_DSMGMT_AD) {
2445 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2446 nvme_misc_cb, req);
2447
2448 iocb->req = req;
2449 iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2450 iocb->ret = 0;
2451 iocb->range = g_new(NvmeDsmRange, nr);
2452 iocb->nr = nr;
2453 iocb->idx = 0;
2454
        status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
                          req);
        if (status) {
            /* failed to map the range list; release the aiocb again */
            g_free(iocb->range);
            qemu_bh_delete(iocb->bh);
            qemu_aio_unref(iocb);
            return status;
        }
2460
2461 req->aiocb = &iocb->common;
2462 nvme_dsm_cb(iocb, 0);
2463
2464 return NVME_NO_COMPLETE;
2465 }
2466
2467 return status;
2468}
2469
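/*
 * Verify transfers no data to the host; the LBA range is read into a
 * bounce buffer so that the data (and its protection information, if any)
 * can be checked in the completion callbacks.
 */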
2470static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2471{
2472 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2473 NvmeNamespace *ns = req->ns;
2474 BlockBackend *blk = ns->blkconf.blk;
2475 uint64_t slba = le64_to_cpu(rw->slba);
2476 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2477 size_t len = nvme_l2b(ns, nlb);
2478 int64_t offset = nvme_l2b(ns, slba);
2479 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2480 uint32_t reftag = le32_to_cpu(rw->reftag);
2481 NvmeBounceContext *ctx = NULL;
2482 uint16_t status;
2483
2484 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2485
2486 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2487 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2488 if (status) {
2489 return status;
2490 }
2491
2492 if (prinfo & NVME_PRINFO_PRACT) {
2493 return NVME_INVALID_PROT_INFO | NVME_DNR;
2494 }
2495 }
2496
2497 if (len > n->page_size << n->params.vsl) {
2498 return NVME_INVALID_FIELD | NVME_DNR;
2499 }
2500
2501 status = nvme_check_bounds(ns, slba, nlb);
2502 if (status) {
2503 return status;
2504 }
2505
2506 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2507 status = nvme_check_dulbe(ns, slba, nlb);
2508 if (status) {
2509 return status;
2510 }
2511 }
2512
2513 ctx = g_new0(NvmeBounceContext, 1);
2514 ctx->req = req;
2515
2516 ctx->data.bounce = g_malloc(len);
2517
2518 qemu_iovec_init(&ctx->data.iov, 1);
2519 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2520
2521 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2522 BLOCK_ACCT_READ);
2523
2524 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2525 nvme_verify_mdata_in_cb, ctx);
2526 return NVME_NO_COMPLETE;
2527}
2528
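/*
 * Simple Copy is implemented as a callback-driven state machine operating
 * on one source range at a time: nvme_copy_cb() reads the range into the
 * bounce buffer, nvme_copy_in_cb() reads its metadata,
 * nvme_copy_in_completed_cb() checks/generates protection information and
 * writes the data out, and nvme_copy_out_cb()/nvme_copy_out_completed_cb()
 * write the metadata and advance to the next range.
 */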
2529typedef struct NvmeCopyAIOCB {
2530 BlockAIOCB common;
2531 BlockAIOCB *aiocb;
2532 NvmeRequest *req;
2533 QEMUBH *bh;
2534 int ret;
2535
2536 void *ranges;
2537 unsigned int format;
2538 int nr;
2539 int idx;
2540
2541 uint8_t *bounce;
2542 QEMUIOVector iov;
2543 struct {
2544 BlockAcctCookie read;
2545 BlockAcctCookie write;
2546 } acct;
2547
2548 uint64_t reftag;
2549 uint64_t slba;
2550
2551 NvmeZone *zone;
2552} NvmeCopyAIOCB;
2553
2554static void nvme_copy_cancel(BlockAIOCB *aiocb)
2555{
2556 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2557
2558 iocb->ret = -ECANCELED;
2559
2560 if (iocb->aiocb) {
2561 blk_aio_cancel_async(iocb->aiocb);
2562 iocb->aiocb = NULL;
2563 }
2564}
2565
2566static const AIOCBInfo nvme_copy_aiocb_info = {
2567 .aiocb_size = sizeof(NvmeCopyAIOCB),
2568 .cancel_async = nvme_copy_cancel,
2569};
2570
2571static void nvme_copy_bh(void *opaque)
2572{
2573 NvmeCopyAIOCB *iocb = opaque;
2574 NvmeRequest *req = iocb->req;
2575 NvmeNamespace *ns = req->ns;
2576 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2577
2578 if (iocb->idx != iocb->nr) {
2579 req->cqe.result = cpu_to_le32(iocb->idx);
2580 }
2581
2582 qemu_iovec_destroy(&iocb->iov);
2583 g_free(iocb->bounce);
2584
2585 qemu_bh_delete(iocb->bh);
2586 iocb->bh = NULL;
2587
2588 if (iocb->ret < 0) {
2589 block_acct_failed(stats, &iocb->acct.read);
2590 block_acct_failed(stats, &iocb->acct.write);
2591 } else {
2592 block_acct_done(stats, &iocb->acct.read);
2593 block_acct_done(stats, &iocb->acct.write);
2594 }
2595
2596 iocb->common.cb(iocb->common.opaque, iocb->ret);
2597 qemu_aio_unref(iocb);
2598}
2599
2600static void nvme_copy_cb(void *opaque, int ret);
2601
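/*
 * Copy source ranges come in two wire formats. Format 0 carries a 32-bit
 * reference tag; format 1, used with the extended (64-bit) protection
 * information format, carries the storage reference tag in the sr[] byte
 * array instead.
 */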
2602static void nvme_copy_source_range_parse_format0(void *ranges, int idx,
2603 uint64_t *slba, uint32_t *nlb,
2604 uint16_t *apptag,
2605 uint16_t *appmask,
2606 uint64_t *reftag)
2607{
2608 NvmeCopySourceRangeFormat0 *_ranges = ranges;
2609
2610 if (slba) {
2611 *slba = le64_to_cpu(_ranges[idx].slba);
2612 }
2613
2614 if (nlb) {
2615 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2616 }
2617
2618 if (apptag) {
2619 *apptag = le16_to_cpu(_ranges[idx].apptag);
2620 }
2621
2622 if (appmask) {
2623 *appmask = le16_to_cpu(_ranges[idx].appmask);
2624 }
2625
2626 if (reftag) {
2627 *reftag = le32_to_cpu(_ranges[idx].reftag);
2628 }
2629}
2630
2631static void nvme_copy_source_range_parse_format1(void *ranges, int idx,
2632 uint64_t *slba, uint32_t *nlb,
2633 uint16_t *apptag,
2634 uint16_t *appmask,
2635 uint64_t *reftag)
2636{
2637 NvmeCopySourceRangeFormat1 *_ranges = ranges;
2638
2639 if (slba) {
2640 *slba = le64_to_cpu(_ranges[idx].slba);
2641 }
2642
2643 if (nlb) {
2644 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2645 }
2646
2647 if (apptag) {
2648 *apptag = le16_to_cpu(_ranges[idx].apptag);
2649 }
2650
2651 if (appmask) {
2652 *appmask = le16_to_cpu(_ranges[idx].appmask);
2653 }
2654
2655 if (reftag) {
2656 *reftag = 0;
2657
2658 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2659 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2660 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2661 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2662 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2663 *reftag |= (uint64_t)_ranges[idx].sr[9];
2664 }
2665}
2666
2667static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2668 uint64_t *slba, uint32_t *nlb,
2669 uint16_t *apptag, uint16_t *appmask,
2670 uint64_t *reftag)
2671{
2672 switch (format) {
2673 case NVME_COPY_FORMAT_0:
2674 nvme_copy_source_range_parse_format0(ranges, idx, slba, nlb, apptag,
2675 appmask, reftag);
2676 break;
2677
2678 case NVME_COPY_FORMAT_1:
2679 nvme_copy_source_range_parse_format1(ranges, idx, slba, nlb, apptag,
2680 appmask, reftag);
2681 break;
2682
2683 default:
2684 abort();
2685 }
2686}
2687
2688static void nvme_copy_out_completed_cb(void *opaque, int ret)
2689{
2690 NvmeCopyAIOCB *iocb = opaque;
2691 NvmeRequest *req = iocb->req;
2692 NvmeNamespace *ns = req->ns;
2693 uint32_t nlb;
2694
2695 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2696 &nlb, NULL, NULL, NULL);
2697
2698 if (ret < 0) {
2699 iocb->ret = ret;
2700 goto out;
2701 } else if (iocb->ret < 0) {
2702 goto out;
2703 }
2704
2705 if (ns->params.zoned) {
2706 nvme_advance_zone_wp(ns, iocb->zone, nlb);
2707 }
2708
2709 iocb->idx++;
2710 iocb->slba += nlb;
2711out:
2712 nvme_copy_cb(iocb, iocb->ret);
2713}
2714
2715static void nvme_copy_out_cb(void *opaque, int ret)
2716{
2717 NvmeCopyAIOCB *iocb = opaque;
2718 NvmeRequest *req = iocb->req;
2719 NvmeNamespace *ns = req->ns;
2720 uint32_t nlb;
2721 size_t mlen;
2722 uint8_t *mbounce;
2723
2724 if (ret < 0) {
2725 iocb->ret = ret;
2726 goto out;
2727 } else if (iocb->ret < 0) {
2728 goto out;
2729 }
2730
2731 if (!ns->lbaf.ms) {
2732 nvme_copy_out_completed_cb(iocb, 0);
2733 return;
2734 }
2735
2736 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2737 &nlb, NULL, NULL, NULL);
2738
2739 mlen = nvme_m2b(ns, nlb);
2740 mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2741
2742 qemu_iovec_reset(&iocb->iov);
2743 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2744
2745 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2746 &iocb->iov, 0, nvme_copy_out_completed_cb,
2747 iocb);
2748
2749 return;
2750
2751out:
2752 nvme_copy_cb(iocb, ret);
2753}
2754
2755static void nvme_copy_in_completed_cb(void *opaque, int ret)
2756{
2757 NvmeCopyAIOCB *iocb = opaque;
2758 NvmeRequest *req = iocb->req;
2759 NvmeNamespace *ns = req->ns;
2760 uint32_t nlb;
2761 uint64_t slba;
2762 uint16_t apptag, appmask;
2763 uint64_t reftag;
2764 size_t len;
2765 uint16_t status;
2766
2767 if (ret < 0) {
2768 iocb->ret = ret;
2769 goto out;
2770 } else if (iocb->ret < 0) {
2771 goto out;
2772 }
2773
2774 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2775 &nlb, &apptag, &appmask, &reftag);
2776 len = nvme_l2b(ns, nlb);
2777
2778 trace_pci_nvme_copy_out(iocb->slba, nlb);
2779
2780 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2781 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2782
2783 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2784 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2785
2786 size_t mlen = nvme_m2b(ns, nlb);
2787 uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2788
2789 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2790 slba, apptag, appmask, &reftag);
2791 if (status) {
2792 goto invalid;
2793 }
2794
2795 apptag = le16_to_cpu(copy->apptag);
2796 appmask = le16_to_cpu(copy->appmask);
2797
2798 if (prinfow & NVME_PRINFO_PRACT) {
2799 status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2800 if (status) {
2801 goto invalid;
2802 }
2803
2804 nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2805 apptag, &iocb->reftag);
2806 } else {
2807 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2808 prinfow, iocb->slba, apptag, appmask,
2809 &iocb->reftag);
2810 if (status) {
2811 goto invalid;
2812 }
2813 }
2814 }
2815
2816 status = nvme_check_bounds(ns, iocb->slba, nlb);
2817 if (status) {
2818 goto invalid;
2819 }
2820
2821 if (ns->params.zoned) {
2822 status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2823 if (status) {
2824 goto invalid;
2825 }
2826
2827 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
2828 iocb->zone->w_ptr += nlb;
2829 }
2830 }
2831
2832 qemu_iovec_reset(&iocb->iov);
2833 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2834
2835 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
2836 &iocb->iov, 0, nvme_copy_out_cb, iocb);
2837
2838 return;
2839
2840invalid:
2841 req->status = status;
2842 iocb->aiocb = NULL;
2843 if (iocb->bh) {
2844 qemu_bh_schedule(iocb->bh);
2845 }
2846
2847 return;
2848
2849out:
2850 nvme_copy_cb(iocb, ret);
2851}
2852
2853static void nvme_copy_in_cb(void *opaque, int ret)
2854{
2855 NvmeCopyAIOCB *iocb = opaque;
2856 NvmeRequest *req = iocb->req;
2857 NvmeNamespace *ns = req->ns;
2858 uint64_t slba;
2859 uint32_t nlb;
2860
2861 if (ret < 0) {
2862 iocb->ret = ret;
2863 goto out;
2864 } else if (iocb->ret < 0) {
2865 goto out;
2866 }
2867
2868 if (!ns->lbaf.ms) {
2869 nvme_copy_in_completed_cb(iocb, 0);
2870 return;
2871 }
2872
2873 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2874 &nlb, NULL, NULL, NULL);
2875
2876 qemu_iovec_reset(&iocb->iov);
2877 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
2878 nvme_m2b(ns, nlb));
2879
2880 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
2881 &iocb->iov, 0, nvme_copy_in_completed_cb,
2882 iocb);
2883 return;
2884
2885out:
2886 nvme_copy_cb(iocb, iocb->ret);
2887}
2888
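/*
 * Top of the per-range copy loop: validate the next source range (single
 * range size limit, bounds, DULBE and zone read rules) and start the read
 * into the bounce buffer. The loop terminates when idx reaches nr or an
 * error has been latched in iocb->ret.
 */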
2889static void nvme_copy_cb(void *opaque, int ret)
2890{
2891 NvmeCopyAIOCB *iocb = opaque;
2892 NvmeRequest *req = iocb->req;
2893 NvmeNamespace *ns = req->ns;
2894 uint64_t slba;
2895 uint32_t nlb;
2896 size_t len;
2897 uint16_t status;
2898
2899 if (ret < 0) {
2900 iocb->ret = ret;
2901 goto done;
2902 } else if (iocb->ret < 0) {
2903 goto done;
2904 }
2905
2906 if (iocb->idx == iocb->nr) {
2907 goto done;
2908 }
2909
2910 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2911 &nlb, NULL, NULL, NULL);
2912 len = nvme_l2b(ns, nlb);
2913
2914 trace_pci_nvme_copy_source_range(slba, nlb);
2915
2916 if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2917 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2918 goto invalid;
2919 }
2920
2921 status = nvme_check_bounds(ns, slba, nlb);
2922 if (status) {
2923 goto invalid;
2924 }
2925
2926 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2927 status = nvme_check_dulbe(ns, slba, nlb);
2928 if (status) {
2929 goto invalid;
2930 }
2931 }
2932
2933 if (ns->params.zoned) {
2934 status = nvme_check_zone_read(ns, slba, nlb);
2935 if (status) {
2936 goto invalid;
2937 }
2938 }
2939
2940 qemu_iovec_reset(&iocb->iov);
2941 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2942
2943 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
2944 &iocb->iov, 0, nvme_copy_in_cb, iocb);
2945 return;
2946
2947invalid:
2948 req->status = status;
2949done:
2950 iocb->aiocb = NULL;
2951 if (iocb->bh) {
2952 qemu_bh_schedule(iocb->bh);
2953 }
2954}
2957static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2958{
2959 NvmeNamespace *ns = req->ns;
2960 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2961 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
2962 nvme_misc_cb, req);
2963 uint16_t nr = copy->nr + 1;
2964 uint8_t format = copy->control[0] & 0xf;
2965 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2966 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2967 size_t len = sizeof(NvmeCopySourceRangeFormat0);
2968
2969 uint16_t status;
2970
2971 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2972
2973 iocb->ranges = NULL;
2974 iocb->zone = NULL;
2975
2976 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2977 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
2978 status = NVME_INVALID_FIELD | NVME_DNR;
2979 goto invalid;
2980 }
2981
2982 if (!(n->id_ctrl.ocfs & (1 << format))) {
2983 trace_pci_nvme_err_copy_invalid_format(format);
2984 status = NVME_INVALID_FIELD | NVME_DNR;
2985 goto invalid;
2986 }
2987
2988 if (nr > ns->id_ns.msrc + 1) {
2989 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2990 goto invalid;
2991 }
2992
2993 if (ns->pif && format != 0x1) {
2994 status = NVME_INVALID_FORMAT | NVME_DNR;
2995 goto invalid;
2996 }
2997
2998 if (ns->pif) {
2999 len = sizeof(NvmeCopySourceRangeFormat1);
3000 }
3001
3002 iocb->format = format;
3003 iocb->ranges = g_malloc_n(nr, len);
3004 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3005 if (status) {
3006 goto invalid;
3007 }
3008
3009 iocb->slba = le64_to_cpu(copy->sdlba);
3010
3011 if (ns->params.zoned) {
3012 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3013 if (!iocb->zone) {
3014 status = NVME_LBA_RANGE | NVME_DNR;
3015 goto invalid;
3016 }
3017
3018 status = nvme_zrm_auto(n, ns, iocb->zone);
3019 if (status) {
3020 goto invalid;
3021 }
3022 }
3023
3024 iocb->req = req;
3025 iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
3026 iocb->ret = 0;
3027 iocb->nr = nr;
3028 iocb->idx = 0;
3029 iocb->reftag = le32_to_cpu(copy->reftag);
3030 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3031 iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
3032 ns->lbasz + ns->lbaf.ms);
3033
3034 qemu_iovec_init(&iocb->iov, 1);
3035
3036 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
3037 BLOCK_ACCT_READ);
3038 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
3039 BLOCK_ACCT_WRITE);
3040
3041 req->aiocb = &iocb->common;
3042 nvme_copy_cb(iocb, 0);
3043
3044 return NVME_NO_COMPLETE;
3045
3046invalid:
3047 g_free(iocb->ranges);
3048 qemu_aio_unref(iocb);
3049 return status;
3050}
3051
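/*
 * Compare reads the addressed range into a bounce buffer; the actual
 * comparison against the host data happens in nvme_compare_data_cb() (and
 * nvme_compare_mdata_cb() for namespaces with metadata).
 */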
3052static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3053{
3054 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3055 NvmeNamespace *ns = req->ns;
3056 BlockBackend *blk = ns->blkconf.blk;
3057 uint64_t slba = le64_to_cpu(rw->slba);
3058 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3059 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3060 size_t data_len = nvme_l2b(ns, nlb);
3061 size_t len = data_len;
3062 int64_t offset = nvme_l2b(ns, slba);
3063 struct nvme_compare_ctx *ctx = NULL;
3064 uint16_t status;
3065
3066 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3067
3068 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3069 return NVME_INVALID_PROT_INFO | NVME_DNR;
3070 }
3071
3072 if (nvme_ns_ext(ns)) {
3073 len += nvme_m2b(ns, nlb);
3074 }
3075
3076 status = nvme_check_mdts(n, len);
3077 if (status) {
3078 return status;
3079 }
3080
3081 status = nvme_check_bounds(ns, slba, nlb);
3082 if (status) {
3083 return status;
3084 }
3085
3086 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3087 status = nvme_check_dulbe(ns, slba, nlb);
3088 if (status) {
3089 return status;
3090 }
3091 }
3092
3093 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3094 if (status) {
3095 return status;
3096 }
3097
3098 ctx = g_new(struct nvme_compare_ctx, 1);
3099 ctx->data.bounce = g_malloc(data_len);
3100
3101 req->opaque = ctx;
3102
3103 qemu_iovec_init(&ctx->data.iov, 1);
3104 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3105
3106 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3107 BLOCK_ACCT_READ);
3108 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3109 nvme_compare_data_cb, req);
3110
3111 return NVME_NO_COMPLETE;
3112}
3113
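/*
 * Flush supports the broadcast NSID (FFFFFFFFh): in that case the bottom
 * half walks all attached namespaces and flushes each one in turn through
 * nvme_flush_ns_cb(); a single-namespace flush runs the same path exactly
 * once.
 */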
3114typedef struct NvmeFlushAIOCB {
3115 BlockAIOCB common;
3116 BlockAIOCB *aiocb;
3117 NvmeRequest *req;
3118 QEMUBH *bh;
3119 int ret;
3120
3121 NvmeNamespace *ns;
3122 uint32_t nsid;
3123 bool broadcast;
3124} NvmeFlushAIOCB;
3125
3126static void nvme_flush_cancel(BlockAIOCB *acb)
3127{
3128 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3129
3130 iocb->ret = -ECANCELED;
3131
3132 if (iocb->aiocb) {
3133 blk_aio_cancel_async(iocb->aiocb);
3134 }
3135}
3136
3137static const AIOCBInfo nvme_flush_aiocb_info = {
3138 .aiocb_size = sizeof(NvmeFlushAIOCB),
3139 .cancel_async = nvme_flush_cancel,
3140 .get_aio_context = nvme_get_aio_context,
3141};
3142
3143static void nvme_flush_ns_cb(void *opaque, int ret)
3144{
3145 NvmeFlushAIOCB *iocb = opaque;
3146 NvmeNamespace *ns = iocb->ns;
3147
3148 if (ret < 0) {
3149 iocb->ret = ret;
3150 goto out;
3151 } else if (iocb->ret < 0) {
3152 goto out;
3153 }
3154
3155 if (ns) {
3156 trace_pci_nvme_flush_ns(iocb->nsid);
3157
3158 iocb->ns = NULL;
3159 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3160 return;
3161 }
3162
3163out:
3164 iocb->aiocb = NULL;
3165 qemu_bh_schedule(iocb->bh);
3166}
3167
3168static void nvme_flush_bh(void *opaque)
3169{
3170 NvmeFlushAIOCB *iocb = opaque;
3171 NvmeRequest *req = iocb->req;
3172 NvmeCtrl *n = nvme_ctrl(req);
3173 int i;
3174
3175 if (iocb->ret < 0) {
3176 goto done;
3177 }
3178
3179 if (iocb->broadcast) {
3180 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3181 iocb->ns = nvme_ns(n, i);
3182 if (iocb->ns) {
3183 iocb->nsid = i;
3184 break;
3185 }
3186 }
3187 }
3188
3189 if (!iocb->ns) {
3190 goto done;
3191 }
3192
3193 nvme_flush_ns_cb(iocb, 0);
3194 return;
3195
3196done:
3197 qemu_bh_delete(iocb->bh);
3198 iocb->bh = NULL;
3199
3200 iocb->common.cb(iocb->common.opaque, iocb->ret);
3201
3202 qemu_aio_unref(iocb);
3203
3204 return;
3205}
3206
3207static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3208{
3209 NvmeFlushAIOCB *iocb;
3210 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3211 uint16_t status;
3212
3213 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3214
3215 iocb->req = req;
3216 iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
3217 iocb->ret = 0;
3218 iocb->ns = NULL;
3219 iocb->nsid = 0;
3220 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3221
3222 if (!iocb->broadcast) {
3223 if (!nvme_nsid_valid(n, nsid)) {
3224 status = NVME_INVALID_NSID | NVME_DNR;
3225 goto out;
3226 }
3227
3228 iocb->ns = nvme_ns(n, nsid);
3229 if (!iocb->ns) {
3230 status = NVME_INVALID_FIELD | NVME_DNR;
3231 goto out;
3232 }
3233
3234 iocb->nsid = nsid;
3235 }
3236
3237 req->aiocb = &iocb->common;
3238 qemu_bh_schedule(iocb->bh);
3239
3240 return NVME_NO_COMPLETE;
3241
3242out:
3243 qemu_bh_delete(iocb->bh);
3244 iocb->bh = NULL;
3245 qemu_aio_unref(iocb);
3246
3247 return status;
3248}
3249
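/*
 * Read path: after the MDTS, bounds, zone and DULBE checks, commands on
 * namespaces formatted with protection information are diverted to
 * nvme_dif_rw(); everything else maps the host buffer and issues the
 * block-layer read directly.
 */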
3250static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3251{
3252 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3253 NvmeNamespace *ns = req->ns;
3254 uint64_t slba = le64_to_cpu(rw->slba);
3255 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3256 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3257 uint64_t data_size = nvme_l2b(ns, nlb);
3258 uint64_t mapped_size = data_size;
3259 uint64_t data_offset;
3260 BlockBackend *blk = ns->blkconf.blk;
3261 uint16_t status;
3262
3263 if (nvme_ns_ext(ns)) {
3264 mapped_size += nvme_m2b(ns, nlb);
3265
3266 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3267 bool pract = prinfo & NVME_PRINFO_PRACT;
3268
3269 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3270 mapped_size = data_size;
3271 }
3272 }
3273 }
3274
3275 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3276
3277 status = nvme_check_mdts(n, mapped_size);
3278 if (status) {
3279 goto invalid;
3280 }
3281
3282 status = nvme_check_bounds(ns, slba, nlb);
3283 if (status) {
3284 goto invalid;
3285 }
3286
3287 if (ns->params.zoned) {
3288 status = nvme_check_zone_read(ns, slba, nlb);
3289 if (status) {
3290 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3291 goto invalid;
3292 }
3293 }
3294
3295 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3296 status = nvme_check_dulbe(ns, slba, nlb);
3297 if (status) {
3298 goto invalid;
3299 }
3300 }
3301
3302 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3303 return nvme_dif_rw(n, req);
3304 }
3305
3306 status = nvme_map_data(n, nlb, req);
3307 if (status) {
3308 goto invalid;
3309 }
3310
3311 data_offset = nvme_l2b(ns, slba);
3312
3313 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3314 BLOCK_ACCT_READ);
3315 nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
3316 return NVME_NO_COMPLETE;
3317
3318invalid:
3319 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3320 return status | NVME_DNR;
3321}
3322
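/*
 * Common write path shared by Write, Write Zeroes (wrz) and Zone Append
 * (append). For appends, the effective SLBA is taken from the zone write
 * pointer, and depending on the protection information type the reference
 * tag may be remapped relative to the zone start (PIREMAP).
 */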
3323static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3324 bool wrz)
3325{
3326 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3327 NvmeNamespace *ns = req->ns;
3328 uint64_t slba = le64_to_cpu(rw->slba);
3329 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3330 uint16_t ctrl = le16_to_cpu(rw->control);
3331 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3332 uint64_t data_size = nvme_l2b(ns, nlb);
3333 uint64_t mapped_size = data_size;
3334 uint64_t data_offset;
3335 NvmeZone *zone;
3336 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3337 BlockBackend *blk = ns->blkconf.blk;
3338 uint16_t status;
3339
3340 if (nvme_ns_ext(ns)) {
3341 mapped_size += nvme_m2b(ns, nlb);
3342
3343 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3344 bool pract = prinfo & NVME_PRINFO_PRACT;
3345
3346 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3347 mapped_size -= nvme_m2b(ns, nlb);
3348 }
3349 }
3350 }
3351
3352 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3353 nvme_nsid(ns), nlb, mapped_size, slba);
3354
3355 if (!wrz) {
3356 status = nvme_check_mdts(n, mapped_size);
3357 if (status) {
3358 goto invalid;
3359 }
3360 }
3361
3362 status = nvme_check_bounds(ns, slba, nlb);
3363 if (status) {
3364 goto invalid;
3365 }
3366
3367 if (ns->params.zoned) {
3368 zone = nvme_get_zone_by_slba(ns, slba);
3369 assert(zone);
3370
3371 if (append) {
3372 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3373
3374 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3375 return NVME_INVALID_ZONE_OP | NVME_DNR;
3376 }
3377
3378 if (unlikely(slba != zone->d.zslba)) {
3379 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3380 status = NVME_INVALID_FIELD;
3381 goto invalid;
3382 }
3383
3384 if (n->params.zasl &&
3385 data_size > (uint64_t)n->page_size << n->params.zasl) {
3386 trace_pci_nvme_err_zasl(data_size);
3387 return NVME_INVALID_FIELD | NVME_DNR;
3388 }
3389
3390 slba = zone->w_ptr;
3391 rw->slba = cpu_to_le64(slba);
3392 res->slba = cpu_to_le64(slba);
3393
3394 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3395 case NVME_ID_NS_DPS_TYPE_1:
3396 if (!piremap) {
3397 return NVME_INVALID_PROT_INFO | NVME_DNR;
3398 }
3399
            /* fallthrough */
3401
3402 case NVME_ID_NS_DPS_TYPE_2:
3403 if (piremap) {
3404 uint32_t reftag = le32_to_cpu(rw->reftag);
3405 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3406 }
3407
3408 break;
3409
3410 case NVME_ID_NS_DPS_TYPE_3:
3411 if (piremap) {
3412 return NVME_INVALID_PROT_INFO | NVME_DNR;
3413 }
3414
3415 break;
3416 }
3417 }
3418
3419 status = nvme_check_zone_write(ns, zone, slba, nlb);
3420 if (status) {
3421 goto invalid;
3422 }
3423
3424 status = nvme_zrm_auto(n, ns, zone);
3425 if (status) {
3426 goto invalid;
3427 }
3428
3429 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3430 zone->w_ptr += nlb;
3431 }
3432 }
3433
3434 data_offset = nvme_l2b(ns, slba);
3435
3436 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3437 return nvme_dif_rw(n, req);
3438 }
3439
3440 if (!wrz) {
3441 status = nvme_map_data(n, nlb, req);
3442 if (status) {
3443 goto invalid;
3444 }
3445
3446 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3447 BLOCK_ACCT_WRITE);
3448 nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3449 } else {
3450 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3451 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3452 req);
3453 }
3454
3455 return NVME_NO_COMPLETE;
3456
3457invalid:
3458 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3459 return status | NVME_DNR;
3460}
3461
3462static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3463{
3464 return nvme_do_write(n, req, false, false);
3465}
3466
3467static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3468{
3469 return nvme_do_write(n, req, false, true);
3470}
3471
3472static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3473{
3474 return nvme_do_write(n, req, true, false);
3475}
3476
3477static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3478 uint64_t *slba, uint32_t *zone_idx)
3479{
3480 uint32_t dw10 = le32_to_cpu(c->cdw10);
3481 uint32_t dw11 = le32_to_cpu(c->cdw11);
3482
3483 if (!ns->params.zoned) {
3484 trace_pci_nvme_err_invalid_opc(c->opcode);
3485 return NVME_INVALID_OPCODE | NVME_DNR;
3486 }
3487
3488 *slba = ((uint64_t)dw11) << 32 | dw10;
3489 if (unlikely(*slba >= ns->id_ns.nsze)) {
3490 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3491 *slba = 0;
3492 return NVME_LBA_RANGE | NVME_DNR;
3493 }
3494
3495 *zone_idx = nvme_zone_idx(ns, *slba);
3496 assert(*zone_idx < ns->num_zones);
3497
3498 return NVME_SUCCESS;
3499}
3500
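/*
 * Zone Management Send actions are dispatched through a common helper:
 * nvme_do_zone_op() applies an op_handler_t either to the one addressed
 * zone or, for the "Select All" variant, to every zone matching the
 * processing mask.
 */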
3501typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3502 NvmeRequest *);
3503
3504enum NvmeZoneProcessingMask {
3505 NVME_PROC_CURRENT_ZONE = 0,
3506 NVME_PROC_OPENED_ZONES = 1 << 0,
3507 NVME_PROC_CLOSED_ZONES = 1 << 1,
3508 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3509 NVME_PROC_FULL_ZONES = 1 << 3,
3510};
3511
3512static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3513 NvmeZoneState state, NvmeRequest *req)
3514{
3515 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3516 int flags = 0;
3517
3518 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3519 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3520
3521 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3522 return NVME_INVALID_ZONE_OP | NVME_DNR;
3523 }
3524
3525 if (zone->w_ptr % ns->zns.zrwafg) {
3526 return NVME_NOZRWA | NVME_DNR;
3527 }
3528
3529 flags = NVME_ZRM_ZRWA;
3530 }
3531
3532 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3533}
3534
3535static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3536 NvmeZoneState state, NvmeRequest *req)
3537{
3538 return nvme_zrm_close(ns, zone);
3539}
3540
3541static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3542 NvmeZoneState state, NvmeRequest *req)
3543{
3544 return nvme_zrm_finish(ns, zone);
3545}
3546
3547static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3548 NvmeZoneState state, NvmeRequest *req)
3549{
3550 switch (state) {
3551 case NVME_ZONE_STATE_READ_ONLY:
3552 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
        /* fall through */
3554 case NVME_ZONE_STATE_OFFLINE:
3555 return NVME_SUCCESS;
3556 default:
3557 return NVME_ZONE_INVAL_TRANSITION;
3558 }
3559}
3560
3561static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3562{
3563 uint16_t status;
3564 uint8_t state = nvme_get_zone_state(zone);
3565
3566 if (state == NVME_ZONE_STATE_EMPTY) {
3567 status = nvme_aor_check(ns, 1, 0);
3568 if (status) {
3569 return status;
3570 }
3571 nvme_aor_inc_active(ns);
3572 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3573 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3574 return NVME_SUCCESS;
3575 }
3576
3577 return NVME_ZONE_INVAL_TRANSITION;
3578}
3579
3580static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3581 enum NvmeZoneProcessingMask proc_mask,
3582 op_handler_t op_hndlr, NvmeRequest *req)
3583{
3584 uint16_t status = NVME_SUCCESS;
3585 NvmeZoneState zs = nvme_get_zone_state(zone);
3586 bool proc_zone;
3587
3588 switch (zs) {
3589 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3590 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3591 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3592 break;
3593 case NVME_ZONE_STATE_CLOSED:
3594 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3595 break;
3596 case NVME_ZONE_STATE_READ_ONLY:
3597 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3598 break;
3599 case NVME_ZONE_STATE_FULL:
3600 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3601 break;
3602 default:
3603 proc_zone = false;
3604 }
3605
3606 if (proc_zone) {
3607 status = op_hndlr(ns, zone, zs, req);
3608 }
3609
3610 return status;
3611}
3612
3613static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3614 enum NvmeZoneProcessingMask proc_mask,
3615 op_handler_t op_hndlr, NvmeRequest *req)
3616{
3617 NvmeZone *next;
3618 uint16_t status = NVME_SUCCESS;
3619 int i;
3620
3621 if (!proc_mask) {
3622 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3623 } else {
3624 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3625 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3626 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3627 req);
3628 if (status && status != NVME_NO_COMPLETE) {
3629 goto out;
3630 }
3631 }
3632 }
3633 if (proc_mask & NVME_PROC_OPENED_ZONES) {
3634 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3635 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3636 req);
3637 if (status && status != NVME_NO_COMPLETE) {
3638 goto out;
3639 }
3640 }
3641
3642 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3643 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3644 req);
3645 if (status && status != NVME_NO_COMPLETE) {
3646 goto out;
3647 }
3648 }
3649 }
3650 if (proc_mask & NVME_PROC_FULL_ZONES) {
3651 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3652 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3653 req);
3654 if (status && status != NVME_NO_COMPLETE) {
3655 goto out;
3656 }
3657 }
3658 }
3659
3660 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3661 for (i = 0; i < ns->num_zones; i++, zone++) {
3662 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3663 req);
3664 if (status && status != NVME_NO_COMPLETE) {
3665 goto out;
3666 }
3667 }
3668 }
3669 }
3670
3671out:
3672 return status;
3673}
3674
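/*
 * Zone Reset is asynchronous too: for each affected zone the zoned state
 * machine is reset and the zone's data (and metadata, when present) is
 * zeroed with an unmap-capable write-zeroes before the callback moves on
 * to the next zone.
 */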
3675typedef struct NvmeZoneResetAIOCB {
3676 BlockAIOCB common;
3677 BlockAIOCB *aiocb;
3678 NvmeRequest *req;
3679 QEMUBH *bh;
3680 int ret;
3681
3682 bool all;
3683 int idx;
3684 NvmeZone *zone;
3685} NvmeZoneResetAIOCB;
3686
3687static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3688{
3689 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3690 NvmeRequest *req = iocb->req;
3691 NvmeNamespace *ns = req->ns;
3692
3693 iocb->idx = ns->num_zones;
3694
3695 iocb->ret = -ECANCELED;
3696
3697 if (iocb->aiocb) {
3698 blk_aio_cancel_async(iocb->aiocb);
3699 iocb->aiocb = NULL;
3700 }
3701}
3702
3703static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3704 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3705 .cancel_async = nvme_zone_reset_cancel,
3706};
3707
3708static void nvme_zone_reset_bh(void *opaque)
3709{
3710 NvmeZoneResetAIOCB *iocb = opaque;
3711
3712 iocb->common.cb(iocb->common.opaque, iocb->ret);
3713
3714 qemu_bh_delete(iocb->bh);
3715 iocb->bh = NULL;
3716 qemu_aio_unref(iocb);
3717}
3718
3719static void nvme_zone_reset_cb(void *opaque, int ret);
3720
3721static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3722{
3723 NvmeZoneResetAIOCB *iocb = opaque;
3724 NvmeRequest *req = iocb->req;
3725 NvmeNamespace *ns = req->ns;
3726 int64_t moff;
3727 int count;
3728
3729 if (ret < 0) {
3730 nvme_zone_reset_cb(iocb, ret);
3731 return;
3732 }
3733
3734 if (!ns->lbaf.ms) {
3735 nvme_zone_reset_cb(iocb, 0);
3736 return;
3737 }
3738
3739 moff = nvme_moff(ns, iocb->zone->d.zslba);
3740 count = nvme_m2b(ns, ns->zone_size);
3741
3742 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3743 BDRV_REQ_MAY_UNMAP,
3744 nvme_zone_reset_cb, iocb);
3745 return;
3746}
3747
3748static void nvme_zone_reset_cb(void *opaque, int ret)
3749{
3750 NvmeZoneResetAIOCB *iocb = opaque;
3751 NvmeRequest *req = iocb->req;
3752 NvmeNamespace *ns = req->ns;
3753
3754 if (ret < 0) {
3755 iocb->ret = ret;
3756 goto done;
3757 }
3758
3759 if (iocb->zone) {
3760 nvme_zrm_reset(ns, iocb->zone);
3761
3762 if (!iocb->all) {
3763 goto done;
3764 }
3765 }
3766
3767 while (iocb->idx < ns->num_zones) {
3768 NvmeZone *zone = &ns->zone_array[iocb->idx++];
3769
3770 switch (nvme_get_zone_state(zone)) {
3771 case NVME_ZONE_STATE_EMPTY:
3772 if (!iocb->all) {
3773 goto done;
3774 }
3775
3776 continue;
3777
3778 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3779 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3780 case NVME_ZONE_STATE_CLOSED:
3781 case NVME_ZONE_STATE_FULL:
3782 iocb->zone = zone;
3783 break;
3784
3785 default:
3786 continue;
3787 }
3788
3789 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3790
3791 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3792 nvme_l2b(ns, zone->d.zslba),
3793 nvme_l2b(ns, ns->zone_size),
3794 BDRV_REQ_MAY_UNMAP,
3795 nvme_zone_reset_epilogue_cb,
3796 iocb);
3797 return;
3798 }
3799
3800done:
3801 iocb->aiocb = NULL;
3802 if (iocb->bh) {
3803 qemu_bh_schedule(iocb->bh);
3804 }
3805}
3806
3807static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
3808 uint64_t elba, NvmeRequest *req)
3809{
3810 NvmeNamespace *ns = req->ns;
3811 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3812 uint64_t wp = zone->d.wp;
3813 uint32_t nlb = elba - wp + 1;
3814 uint16_t status;
3817 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3818 return NVME_INVALID_ZONE_OP | NVME_DNR;
3819 }
3820
3821 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3822 return NVME_INVALID_FIELD | NVME_DNR;
3823 }
3824
3825 if (elba < wp || elba > wp + ns->zns.zrwas) {
3826 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
3827 }
3828
3829 if (nlb % ns->zns.zrwafg) {
3830 return NVME_INVALID_FIELD | NVME_DNR;
3831 }
3832
3833 status = nvme_zrm_auto(n, ns, zone);
3834 if (status) {
3835 return status;
3836 }
3837
3838 zone->w_ptr += nlb;
3839
3840 nvme_advance_zone_wp(ns, zone, nlb);
3841
3842 return NVME_SUCCESS;
3843}
3844
3845static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3846{
3847 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3848 NvmeNamespace *ns = req->ns;
3849 NvmeZone *zone;
3850 NvmeZoneResetAIOCB *iocb;
3851 uint8_t *zd_ext;
3852 uint64_t slba = 0;
3853 uint32_t zone_idx = 0;
3854 uint16_t status;
3855 uint8_t action = cmd->zsa;
3856 bool all;
3857 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3858
3859 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
3860
3861 req->status = NVME_SUCCESS;
3862
3863 if (!all) {
3864 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
3865 if (status) {
3866 return status;
3867 }
3868 }
3869
3870 zone = &ns->zone_array[zone_idx];
3871 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
3872 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3873 return NVME_INVALID_FIELD | NVME_DNR;
3874 }
3875
3876 switch (action) {
3877
3878 case NVME_ZONE_ACTION_OPEN:
3879 if (all) {
3880 proc_mask = NVME_PROC_CLOSED_ZONES;
3881 }
3882 trace_pci_nvme_open_zone(slba, zone_idx, all);
3883 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3884 break;
3885
3886 case NVME_ZONE_ACTION_CLOSE:
3887 if (all) {
3888 proc_mask = NVME_PROC_OPENED_ZONES;
3889 }
3890 trace_pci_nvme_close_zone(slba, zone_idx, all);
3891 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3892 break;
3893
3894 case NVME_ZONE_ACTION_FINISH:
3895 if (all) {
3896 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3897 }
3898 trace_pci_nvme_finish_zone(slba, zone_idx, all);
3899 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3900 break;
3901
3902 case NVME_ZONE_ACTION_RESET:
3903 trace_pci_nvme_reset_zone(slba, zone_idx, all);
3904
3905 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
3906 nvme_misc_cb, req);
3907
3908 iocb->req = req;
3909 iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
3910 iocb->ret = 0;
3911 iocb->all = all;
3912 iocb->idx = zone_idx;
3913 iocb->zone = NULL;
3914
3915 req->aiocb = &iocb->common;
3916 nvme_zone_reset_cb(iocb, 0);
3917
3918 return NVME_NO_COMPLETE;
3919
3920 case NVME_ZONE_ACTION_OFFLINE:
3921 if (all) {
3922 proc_mask = NVME_PROC_READ_ONLY_ZONES;
3923 }
3924 trace_pci_nvme_offline_zone(slba, zone_idx, all);
3925 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3926 break;
3927
3928 case NVME_ZONE_ACTION_SET_ZD_EXT:
3929 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3930 if (all || !ns->params.zd_extension_size) {
3931 return NVME_INVALID_FIELD | NVME_DNR;
3932 }
3933 zd_ext = nvme_get_zd_extension(ns, zone_idx);
3934 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3935 if (status) {
3936 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3937 return status;
3938 }
3939
3940 status = nvme_set_zd_ext(ns, zone);
3941 if (status == NVME_SUCCESS) {
3942 trace_pci_nvme_zd_extension_set(zone_idx);
3943 return status;
3944 }
3945 break;
3946
3947 case NVME_ZONE_ACTION_ZRWA_FLUSH:
3948 if (all) {
3949 return NVME_INVALID_FIELD | NVME_DNR;
3950 }
3951
3952 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
3953
3954 default:
3955 trace_pci_nvme_err_invalid_mgmt_action(action);
3956 status = NVME_INVALID_FIELD;
3957 }
3958
3959 if (status == NVME_ZONE_INVAL_TRANSITION) {
3960 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3961 zone->d.za);
3962 }
3963 if (status) {
3964 status |= NVME_DNR;
3965 }
3966
3967 return status;
3968}
3969
3970static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3971{
3972 NvmeZoneState zs = nvme_get_zone_state(zl);
3973
3974 switch (zafs) {
3975 case NVME_ZONE_REPORT_ALL:
3976 return true;
3977 case NVME_ZONE_REPORT_EMPTY:
3978 return zs == NVME_ZONE_STATE_EMPTY;
3979 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3980 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3981 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3982 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3983 case NVME_ZONE_REPORT_CLOSED:
3984 return zs == NVME_ZONE_STATE_CLOSED;
3985 case NVME_ZONE_REPORT_FULL:
3986 return zs == NVME_ZONE_STATE_FULL;
3987 case NVME_ZONE_REPORT_READ_ONLY:
3988 return zs == NVME_ZONE_STATE_READ_ONLY;
3989 case NVME_ZONE_REPORT_OFFLINE:
3990 return zs == NVME_ZONE_STATE_OFFLINE;
3991 default:
3992 return false;
3993 }
3994}
3995
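/*
 * Zone Management Receive (Report Zones): build the report header plus as
 * many (optionally extended) zone descriptors as fit into the host buffer,
 * filtered by the zone receive action specific field (zrasf).
 */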
3996static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3997{
3998 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3999 NvmeNamespace *ns = req->ns;
4000
4001 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4002 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4003 uint32_t zone_idx, zra, zrasf, partial;
4004 uint64_t max_zones, nr_zones = 0;
4005 uint16_t status;
4006 uint64_t slba;
4007 NvmeZoneDescr *z;
4008 NvmeZone *zone;
4009 NvmeZoneReportHeader *header;
4010 void *buf, *buf_p;
4011 size_t zone_entry_sz;
4012 int i;
4013
4014 req->status = NVME_SUCCESS;
4015
4016 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4017 if (status) {
4018 return status;
4019 }
4020
4021 zra = dw13 & 0xff;
4022 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4023 return NVME_INVALID_FIELD | NVME_DNR;
4024 }
4025 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4026 return NVME_INVALID_FIELD | NVME_DNR;
4027 }
4028
4029 zrasf = (dw13 >> 8) & 0xff;
4030 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4031 return NVME_INVALID_FIELD | NVME_DNR;
4032 }
4033
4034 if (data_size < sizeof(NvmeZoneReportHeader)) {
4035 return NVME_INVALID_FIELD | NVME_DNR;
4036 }
4037
4038 status = nvme_check_mdts(n, data_size);
4039 if (status) {
4040 return status;
4041 }
4042
4043 partial = (dw13 >> 16) & 0x01;
4044
4045 zone_entry_sz = sizeof(NvmeZoneDescr);
4046 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4047 zone_entry_sz += ns->params.zd_extension_size;
4048 }
4049
4050 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4051 buf = g_malloc0(data_size);
4052
4053 zone = &ns->zone_array[zone_idx];
4054 for (i = zone_idx; i < ns->num_zones; i++) {
4055 if (partial && nr_zones >= max_zones) {
4056 break;
4057 }
4058 if (nvme_zone_matches_filter(zrasf, zone++)) {
4059 nr_zones++;
4060 }
4061 }
4062 header = (NvmeZoneReportHeader *)buf;
4063 header->nr_zones = cpu_to_le64(nr_zones);
4064
4065 buf_p = buf + sizeof(NvmeZoneReportHeader);
4066 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4067 zone = &ns->zone_array[zone_idx];
4068 if (nvme_zone_matches_filter(zrasf, zone)) {
4069 z = (NvmeZoneDescr *)buf_p;
4070 buf_p += sizeof(NvmeZoneDescr);
4071
4072 z->zt = zone->d.zt;
4073 z->zs = zone->d.zs;
4074 z->zcap = cpu_to_le64(zone->d.zcap);
4075 z->zslba = cpu_to_le64(zone->d.zslba);
4076 z->za = zone->d.za;
4077
4078 if (nvme_wp_is_valid(zone)) {
4079 z->wp = cpu_to_le64(zone->d.wp);
4080 } else {
4081 z->wp = cpu_to_le64(~0ULL);
4082 }
4083
4084 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4085 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4086 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4087 ns->params.zd_extension_size);
4088 }
4089 buf_p += ns->params.zd_extension_size;
4090 }
4091
4092 max_zones--;
4093 }
4094 }
4095
4096 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4097
4098 g_free(buf);
4099
4100 return status;
4101}
4102
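/*
 * I/O command dispatch: validate the NSID, check the opcode against the
 * effective I/O command set of the namespace (ns->iocs) and fan out to the
 * individual handlers.
 */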
4103static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4104{
4105 NvmeNamespace *ns;
4106 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4107
4108 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4109 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4110
4111 if (!nvme_nsid_valid(n, nsid)) {
4112 return NVME_INVALID_NSID | NVME_DNR;
4113 }
4114
    /*
     * In the base NVM command set, Flush may apply to all namespaces
     * (indicated by NSID being set to FFFFFFFFh). Because the broadcast
     * case has no single namespace to resolve, Flush is handled before the
     * namespace lookup below; nvme_flush() validates the NSID itself and
     * iterates all attached namespaces when broadcast is requested.
     */
4134 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4135 return nvme_flush(n, req);
4136 }
4137
4138 ns = nvme_ns(n, nsid);
4139 if (unlikely(!ns)) {
4140 return NVME_INVALID_FIELD | NVME_DNR;
4141 }
4142
4143 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4144 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4145 return NVME_INVALID_OPCODE | NVME_DNR;
4146 }
4147
4148 if (ns->status) {
4149 return ns->status;
4150 }
4151
4152 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4153 return NVME_INVALID_FIELD;
4154 }
4155
4156 req->ns = ns;
4157
4158 switch (req->cmd.opcode) {
4159 case NVME_CMD_WRITE_ZEROES:
4160 return nvme_write_zeroes(n, req);
4161 case NVME_CMD_ZONE_APPEND:
4162 return nvme_zone_append(n, req);
4163 case NVME_CMD_WRITE:
4164 return nvme_write(n, req);
4165 case NVME_CMD_READ:
4166 return nvme_read(n, req);
4167 case NVME_CMD_COMPARE:
4168 return nvme_compare(n, req);
4169 case NVME_CMD_DSM:
4170 return nvme_dsm(n, req);
4171 case NVME_CMD_VERIFY:
4172 return nvme_verify(n, req);
4173 case NVME_CMD_COPY:
4174 return nvme_copy(n, req);
4175 case NVME_CMD_ZONE_MGMT_SEND:
4176 return nvme_zone_mgmt_send(n, req);
4177 case NVME_CMD_ZONE_MGMT_RECV:
4178 return nvme_zone_mgmt_recv(n, req);
4179 default:
4180 assert(false);
4181 }
4182
4183 return NVME_INVALID_OPCODE | NVME_DNR;
4184}
4185
4186static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4187{
4188 n->sq[sq->sqid] = NULL;
4189 timer_free(sq->timer);
4190 g_free(sq->io_req);
4191 if (sq->sqid) {
4192 g_free(sq);
4193 }
4194}
4195
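/*
 * Deleting a submission queue drains it first: outstanding requests are
 * cancelled, pending completions are posted, and any requests still queued
 * on the completion queue are returned to the submission queue's free list
 * before it is torn down.
 */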
4196static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4197{
4198 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4199 NvmeRequest *r, *next;
4200 NvmeSQueue *sq;
4201 NvmeCQueue *cq;
4202 uint16_t qid = le16_to_cpu(c->qid);
4203
4204 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4205 trace_pci_nvme_err_invalid_del_sq(qid);
4206 return NVME_INVALID_QID | NVME_DNR;
4207 }
4208
4209 trace_pci_nvme_del_sq(qid);
4210
4211 sq = n->sq[qid];
4212 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4213 r = QTAILQ_FIRST(&sq->out_req_list);
4214 assert(r->aiocb);
4215 blk_aio_cancel(r->aiocb);
4216 }
4217
4218 assert(QTAILQ_EMPTY(&sq->out_req_list));
4219
4220 if (!nvme_check_cqid(n, sq->cqid)) {
4221 cq = n->cq[sq->cqid];
4222 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4223
4224 nvme_post_cqes(cq);
4225 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4226 if (r->sq == sq) {
4227 QTAILQ_REMOVE(&cq->req_list, r, entry);
4228 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4229 }
4230 }
4231 }
4232
4233 nvme_free_sq(sq, n);
4234 return NVME_SUCCESS;
4235}
4236
4237static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4238 uint16_t sqid, uint16_t cqid, uint16_t size)
4239{
4240 int i;
4241 NvmeCQueue *cq;
4242
4243 sq->ctrl = n;
4244 sq->dma_addr = dma_addr;
4245 sq->sqid = sqid;
4246 sq->size = size;
4247 sq->cqid = cqid;
4248 sq->head = sq->tail = 0;
4249 sq->io_req = g_new0(NvmeRequest, sq->size);
4250
4251 QTAILQ_INIT(&sq->req_list);
4252 QTAILQ_INIT(&sq->out_req_list);
4253 for (i = 0; i < sq->size; i++) {
4254 sq->io_req[i].sq = sq;
4255 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4256 }
4257 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
4258
4259 assert(n->cq[cqid]);
4260 cq = n->cq[cqid];
4261 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4262 n->sq[sqid] = sq;
4263}
4264
4265static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4266{
4267 NvmeSQueue *sq;
4268 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4269
4270 uint16_t cqid = le16_to_cpu(c->cqid);
4271 uint16_t sqid = le16_to_cpu(c->sqid);
4272 uint16_t qsize = le16_to_cpu(c->qsize);
4273 uint16_t qflags = le16_to_cpu(c->sq_flags);
4274 uint64_t prp1 = le64_to_cpu(c->prp1);
4275
4276 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4277
4278 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4279 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4280 return NVME_INVALID_CQID | NVME_DNR;
4281 }
4282 if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
4283 n->sq[sqid] != NULL)) {
4284 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4285 return NVME_INVALID_QID | NVME_DNR;
4286 }
4287 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4288 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4289 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4290 }
4291 if (unlikely(prp1 & (n->page_size - 1))) {
4292 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4293 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4294 }
4295 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4296 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4297 return NVME_INVALID_FIELD | NVME_DNR;
4298 }
4299 sq = g_malloc0(sizeof(*sq));
4300 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4301 return NVME_SUCCESS;
4302}
4303
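/*
 * SMART "data units" are reported in units of 1000 512-byte sectors,
 * rounded up. The accumulator below collects the raw block-layer byte and
 * op counts that nvme_smart_info() converts into that representation.
 */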
4304struct nvme_stats {
4305 uint64_t units_read;
4306 uint64_t units_written;
4307 uint64_t read_commands;
4308 uint64_t write_commands;
4309};
4310
4311static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4312{
4313 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4314
4315 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
4316 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
4317 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4318 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4319}
4320
4321static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4322 uint64_t off, NvmeRequest *req)
4323{
4324 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4325 struct nvme_stats stats = { 0 };
4326 NvmeSmartLog smart = { 0 };
4327 uint32_t trans_len;
4328 NvmeNamespace *ns;
4329 time_t current_ms;
4330
4331 if (off >= sizeof(smart)) {
4332 return NVME_INVALID_FIELD | NVME_DNR;
4333 }
4334
4335 if (nsid != 0xffffffff) {
4336 ns = nvme_ns(n, nsid);
4337 if (!ns) {
4338 return NVME_INVALID_NSID | NVME_DNR;
4339 }
4340 nvme_set_blk_stats(ns, &stats);
4341 } else {
4342 int i;
4343
4344 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4345 ns = nvme_ns(n, i);
4346 if (!ns) {
4347 continue;
4348 }
4349 nvme_set_blk_stats(ns, &stats);
4350 }
4351 }
4352
4353 trans_len = MIN(sizeof(smart) - off, buf_len);
4354 smart.critical_warning = n->smart_critical_warning;
4355
4356 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
4357 1000));
4358 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
4359 1000));
4360 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4361 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4362
4363 smart.temperature = cpu_to_le16(n->temperature);
4364
4365 if ((n->temperature >= n->features.temp_thresh_hi) ||
4366 (n->temperature <= n->features.temp_thresh_low)) {
4367 smart.critical_warning |= NVME_SMART_TEMPERATURE;
4368 }
4369
4370 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4371 smart.power_on_hours[0] =
4372 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4373
4374 if (!rae) {
4375 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4376 }
4377
4378 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4379}
4380
4381static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4382 NvmeRequest *req)
4383{
4384 uint32_t trans_len;
4385 NvmeFwSlotInfoLog fw_log = {
4386 .afi = 0x1,
4387 };
4388
4389 if (off >= sizeof(fw_log)) {
4390 return NVME_INVALID_FIELD | NVME_DNR;
4391 }
4392
4393 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4394 trans_len = MIN(sizeof(fw_log) - off, buf_len);
4395
4396 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4397}
4398
4399static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4400 uint64_t off, NvmeRequest *req)
4401{
4402 uint32_t trans_len;
4403 NvmeErrorLog errlog;
4404
4405 if (off >= sizeof(errlog)) {
4406 return NVME_INVALID_FIELD | NVME_DNR;
4407 }
4408
4409 if (!rae) {
4410 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4411 }
4412
4413 memset(&errlog, 0x0, sizeof(errlog));
4414 trans_len = MIN(sizeof(errlog) - off, buf_len);
4415
4416 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4417}
4418
4419static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4420 uint64_t off, NvmeRequest *req)
4421{
4422 uint32_t nslist[1024];
4423 uint32_t trans_len;
4424 int i = 0;
4425 uint32_t nsid;
4426
4427 if (off >= sizeof(nslist)) {
4428 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
4429 return NVME_INVALID_FIELD | NVME_DNR;
4430 }
4431
4432 memset(nslist, 0x0, sizeof(nslist));
4433 trans_len = MIN(sizeof(nslist) - off, buf_len);
4434
4435 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4436 NVME_CHANGED_NSID_SIZE) {
        /*
         * If more than 1024 namespaces have changed, the list cannot hold
         * them all; the spec requires the first entry of the log page to
         * be set to FFFFFFFFh instead.
         */
4441 if (i == ARRAY_SIZE(nslist)) {
4442 memset(nslist, 0x0, sizeof(nslist));
4443 nslist[0] = 0xffffffff;
4444 break;
4445 }
4446
4447 nslist[i++] = nsid;
4448 clear_bit(nsid, n->changed_nsids);
4449 }
4450
    /*
     * If the overflow marker was set, the loop above was left early; clear
     * the remaining bits so that stale entries are not reported again on
     * the next read of the log page.
     */
4455 if (nslist[0] == 0xffffffff) {
4456 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4457 }
4458
4459 if (!rae) {
4460 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4461 }
4462
4463 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4464}
4465
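/*
 * The Commands Supported and Effects log always reports the admin command
 * set; which I/O command set is reported depends on CC.CSS and, when all
 * supported command sets are enabled, on the CSI given in the command.
 */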
4466static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4467 uint64_t off, NvmeRequest *req)
4468{
4469 NvmeEffectsLog log = {};
4470 const uint32_t *src_iocs = NULL;
4471 uint32_t trans_len;
4472
4473 if (off >= sizeof(log)) {
4474 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4475 return NVME_INVALID_FIELD | NVME_DNR;
4476 }
4477
4478 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4479 case NVME_CC_CSS_NVM:
4480 src_iocs = nvme_cse_iocs_nvm;
        /* fall through */
4482 case NVME_CC_CSS_ADMIN_ONLY:
4483 break;
4484 case NVME_CC_CSS_CSI:
4485 switch (csi) {
4486 case NVME_CSI_NVM:
4487 src_iocs = nvme_cse_iocs_nvm;
4488 break;
4489 case NVME_CSI_ZONED:
4490 src_iocs = nvme_cse_iocs_zoned;
4491 break;
4492 }
4493 }
4494
4495 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4496
4497 if (src_iocs) {
4498 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4499 }
4500
4501 trans_len = MIN(sizeof(log) - off, buf_len);
4502
4503 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4504}
4505
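/*
 * Get Log Page: the transfer length is encoded in NUMDU/NUMDL as a
 * 0's-based dword count and the offset in LPOU/LPOL; both are validated
 * (dword alignment, MDTS) before dispatching on the log identifier.
 */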
4506static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4507{
4508 NvmeCmd *cmd = &req->cmd;
4509
4510 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4511 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4512 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4513 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4514 uint8_t lid = dw10 & 0xff;
4515 uint8_t lsp = (dw10 >> 8) & 0xf;
4516 uint8_t rae = (dw10 >> 15) & 0x1;
4517 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
4518 uint32_t numdl, numdu;
4519 uint64_t off, lpol, lpou;
4520 size_t len;
4521 uint16_t status;
4522
4523 numdl = (dw10 >> 16);
4524 numdu = (dw11 & 0xffff);
4525 lpol = dw12;
4526 lpou = dw13;
4527
4528 len = (((numdu << 16) | numdl) + 1) << 2;
4529 off = (lpou << 32ULL) | lpol;
4530
4531 if (off & 0x3) {
4532 return NVME_INVALID_FIELD | NVME_DNR;
4533 }
4534
4535 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4536
4537 status = nvme_check_mdts(n, len);
4538 if (status) {
4539 return status;
4540 }
4541
4542 switch (lid) {
4543 case NVME_LOG_ERROR_INFO:
4544 return nvme_error_info(n, rae, len, off, req);
4545 case NVME_LOG_SMART_INFO:
4546 return nvme_smart_info(n, rae, len, off, req);
4547 case NVME_LOG_FW_SLOT_INFO:
4548 return nvme_fw_log_info(n, len, off, req);
4549 case NVME_LOG_CHANGED_NSLIST:
4550 return nvme_changed_nslist(n, rae, len, off, req);
4551 case NVME_LOG_CMD_EFFECTS:
4552 return nvme_cmd_effects(n, csi, len, off, req);
4553 default:
4554 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4555 return NVME_INVALID_FIELD | NVME_DNR;
4556 }
4557}
4558
4559static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4560{
4561 n->cq[cq->cqid] = NULL;
4562 timer_free(cq->timer);
4563 if (msix_enabled(&n->parent_obj)) {
4564 msix_vector_unuse(&n->parent_obj, cq->vector);
4565 }
4566 if (cq->cqid) {
4567 g_free(cq);
4568 }
4569}
4570
4571static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4572{
4573 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4574 NvmeCQueue *cq;
4575 uint16_t qid = le16_to_cpu(c->qid);
4576
4577 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4578 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4579 return NVME_INVALID_CQID | NVME_DNR;
4580 }
4581
4582 cq = n->cq[qid];
4583 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4584 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4585 return NVME_INVALID_QUEUE_DEL;
4586 }
4587
4588 if (cq->irq_enabled && cq->tail != cq->head) {
4589 n->cq_pending--;
4590 }
4591
4592 nvme_irq_deassert(n, cq);
4593 trace_pci_nvme_del_cq(qid);
4594 nvme_free_cq(cq, n);
4595 return NVME_SUCCESS;
4596}
4597
4598static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4599 uint16_t cqid, uint16_t vector, uint16_t size,
4600 uint16_t irq_enabled)
4601{
4602 int ret;
4603
4604 if (msix_enabled(&n->parent_obj)) {
4605 ret = msix_vector_use(&n->parent_obj, vector);
4606 assert(ret == 0);
4607 }
4608 cq->ctrl = n;
4609 cq->cqid = cqid;
4610 cq->size = size;
4611 cq->dma_addr = dma_addr;
4612 cq->phase = 1;
4613 cq->irq_enabled = irq_enabled;
4614 cq->vector = vector;
4615 cq->head = cq->tail = 0;
4616 QTAILQ_INIT(&cq->req_list);
4617 QTAILQ_INIT(&cq->sq_list);
4618 n->cq[cqid] = cq;
4619 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4620}
4621
4622static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4623{
4624 NvmeCQueue *cq;
4625 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4626 uint16_t cqid = le16_to_cpu(c->cqid);
4627 uint16_t vector = le16_to_cpu(c->irq_vector);
4628 uint16_t qsize = le16_to_cpu(c->qsize);
4629 uint16_t qflags = le16_to_cpu(c->cq_flags);
4630 uint64_t prp1 = le64_to_cpu(c->prp1);
4631
4632 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4633 NVME_CQ_FLAGS_IEN(qflags) != 0);
4634
4635 if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4636 n->cq[cqid] != NULL)) {
4637 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4638 return NVME_INVALID_QID | NVME_DNR;
4639 }
4640 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4641 trace_pci_nvme_err_invalid_create_cq_size(qsize);
4642 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4643 }
4644 if (unlikely(prp1 & (n->page_size - 1))) {
4645 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4646 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4647 }
4648 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4649 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4650 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4651 }
4652 if (unlikely(vector >= n->params.msix_qsize)) {
4653 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4654 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4655 }
4656 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4657 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4658 return NVME_INVALID_FIELD | NVME_DNR;
4659 }
4660
4661 cq = g_malloc0(sizeof(*cq));
4662 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4663 NVME_CQ_FLAGS_IEN(qflags));
4664
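    /*
     * It is only required to set qs_created when creating a completion
     * queue; creating a submission queue without a matching completion
     * queue will fail.
     */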
4670 n->qs_created = true;
4671 return NVME_SUCCESS;
4672}
4673
4674static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4675{
4676 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4677
4678 return nvme_c2h(n, id, sizeof(id), req);
4679}
4680
4681static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4682{
4683 trace_pci_nvme_identify_ctrl();
4684
4685 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4686}
4687
4688static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4689{
4690 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4691 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4692 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4693
4694 trace_pci_nvme_identify_ctrl_csi(c->csi);
4695
4696 switch (c->csi) {
4697 case NVME_CSI_NVM:
4698 id_nvm->vsl = n->params.vsl;
4699 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4700 break;
4701
4702 case NVME_CSI_ZONED:
4703 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4704 break;
4705
4706 default:
4707 return NVME_INVALID_FIELD | NVME_DNR;
4708 }
4709
4710 return nvme_c2h(n, id, sizeof(id), req);
4711}
4712
4713static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4714{
4715 NvmeNamespace *ns;
4716 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4717 uint32_t nsid = le32_to_cpu(c->nsid);
4718
4719 trace_pci_nvme_identify_ns(nsid);
4720
4721 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4722 return NVME_INVALID_NSID | NVME_DNR;
4723 }
4724
4725 ns = nvme_ns(n, nsid);
4726 if (unlikely(!ns)) {
4727 if (!active) {
4728 ns = nvme_subsys_ns(n->subsys, nsid);
4729 if (!ns) {
4730 return nvme_rpt_empty_id_struct(n, req);
4731 }
4732 } else {
4733 return nvme_rpt_empty_id_struct(n, req);
4734 }
4735 }
4736
4737 if (active || ns->csi == NVME_CSI_NVM) {
4738 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4739 }
4740
4741 return NVME_INVALID_CMD_SET | NVME_DNR;
4742}
4743
4744static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
4745 bool attached)
4746{
4747 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4748 uint32_t nsid = le32_to_cpu(c->nsid);
4749 uint16_t min_id = le16_to_cpu(c->ctrlid);
4750 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4751 uint16_t *ids = &list[1];
4752 NvmeNamespace *ns;
4753 NvmeCtrl *ctrl;
4754 int cntlid, nr_ids = 0;
4755
4756 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
4757
4758 if (!n->subsys) {
4759 return NVME_INVALID_FIELD | NVME_DNR;
4760 }
4761
4762 if (attached) {
4763 if (nsid == NVME_NSID_BROADCAST) {
4764 return NVME_INVALID_FIELD | NVME_DNR;
4765 }
4766
4767 ns = nvme_subsys_ns(n->subsys, nsid);
4768 if (!ns) {
4769 return NVME_INVALID_FIELD | NVME_DNR;
4770 }
4771 }
4772
4773 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4774 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4775 if (!ctrl) {
4776 continue;
4777 }
4778
4779 if (attached && !nvme_ns(ctrl, nsid)) {
4780 continue;
4781 }
4782
4783 ids[nr_ids++] = cntlid;
4784 }
4785
4786 list[0] = nr_ids;
4787
4788 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4789}
4790
4791static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4792 bool active)
4793{
4794 NvmeNamespace *ns;
4795 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4796 uint32_t nsid = le32_to_cpu(c->nsid);
4797
4798 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4799
4800 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4801 return NVME_INVALID_NSID | NVME_DNR;
4802 }
4803
4804 ns = nvme_ns(n, nsid);
4805 if (unlikely(!ns)) {
4806 if (!active) {
4807 ns = nvme_subsys_ns(n->subsys, nsid);
4808 if (!ns) {
4809 return nvme_rpt_empty_id_struct(n, req);
4810 }
4811 } else {
4812 return nvme_rpt_empty_id_struct(n, req);
4813 }
4814 }
4815
4816 if (c->csi == NVME_CSI_NVM) {
4817 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
4818 req);
4819 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4820 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4821 req);
4822 }
4823
4824 return NVME_INVALID_FIELD | NVME_DNR;
4825}
4826
4827static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4828 bool active)
4829{
4830 NvmeNamespace *ns;
4831 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4832 uint32_t min_nsid = le32_to_cpu(c->nsid);
4833 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4834 static const int data_len = sizeof(list);
4835 uint32_t *list_ptr = (uint32_t *)list;
4836 int i, j = 0;
4837
4838 trace_pci_nvme_identify_nslist(min_nsid);
4839
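    /*
     * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
     * since the Active Namespace ID List should return namespaces with ids
     * *higher* than the NSID specified in the command. This is also specified
     * in the spec (NVM Express v1.3d, Section 5.15.4).
     */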
4846 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4847 return NVME_INVALID_NSID | NVME_DNR;
4848 }
4849
4850 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4851 ns = nvme_ns(n, i);
4852 if (!ns) {
4853 if (!active) {
4854 ns = nvme_subsys_ns(n->subsys, i);
4855 if (!ns) {
4856 continue;
4857 }
4858 } else {
4859 continue;
4860 }
4861 }
4862 if (ns->params.nsid <= min_nsid) {
4863 continue;
4864 }
4865 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4866 if (j == data_len / sizeof(uint32_t)) {
4867 break;
4868 }
4869 }
4870
4871 return nvme_c2h(n, list, data_len, req);
4872}
4873
4874static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4875 bool active)
4876{
4877 NvmeNamespace *ns;
4878 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4879 uint32_t min_nsid = le32_to_cpu(c->nsid);
4880 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4881 static const int data_len = sizeof(list);
4882 uint32_t *list_ptr = (uint32_t *)list;
4883 int i, j = 0;
4884
4885 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4886
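    /*
     * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
     */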
4890 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4891 return NVME_INVALID_NSID | NVME_DNR;
4892 }
4893
4894 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4895 return NVME_INVALID_FIELD | NVME_DNR;
4896 }
4897
4898 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4899 ns = nvme_ns(n, i);
4900 if (!ns) {
4901 if (!active) {
4902 ns = nvme_subsys_ns(n->subsys, i);
4903 if (!ns) {
4904 continue;
4905 }
4906 } else {
4907 continue;
4908 }
4909 }
4910 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4911 continue;
4912 }
4913 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4914 if (j == data_len / sizeof(uint32_t)) {
4915 break;
4916 }
4917 }
4918
4919 return nvme_c2h(n, list, data_len, req);
4920}
4921
4922static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4923{
4924 NvmeNamespace *ns;
4925 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4926 uint32_t nsid = le32_to_cpu(c->nsid);
4927 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4928 uint8_t *pos = list;
4929 struct {
4930 NvmeIdNsDescr hdr;
4931 uint8_t v[NVME_NIDL_UUID];
4932 } QEMU_PACKED uuid = {};
4933 struct {
4934 NvmeIdNsDescr hdr;
4935 uint64_t v;
4936 } QEMU_PACKED eui64 = {};
4937 struct {
4938 NvmeIdNsDescr hdr;
4939 uint8_t v;
4940 } QEMU_PACKED csi = {};
4941
4942 trace_pci_nvme_identify_ns_descr_list(nsid);
4943
4944 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4945 return NVME_INVALID_NSID | NVME_DNR;
4946 }
4947
4948 ns = nvme_ns(n, nsid);
4949 if (unlikely(!ns)) {
4950 return NVME_INVALID_FIELD | NVME_DNR;
4951 }
4952
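    /*
     * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
     * provide a valid Namespace UUID in the Namespace Identification
     * Descriptor data structure.
     */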
4958 uuid.hdr.nidt = NVME_NIDT_UUID;
4959 uuid.hdr.nidl = NVME_NIDL_UUID;
4960 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4961 memcpy(pos, &uuid, sizeof(uuid));
4962 pos += sizeof(uuid);
4963
4964 if (ns->params.eui64) {
4965 eui64.hdr.nidt = NVME_NIDT_EUI64;
4966 eui64.hdr.nidl = NVME_NIDL_EUI64;
4967 eui64.v = cpu_to_be64(ns->params.eui64);
4968 memcpy(pos, &eui64, sizeof(eui64));
4969 pos += sizeof(eui64);
4970 }
4971
4972 csi.hdr.nidt = NVME_NIDT_CSI;
4973 csi.hdr.nidl = NVME_NIDL_CSI;
4974 csi.v = ns->csi;
4975 memcpy(pos, &csi, sizeof(csi));
4976 pos += sizeof(csi);
4977
4978 return nvme_c2h(n, list, sizeof(list), req);
4979}
4980
4981static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4982{
4983 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4984 static const int data_len = sizeof(list);
4985
4986 trace_pci_nvme_identify_cmd_set();
4987
4988 NVME_SET_CSI(*list, NVME_CSI_NVM);
4989 NVME_SET_CSI(*list, NVME_CSI_ZONED);
4990
4991 return nvme_c2h(n, list, data_len, req);
4992}
4993
4994static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4995{
4996 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4997
4998 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4999 c->csi);
5000
5001 switch (c->cns) {
5002 case NVME_ID_CNS_NS:
5003 return nvme_identify_ns(n, req, true);
5004 case NVME_ID_CNS_NS_PRESENT:
5005 return nvme_identify_ns(n, req, false);
5006 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
5007 return nvme_identify_ctrl_list(n, req, true);
5008 case NVME_ID_CNS_CTRL_LIST:
5009 return nvme_identify_ctrl_list(n, req, false);
5010 case NVME_ID_CNS_CS_NS:
5011 return nvme_identify_ns_csi(n, req, true);
5012 case NVME_ID_CNS_CS_NS_PRESENT:
5013 return nvme_identify_ns_csi(n, req, false);
5014 case NVME_ID_CNS_CTRL:
5015 return nvme_identify_ctrl(n, req);
5016 case NVME_ID_CNS_CS_CTRL:
5017 return nvme_identify_ctrl_csi(n, req);
5018 case NVME_ID_CNS_NS_ACTIVE_LIST:
5019 return nvme_identify_nslist(n, req, true);
5020 case NVME_ID_CNS_NS_PRESENT_LIST:
5021 return nvme_identify_nslist(n, req, false);
5022 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
5023 return nvme_identify_nslist_csi(n, req, true);
5024 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
5025 return nvme_identify_nslist_csi(n, req, false);
5026 case NVME_ID_CNS_NS_DESCR_LIST:
5027 return nvme_identify_ns_descr_list(n, req);
5028 case NVME_ID_CNS_IO_COMMAND_SET:
5029 return nvme_identify_cmd_set(n, req);
5030 default:
5031 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
5032 return NVME_INVALID_FIELD | NVME_DNR;
5033 }
5034}
5035
5036static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
5037{
5038 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
5039
5040 req->cqe.result = 1;
5041 if (nvme_check_sqid(n, sqid)) {
5042 return NVME_INVALID_FIELD | NVME_DNR;
5043 }
5044
5045 return NVME_SUCCESS;
5046}
5047
5048static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
5049{
5050 trace_pci_nvme_setfeat_timestamp(ts);
5051
5052 n->host_timestamp = le64_to_cpu(ts);
5053 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5054}
5055
5056static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
5057{
5058 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5059 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
5060
5061 union nvme_timestamp {
5062 struct {
5063 uint64_t timestamp:48;
5064 uint64_t sync:1;
5065 uint64_t origin:3;
5066 uint64_t rsvd1:12;
5067 };
5068 uint64_t all;
5069 };
5070
5071 union nvme_timestamp ts;
5072 ts.all = 0;
5073 ts.timestamp = n->host_timestamp + elapsed_time;
5074
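    /* If the host timestamp is non-zero, set the timestamp origin */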
5076 ts.origin = n->host_timestamp ? 0x01 : 0x00;
5077
5078 trace_pci_nvme_getfeat_timestamp(ts.all);
5079
5080 return cpu_to_le64(ts.all);
5081}
5082
5083static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
5084{
5085 uint64_t timestamp = nvme_get_timestamp(n);
5086
    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
5088}
5089
5090static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
5091{
5092 NvmeCmd *cmd = &req->cmd;
5093 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5094 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5095 uint32_t nsid = le32_to_cpu(cmd->nsid);
5096 uint32_t result;
5097 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5098 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
5099 uint16_t iv;
5100 NvmeNamespace *ns;
5101 int i;
5102
5103 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
5104 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
5105 };
5106
5107 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
5108
5109 if (!nvme_feature_support[fid]) {
5110 return NVME_INVALID_FIELD | NVME_DNR;
5111 }
5112
5113 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5114 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
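            /*
             * The Reservation Notification Mask and Reservation Persistence
             * features require a status code of Invalid Field in Command when
             * NSID is FFFFFFFFh. Since the device does not support the
             * Reservation Notification Mask and Reservation Persistence
             * features, we can always return Invalid Namespace or Format as
             * we should do for all other features.
             */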
5122 return NVME_INVALID_NSID | NVME_DNR;
5123 }
5124
5125 if (!nvme_ns(n, nsid)) {
5126 return NVME_INVALID_FIELD | NVME_DNR;
5127 }
5128 }
5129
5130 switch (sel) {
5131 case NVME_GETFEAT_SELECT_CURRENT:
5132 break;
5133 case NVME_GETFEAT_SELECT_SAVED:
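        /* no features are saveable by the controller; fall through */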
5135 case NVME_GETFEAT_SELECT_DEFAULT:
5136 goto defaults;
5137 case NVME_GETFEAT_SELECT_CAP:
5138 result = nvme_feature_cap[fid];
5139 goto out;
5140 }
5141
5142 switch (fid) {
5143 case NVME_TEMPERATURE_THRESHOLD:
5144 result = 0;
5145
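        /*
         * The controller only implements the Composite Temperature sensor, so
         * return 0 for all other sensors.
         */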
5150 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5151 goto out;
5152 }
5153
5154 switch (NVME_TEMP_THSEL(dw11)) {
5155 case NVME_TEMP_THSEL_OVER:
5156 result = n->features.temp_thresh_hi;
5157 goto out;
5158 case NVME_TEMP_THSEL_UNDER:
5159 result = n->features.temp_thresh_low;
5160 goto out;
5161 }
5162
5163 return NVME_INVALID_FIELD | NVME_DNR;
5164 case NVME_ERROR_RECOVERY:
5165 if (!nvme_nsid_valid(n, nsid)) {
5166 return NVME_INVALID_NSID | NVME_DNR;
5167 }
5168
5169 ns = nvme_ns(n, nsid);
5170 if (unlikely(!ns)) {
5171 return NVME_INVALID_FIELD | NVME_DNR;
5172 }
5173
5174 result = ns->features.err_rec;
5175 goto out;
5176 case NVME_VOLATILE_WRITE_CACHE:
5177 result = 0;
5178 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5179 ns = nvme_ns(n, i);
5180 if (!ns) {
5181 continue;
5182 }
5183
5184 result = blk_enable_write_cache(ns->blkconf.blk);
5185 if (result) {
5186 break;
5187 }
5188 }
5189 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
5190 goto out;
5191 case NVME_ASYNCHRONOUS_EVENT_CONF:
5192 result = n->features.async_config;
5193 goto out;
5194 case NVME_TIMESTAMP:
5195 return nvme_get_feature_timestamp(n, req);
5196 case NVME_HOST_BEHAVIOR_SUPPORT:
5197 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
5198 sizeof(n->features.hbs), req);
5199 default:
5200 break;
5201 }
5202
5203defaults:
5204 switch (fid) {
5205 case NVME_TEMPERATURE_THRESHOLD:
5206 result = 0;
5207
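        /* only the Composite Temperature sensor is implemented */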
5208 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5209 break;
5210 }
5211
5212 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
5213 result = NVME_TEMPERATURE_WARNING;
5214 }
5215
5216 break;
5217 case NVME_NUMBER_OF_QUEUES:
5218 result = (n->params.max_ioqpairs - 1) |
5219 ((n->params.max_ioqpairs - 1) << 16);
5220 trace_pci_nvme_getfeat_numq(result);
5221 break;
5222 case NVME_INTERRUPT_VECTOR_CONF:
5223 iv = dw11 & 0xffff;
5224 if (iv >= n->params.max_ioqpairs + 1) {
5225 return NVME_INVALID_FIELD | NVME_DNR;
5226 }
5227
5228 result = iv;
5229 if (iv == n->admin_cq.vector) {
5230 result |= NVME_INTVC_NOCOALESCING;
5231 }
5232 break;
5233 default:
5234 result = nvme_feature_default[fid];
5235 break;
5236 }
5237
5238out:
5239 req->cqe.result = cpu_to_le32(result);
5240 return NVME_SUCCESS;
5241}
5242
5243static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
5244{
5245 uint16_t ret;
5246 uint64_t timestamp;
5247
    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
5249 if (ret) {
5250 return ret;
5251 }
5252
5253 nvme_set_timestamp(n, timestamp);
5254
5255 return NVME_SUCCESS;
5256}
5257
5258static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
5259{
5260 NvmeNamespace *ns = NULL;
5261
5262 NvmeCmd *cmd = &req->cmd;
5263 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5264 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5265 uint32_t nsid = le32_to_cpu(cmd->nsid);
5266 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5267 uint8_t save = NVME_SETFEAT_SAVE(dw10);
5268 uint16_t status;
5269 int i;
5270
5271 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5272
5273 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5274 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5275 }
5276
5277 if (!nvme_feature_support[fid]) {
5278 return NVME_INVALID_FIELD | NVME_DNR;
5279 }
5280
5281 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5282 if (nsid != NVME_NSID_BROADCAST) {
5283 if (!nvme_nsid_valid(n, nsid)) {
5284 return NVME_INVALID_NSID | NVME_DNR;
5285 }
5286
5287 ns = nvme_ns(n, nsid);
5288 if (unlikely(!ns)) {
5289 return NVME_INVALID_FIELD | NVME_DNR;
5290 }
5291 }
5292 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5293 if (!nvme_nsid_valid(n, nsid)) {
5294 return NVME_INVALID_NSID | NVME_DNR;
5295 }
5296
5297 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5298 }
5299
5300 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5301 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5302 }
5303
5304 switch (fid) {
5305 case NVME_TEMPERATURE_THRESHOLD:
5306 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5307 break;
5308 }
5309
5310 switch (NVME_TEMP_THSEL(dw11)) {
5311 case NVME_TEMP_THSEL_OVER:
5312 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5313 break;
5314 case NVME_TEMP_THSEL_UNDER:
5315 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5316 break;
5317 default:
5318 return NVME_INVALID_FIELD | NVME_DNR;
5319 }
5320
5321 if ((n->temperature >= n->features.temp_thresh_hi) ||
5322 (n->temperature <= n->features.temp_thresh_low)) {
5323 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
5324 }
5325
5326 break;
5327 case NVME_ERROR_RECOVERY:
5328 if (nsid == NVME_NSID_BROADCAST) {
5329 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5330 ns = nvme_ns(n, i);
5331
5332 if (!ns) {
5333 continue;
5334 }
5335
5336 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5337 ns->features.err_rec = dw11;
5338 }
5339 }
5340
5341 break;
5342 }
5343
5344 assert(ns);
5345 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5346 ns->features.err_rec = dw11;
5347 }
5348 break;
5349 case NVME_VOLATILE_WRITE_CACHE:
5350 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5351 ns = nvme_ns(n, i);
5352 if (!ns) {
5353 continue;
5354 }
5355
5356 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5357 blk_flush(ns->blkconf.blk);
5358 }
5359
5360 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5361 }
5362
5363 break;
5364
5365 case NVME_NUMBER_OF_QUEUES:
5366 if (n->qs_created) {
5367 return NVME_CMD_SEQ_ERROR | NVME_DNR;
5368 }
5369
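        /*
         * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
         * and NSQR.
         */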
5374 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5375 return NVME_INVALID_FIELD | NVME_DNR;
5376 }
5377
5378 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5379 ((dw11 >> 16) & 0xffff) + 1,
5380 n->params.max_ioqpairs,
5381 n->params.max_ioqpairs);
5382 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
5383 ((n->params.max_ioqpairs - 1) << 16));
5384 break;
5385 case NVME_ASYNCHRONOUS_EVENT_CONF:
5386 n->features.async_config = dw11;
5387 break;
5388 case NVME_TIMESTAMP:
5389 return nvme_set_feature_timestamp(n, req);
5390 case NVME_HOST_BEHAVIOR_SUPPORT:
5391 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
5392 sizeof(n->features.hbs), req);
5393 if (status) {
5394 return status;
5395 }
5396
5397 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5398 ns = nvme_ns(n, i);
5399
5400 if (!ns) {
5401 continue;
5402 }
5403
5404 ns->id_ns.nlbaf = ns->nlbaf - 1;
5405 if (!n->features.hbs.lbafee) {
5406 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
5407 }
5408 }
5409
5410 return status;
5411 case NVME_COMMAND_SET_PROFILE:
5412 if (dw11 & 0x1ff) {
5413 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5414 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5415 }
5416 break;
5417 default:
5418 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5419 }
5420 return NVME_SUCCESS;
5421}
5422
5423static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5424{
5425 trace_pci_nvme_aer(nvme_cid(req));
5426
5427 if (n->outstanding_aers > n->params.aerl) {
5428 trace_pci_nvme_aer_aerl_exceeded();
5429 return NVME_AER_LIMIT_EXCEEDED;
5430 }
5431
5432 n->aer_reqs[n->outstanding_aers] = req;
5433 n->outstanding_aers++;
5434
5435 if (!QTAILQ_EMPTY(&n->aer_queue)) {
5436 nvme_process_aers(n);
5437 }
5438
5439 return NVME_NO_COMPLETE;
5440}
5441
5442static void nvme_update_dmrsl(NvmeCtrl *n)
5443{
5444 int nsid;
5445
5446 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5447 NvmeNamespace *ns = nvme_ns(n, nsid);
5448 if (!ns) {
5449 continue;
5450 }
5451
5452 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5453 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5454 }
5455}
5456
5457static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5458{
5459 uint32_t cc = ldl_le_p(&n->bar.cc);
5460
5461 ns->iocs = nvme_cse_iocs_none;
5462 switch (ns->csi) {
5463 case NVME_CSI_NVM:
5464 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5465 ns->iocs = nvme_cse_iocs_nvm;
5466 }
5467 break;
5468 case NVME_CSI_ZONED:
5469 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5470 ns->iocs = nvme_cse_iocs_zoned;
5471 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5472 ns->iocs = nvme_cse_iocs_nvm;
5473 }
5474 break;
5475 }
5476}
5477
5478static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5479{
5480 NvmeNamespace *ns;
5481 NvmeCtrl *ctrl;
5482 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5483 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5484 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5485 uint8_t sel = dw10 & 0xf;
5486 uint16_t *nr_ids = &list[0];
5487 uint16_t *ids = &list[1];
5488 uint16_t ret;
5489 int i;
5490
5491 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5492
5493 if (!nvme_nsid_valid(n, nsid)) {
5494 return NVME_INVALID_NSID | NVME_DNR;
5495 }
5496
5497 ns = nvme_subsys_ns(n->subsys, nsid);
5498 if (!ns) {
5499 return NVME_INVALID_FIELD | NVME_DNR;
5500 }
5501
5502 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
5503 if (ret) {
5504 return ret;
5505 }
5506
5507 if (!*nr_ids) {
5508 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5509 }
5510
5511 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5512 for (i = 0; i < *nr_ids; i++) {
5513 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5514 if (!ctrl) {
5515 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5516 }
5517
5518 switch (sel) {
5519 case NVME_NS_ATTACHMENT_ATTACH:
5520 if (nvme_ns(ctrl, nsid)) {
5521 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5522 }
5523
5524 if (ns->attached && !ns->params.shared) {
5525 return NVME_NS_PRIVATE | NVME_DNR;
5526 }
5527
5528 nvme_attach_ns(ctrl, ns);
5529 nvme_select_iocs_ns(ctrl, ns);
5530
5531 break;
5532
5533 case NVME_NS_ATTACHMENT_DETACH:
5534 if (!nvme_ns(ctrl, nsid)) {
5535 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5536 }
5537
5538 ctrl->namespaces[nsid] = NULL;
5539 ns->attached--;
5540
5541 nvme_update_dmrsl(ctrl);
5542
5543 break;
5544
5545 default:
5546 return NVME_INVALID_FIELD | NVME_DNR;
5547 }
5548
5549
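        /*
         * Add the namespace id to the changed namespace id list for event
         * clearing via the Get Log Page command.
         */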
5553 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5554 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5555 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5556 NVME_LOG_CHANGED_NSLIST);
5557 }
5558 }
5559
5560 return NVME_SUCCESS;
5561}
5562
5563typedef struct NvmeFormatAIOCB {
5564 BlockAIOCB common;
5565 BlockAIOCB *aiocb;
5566 QEMUBH *bh;
5567 NvmeRequest *req;
5568 int ret;
5569
5570 NvmeNamespace *ns;
5571 uint32_t nsid;
5572 bool broadcast;
5573 int64_t offset;
5574
5575 uint8_t lbaf;
5576 uint8_t mset;
5577 uint8_t pi;
5578 uint8_t pil;
5579} NvmeFormatAIOCB;
5580
5581static void nvme_format_bh(void *opaque);
5582
5583static void nvme_format_cancel(BlockAIOCB *aiocb)
5584{
5585 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5586
5587 if (iocb->aiocb) {
5588 blk_aio_cancel_async(iocb->aiocb);
5589 }
5590}
5591
5592static const AIOCBInfo nvme_format_aiocb_info = {
5593 .aiocb_size = sizeof(NvmeFormatAIOCB),
5594 .cancel_async = nvme_format_cancel,
5595 .get_aio_context = nvme_get_aio_context,
5596};
5597
5598static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
5599 uint8_t pi, uint8_t pil)
5600{
5601 uint8_t lbafl = lbaf & 0xf;
5602 uint8_t lbafu = lbaf >> 4;
5603
5604 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5605
5606 ns->id_ns.dps = (pil << 3) | pi;
5607 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
5608
5609 nvme_ns_init_format(ns);
5610}
5611
5612static void nvme_format_ns_cb(void *opaque, int ret)
5613{
5614 NvmeFormatAIOCB *iocb = opaque;
5615 NvmeNamespace *ns = iocb->ns;
5616 int bytes;
5617
5618 if (ret < 0) {
5619 iocb->ret = ret;
5620 goto done;
5621 }
5622
5623 assert(ns);
5624
5625 if (iocb->offset < ns->size) {
5626 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5627
5628 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5629 bytes, BDRV_REQ_MAY_UNMAP,
5630 nvme_format_ns_cb, iocb);
5631
5632 iocb->offset += bytes;
5633 return;
5634 }
5635
5636 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
5637 ns->status = 0x0;
5638 iocb->ns = NULL;
5639 iocb->offset = 0;
5640
5641done:
5642 iocb->aiocb = NULL;
5643 qemu_bh_schedule(iocb->bh);
5644}
5645
5646static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5647{
5648 if (ns->params.zoned) {
5649 return NVME_INVALID_FORMAT | NVME_DNR;
5650 }
5651
5652 if (lbaf > ns->id_ns.nlbaf) {
5653 return NVME_INVALID_FORMAT | NVME_DNR;
5654 }
5655
5656 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
5657 return NVME_INVALID_FORMAT | NVME_DNR;
5658 }
5659
5660 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5661 return NVME_INVALID_FIELD | NVME_DNR;
5662 }
5663
5664 return NVME_SUCCESS;
5665}
5666
5667static void nvme_format_bh(void *opaque)
5668{
5669 NvmeFormatAIOCB *iocb = opaque;
5670 NvmeRequest *req = iocb->req;
5671 NvmeCtrl *n = nvme_ctrl(req);
5672 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5673 uint8_t lbaf = dw10 & 0xf;
5674 uint8_t pi = (dw10 >> 5) & 0x7;
5675 uint16_t status;
5676 int i;
5677
5678 if (iocb->ret < 0) {
5679 goto done;
5680 }
5681
5682 if (iocb->broadcast) {
5683 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5684 iocb->ns = nvme_ns(n, i);
5685 if (iocb->ns) {
5686 iocb->nsid = i;
5687 break;
5688 }
5689 }
5690 }
5691
5692 if (!iocb->ns) {
5693 goto done;
5694 }
5695
5696 status = nvme_format_check(iocb->ns, lbaf, pi);
5697 if (status) {
5698 req->status = status;
5699 goto done;
5700 }
5701
5702 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5703 nvme_format_ns_cb(iocb, 0);
5704 return;
5705
5706done:
5707 qemu_bh_delete(iocb->bh);
5708 iocb->bh = NULL;
5709
5710 iocb->common.cb(iocb->common.opaque, iocb->ret);
5711
5712 qemu_aio_unref(iocb);
5713}
5714
5715static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5716{
5717 NvmeFormatAIOCB *iocb;
5718 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5719 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5720 uint8_t lbaf = dw10 & 0xf;
5721 uint8_t mset = (dw10 >> 4) & 0x1;
5722 uint8_t pi = (dw10 >> 5) & 0x7;
5723 uint8_t pil = (dw10 >> 8) & 0x1;
5724 uint8_t lbafu = (dw10 >> 12) & 0x3;
5725 uint16_t status;
5726
5727 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5728
5729 iocb->req = req;
5730 iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5731 iocb->ret = 0;
5732 iocb->ns = NULL;
5733 iocb->nsid = 0;
5734 iocb->lbaf = lbaf;
5735 iocb->mset = mset;
5736 iocb->pi = pi;
5737 iocb->pil = pil;
5738 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5739 iocb->offset = 0;
5740
5741 if (n->features.hbs.lbafee) {
5742 iocb->lbaf |= lbafu << 4;
5743 }
5744
5745 if (!iocb->broadcast) {
5746 if (!nvme_nsid_valid(n, nsid)) {
5747 status = NVME_INVALID_NSID | NVME_DNR;
5748 goto out;
5749 }
5750
5751 iocb->ns = nvme_ns(n, nsid);
5752 if (!iocb->ns) {
5753 status = NVME_INVALID_FIELD | NVME_DNR;
5754 goto out;
5755 }
5756 }
5757
5758 req->aiocb = &iocb->common;
5759 qemu_bh_schedule(iocb->bh);
5760
5761 return NVME_NO_COMPLETE;
5762
5763out:
5764 qemu_bh_delete(iocb->bh);
5765 iocb->bh = NULL;
5766 qemu_aio_unref(iocb);
5767 return status;
5768}
5769
5770static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5771{
5772 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5773 nvme_adm_opc_str(req->cmd.opcode));
5774
5775 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5776 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5777 return NVME_INVALID_OPCODE | NVME_DNR;
5778 }
5779
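    /* SGLs shall not be used for Admin commands in NVMe over PCIe */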
5781 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5782 return NVME_INVALID_FIELD | NVME_DNR;
5783 }
5784
5785 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
5786 return NVME_INVALID_FIELD;
5787 }
5788
5789 switch (req->cmd.opcode) {
5790 case NVME_ADM_CMD_DELETE_SQ:
5791 return nvme_del_sq(n, req);
5792 case NVME_ADM_CMD_CREATE_SQ:
5793 return nvme_create_sq(n, req);
5794 case NVME_ADM_CMD_GET_LOG_PAGE:
5795 return nvme_get_log(n, req);
5796 case NVME_ADM_CMD_DELETE_CQ:
5797 return nvme_del_cq(n, req);
5798 case NVME_ADM_CMD_CREATE_CQ:
5799 return nvme_create_cq(n, req);
5800 case NVME_ADM_CMD_IDENTIFY:
5801 return nvme_identify(n, req);
5802 case NVME_ADM_CMD_ABORT:
5803 return nvme_abort(n, req);
5804 case NVME_ADM_CMD_SET_FEATURES:
5805 return nvme_set_feature(n, req);
5806 case NVME_ADM_CMD_GET_FEATURES:
5807 return nvme_get_feature(n, req);
5808 case NVME_ADM_CMD_ASYNC_EV_REQ:
5809 return nvme_aer(n, req);
5810 case NVME_ADM_CMD_NS_ATTACHMENT:
5811 return nvme_ns_attachment(n, req);
5812 case NVME_ADM_CMD_FORMAT_NVM:
5813 return nvme_format(n, req);
5814 default:
5815 assert(false);
5816 }
5817
5818 return NVME_INVALID_OPCODE | NVME_DNR;
5819}
5820
5821static void nvme_process_sq(void *opaque)
5822{
5823 NvmeSQueue *sq = opaque;
5824 NvmeCtrl *n = sq->ctrl;
5825 NvmeCQueue *cq = n->cq[sq->cqid];
5826
5827 uint16_t status;
5828 hwaddr addr;
5829 NvmeCmd cmd;
5830 NvmeRequest *req;
5831
5832 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5833 addr = sq->dma_addr + sq->head * n->sqe_size;
5834 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5835 trace_pci_nvme_err_addr_read(addr);
5836 trace_pci_nvme_err_cfs();
5837 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
5838 break;
5839 }
5840 nvme_inc_sq_head(sq);
5841
5842 req = QTAILQ_FIRST(&sq->req_list);
5843 QTAILQ_REMOVE(&sq->req_list, req, entry);
5844 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5845 nvme_req_clear(req);
5846 req->cqe.cid = cmd.cid;
5847 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5848
5849 status = sq->sqid ? nvme_io_cmd(n, req) :
5850 nvme_admin_cmd(n, req);
5851 if (status != NVME_NO_COMPLETE) {
5852 req->status = status;
5853 nvme_enqueue_req_completion(cq, req);
5854 }
5855 }
5856}
5857
5858static void nvme_ctrl_reset(NvmeCtrl *n)
5859{
5860 NvmeNamespace *ns;
5861 int i;
5862
5863 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5864 ns = nvme_ns(n, i);
5865 if (!ns) {
5866 continue;
5867 }
5868
5869 nvme_ns_drain(ns);
5870 }
5871
5872 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5873 if (n->sq[i] != NULL) {
5874 nvme_free_sq(n->sq[i], n);
5875 }
5876 }
5877 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5878 if (n->cq[i] != NULL) {
5879 nvme_free_cq(n->cq[i], n);
5880 }
5881 }
5882
5883 while (!QTAILQ_EMPTY(&n->aer_queue)) {
5884 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5885 QTAILQ_REMOVE(&n->aer_queue, event, entry);
5886 g_free(event);
5887 }
5888
5889 n->aer_queued = 0;
5890 n->outstanding_aers = 0;
5891 n->qs_created = false;
5892}
5893
5894static void nvme_ctrl_shutdown(NvmeCtrl *n)
5895{
5896 NvmeNamespace *ns;
5897 int i;
5898
5899 if (n->pmr.dev) {
5900 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5901 }
5902
5903 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5904 ns = nvme_ns(n, i);
5905 if (!ns) {
5906 continue;
5907 }
5908
5909 nvme_ns_shutdown(ns);
5910 }
5911}
5912
5913static void nvme_select_iocs(NvmeCtrl *n)
5914{
5915 NvmeNamespace *ns;
5916 int i;
5917
5918 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5919 ns = nvme_ns(n, i);
5920 if (!ns) {
5921 continue;
5922 }
5923
5924 nvme_select_iocs_ns(n, ns);
5925 }
5926}
5927
5928static int nvme_start_ctrl(NvmeCtrl *n)
5929{
5930 uint64_t cap = ldq_le_p(&n->bar.cap);
5931 uint32_t cc = ldl_le_p(&n->bar.cc);
5932 uint32_t aqa = ldl_le_p(&n->bar.aqa);
5933 uint64_t asq = ldq_le_p(&n->bar.asq);
5934 uint64_t acq = ldq_le_p(&n->bar.acq);
5935 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
5936 uint32_t page_size = 1 << page_bits;
5937
5938 if (unlikely(n->cq[0])) {
5939 trace_pci_nvme_err_startfail_cq();
5940 return -1;
5941 }
5942 if (unlikely(n->sq[0])) {
5943 trace_pci_nvme_err_startfail_sq();
5944 return -1;
5945 }
5946 if (unlikely(asq & (page_size - 1))) {
5947 trace_pci_nvme_err_startfail_asq_misaligned(asq);
5948 return -1;
5949 }
5950 if (unlikely(acq & (page_size - 1))) {
5951 trace_pci_nvme_err_startfail_acq_misaligned(acq);
5952 return -1;
5953 }
5954 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
5955 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
5956 return -1;
5957 }
5958 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
5959 trace_pci_nvme_err_startfail_page_too_small(
5960 NVME_CC_MPS(cc),
5961 NVME_CAP_MPSMIN(cap));
5962 return -1;
5963 }
5964 if (unlikely(NVME_CC_MPS(cc) >
5965 NVME_CAP_MPSMAX(cap))) {
5966 trace_pci_nvme_err_startfail_page_too_large(
5967 NVME_CC_MPS(cc),
5968 NVME_CAP_MPSMAX(cap));
5969 return -1;
5970 }
5971 if (unlikely(NVME_CC_IOCQES(cc) <
5972 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5973 trace_pci_nvme_err_startfail_cqent_too_small(
5974 NVME_CC_IOCQES(cc),
                    NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
5976 return -1;
5977 }
5978 if (unlikely(NVME_CC_IOCQES(cc) >
5979 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5980 trace_pci_nvme_err_startfail_cqent_too_large(
5981 NVME_CC_IOCQES(cc),
                    NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
5983 return -1;
5984 }
5985 if (unlikely(NVME_CC_IOSQES(cc) <
5986 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5987 trace_pci_nvme_err_startfail_sqent_too_small(
5988 NVME_CC_IOSQES(cc),
                    NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
5990 return -1;
5991 }
5992 if (unlikely(NVME_CC_IOSQES(cc) >
5993 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5994 trace_pci_nvme_err_startfail_sqent_too_large(
5995 NVME_CC_IOSQES(cc),
                    NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
5997 return -1;
5998 }
5999 if (unlikely(!NVME_AQA_ASQS(aqa))) {
6000 trace_pci_nvme_err_startfail_asqent_sz_zero();
6001 return -1;
6002 }
6003 if (unlikely(!NVME_AQA_ACQS(aqa))) {
6004 trace_pci_nvme_err_startfail_acqent_sz_zero();
6005 return -1;
6006 }
6007
6008 n->page_bits = page_bits;
6009 n->page_size = page_size;
6010 n->max_prp_ents = n->page_size / sizeof(uint64_t);
6011 n->cqe_size = 1 << NVME_CC_IOCQES(cc);
6012 n->sqe_size = 1 << NVME_CC_IOSQES(cc);
6013 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
6014 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
6015
6016 nvme_set_timestamp(n, 0ULL);
6017
6018 QTAILQ_INIT(&n->aer_queue);
6019
6020 nvme_select_iocs(n);
6021
6022 return 0;
6023}
6024
6025static void nvme_cmb_enable_regs(NvmeCtrl *n)
6026{
6027 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
6028 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
6029
6030 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
6031 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
6032 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
6033 stl_le_p(&n->bar.cmbloc, cmbloc);
6034
6035 NVME_CMBSZ_SET_SQS(cmbsz, 1);
6036 NVME_CMBSZ_SET_CQS(cmbsz, 0);
6037 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
6038 NVME_CMBSZ_SET_RDS(cmbsz, 1);
6039 NVME_CMBSZ_SET_WDS(cmbsz, 1);
6040 NVME_CMBSZ_SET_SZU(cmbsz, 2);
6041 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
6042 stl_le_p(&n->bar.cmbsz, cmbsz);
6043}
6044
6045static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
6046 unsigned size)
6047{
6048 uint64_t cap = ldq_le_p(&n->bar.cap);
6049 uint32_t cc = ldl_le_p(&n->bar.cc);
6050 uint32_t intms = ldl_le_p(&n->bar.intms);
6051 uint32_t csts = ldl_le_p(&n->bar.csts);
6052 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
6053
6054 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
6055 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
6056 "MMIO write not 32-bit aligned,"
6057 " offset=0x%"PRIx64"", offset);
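        /* should be ignored, fall through for now */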
6059 }
6060
6061 if (unlikely(size < sizeof(uint32_t))) {
6062 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
6063 "MMIO write smaller than 32-bits,"
6064 " offset=0x%"PRIx64", size=%u",
6065 offset, size);
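        /* should be ignored, fall through for now */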
6067 }
6068
6069 switch (offset) {
6070 case NVME_REG_INTMS:
6071 if (unlikely(msix_enabled(&(n->parent_obj)))) {
6072 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
6073 "undefined access to interrupt mask set"
6074 " when MSI-X is enabled");
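            /* should be ignored, fall through for now */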
6076 }
6077 intms |= data;
6078 stl_le_p(&n->bar.intms, intms);
6079 n->bar.intmc = n->bar.intms;
6080 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
6081 nvme_irq_check(n);
6082 break;
6083 case NVME_REG_INTMC:
6084 if (unlikely(msix_enabled(&(n->parent_obj)))) {
6085 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
6086 "undefined access to interrupt mask clr"
6087 " when MSI-X is enabled");
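            /* should be ignored, fall through for now */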
6089 }
6090 intms &= ~data;
6091 stl_le_p(&n->bar.intms, intms);
6092 n->bar.intmc = n->bar.intms;
6093 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
6094 nvme_irq_check(n);
6095 break;
6096 case NVME_REG_CC:
6097 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
6098
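        /* Windows first sends data, then sends enable bit */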
6100 if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
6101 !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
6102 {
6103 cc = data;
6104 }
6105
6106 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
6107 cc = data;
6108
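            /* flush CC since nvme_start_ctrl() needs the value */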
6110 stl_le_p(&n->bar.cc, cc);
6111 if (unlikely(nvme_start_ctrl(n))) {
6112 trace_pci_nvme_err_startfail();
6113 csts = NVME_CSTS_FAILED;
6114 } else {
6115 trace_pci_nvme_mmio_start_success();
6116 csts = NVME_CSTS_READY;
6117 }
6118 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
6119 trace_pci_nvme_mmio_stopped();
6120 nvme_ctrl_reset(n);
6121 cc = 0;
6122 csts &= ~NVME_CSTS_READY;
6123 }
6124
6125 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
6126 trace_pci_nvme_mmio_shutdown_set();
6127 nvme_ctrl_shutdown(n);
6128 cc = data;
6129 csts |= NVME_CSTS_SHST_COMPLETE;
6130 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
6131 trace_pci_nvme_mmio_shutdown_cleared();
6132 csts &= ~NVME_CSTS_SHST_COMPLETE;
6133 cc = data;
6134 }
6135
6136 stl_le_p(&n->bar.cc, cc);
6137 stl_le_p(&n->bar.csts, csts);
6138
6139 break;
6140 case NVME_REG_CSTS:
6141 if (data & (1 << 4)) {
6142 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
6143 "attempted to W1C CSTS.NSSRO"
6144 " but CAP.NSSRS is zero (not supported)");
6145 } else if (data != 0) {
6146 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
6147 "attempted to set a read only bit"
6148 " of controller status");
6149 }
6150 break;
6151 case NVME_REG_NSSR:
6152 if (data == 0x4e564d65) {
6153 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
6154 } else {
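            /* The spec says that writes of other values have no effect */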
6156 return;
6157 }
6158 break;
6159 case NVME_REG_AQA:
6160 stl_le_p(&n->bar.aqa, data);
6161 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
6162 break;
6163 case NVME_REG_ASQ:
6164 stn_le_p(&n->bar.asq, size, data);
6165 trace_pci_nvme_mmio_asqaddr(data);
6166 break;
6167 case NVME_REG_ASQ + 4:
6168 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
6169 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
6170 break;
6171 case NVME_REG_ACQ:
6172 trace_pci_nvme_mmio_acqaddr(data);
6173 stn_le_p(&n->bar.acq, size, data);
6174 break;
6175 case NVME_REG_ACQ + 4:
6176 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
6177 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
6178 break;
6179 case NVME_REG_CMBLOC:
6180 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
6181 "invalid write to reserved CMBLOC"
6182 " when CMBSZ is zero, ignored");
6183 return;
6184 case NVME_REG_CMBSZ:
6185 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
6186 "invalid write to read only CMBSZ, ignored");
6187 return;
6188 case NVME_REG_CMBMSC:
6189 if (!NVME_CAP_CMBS(cap)) {
6190 return;
6191 }
6192
6193 stn_le_p(&n->bar.cmbmsc, size, data);
6194 n->cmb.cmse = false;
6195
6196 if (NVME_CMBMSC_CRE(data)) {
6197 nvme_cmb_enable_regs(n);
6198
6199 if (NVME_CMBMSC_CMSE(data)) {
6200 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
6201 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
6202 if (cba + int128_get64(n->cmb.mem.size) < cba) {
6203 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
6204 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
6205 stl_le_p(&n->bar.cmbsts, cmbsts);
6206 return;
6207 }
6208
6209 n->cmb.cba = cba;
6210 n->cmb.cmse = true;
6211 }
6212 } else {
6213 n->bar.cmbsz = 0;
6214 n->bar.cmbloc = 0;
6215 }
6216
6217 return;
6218 case NVME_REG_CMBMSC + 4:
6219 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
6220 return;
6221
6222 case NVME_REG_PMRCAP:
6223 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
6224 "invalid write to PMRCAP register, ignored");
6225 return;
6226 case NVME_REG_PMRCTL:
6227 if (!NVME_CAP_PMRS(cap)) {
6228 return;
6229 }
6230
6231 stl_le_p(&n->bar.pmrctl, data);
6232 if (NVME_PMRCTL_EN(data)) {
6233 memory_region_set_enabled(&n->pmr.dev->mr, true);
6234 pmrsts = 0;
6235 } else {
6236 memory_region_set_enabled(&n->pmr.dev->mr, false);
6237 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
6238 n->pmr.cmse = false;
6239 }
6240 stl_le_p(&n->bar.pmrsts, pmrsts);
6241 return;
6242 case NVME_REG_PMRSTS:
6243 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
6244 "invalid write to PMRSTS register, ignored");
6245 return;
6246 case NVME_REG_PMREBS:
6247 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
6248 "invalid write to PMREBS register, ignored");
6249 return;
6250 case NVME_REG_PMRSWTP:
6251 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
6252 "invalid write to PMRSWTP register, ignored");
6253 return;
6254 case NVME_REG_PMRMSCL:
6255 if (!NVME_CAP_PMRS(cap)) {
6256 return;
6257 }
6258
6259 stl_le_p(&n->bar.pmrmscl, data);
6260 n->pmr.cmse = false;
6261
6262 if (NVME_PMRMSCL_CMSE(data)) {
6263 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
6264 hwaddr cba = pmrmscu << 32 |
6265 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
6266 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
6267 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
6268 stl_le_p(&n->bar.pmrsts, pmrsts);
6269 return;
6270 }
6271
6272 n->pmr.cmse = true;
6273 n->pmr.cba = cba;
6274 }
6275
6276 return;
6277 case NVME_REG_PMRMSCU:
6278 if (!NVME_CAP_PMRS(cap)) {
6279 return;
6280 }
6281
6282 stl_le_p(&n->bar.pmrmscu, data);
6283 return;
6284 default:
6285 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
6286 "invalid MMIO write,"
6287 " offset=0x%"PRIx64", data=%"PRIx64"",
6288 offset, data);
6289 break;
6290 }
6291}
6292
6293static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
6294{
6295 NvmeCtrl *n = (NvmeCtrl *)opaque;
6296 uint8_t *ptr = (uint8_t *)&n->bar;
6297
6298 trace_pci_nvme_mmio_read(addr, size);
6299
6300 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
6301 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
6302 "MMIO read not 32-bit aligned,"
6303 " offset=0x%"PRIx64"", addr);
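        /* should RAZ, fall through for now */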
6305 } else if (unlikely(size < sizeof(uint32_t))) {
6306 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
6307 "MMIO read smaller than 32-bits,"
6308 " offset=0x%"PRIx64"", addr);
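        /* should RAZ, fall through for now */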
6310 }
6311
6312 if (addr > sizeof(n->bar) - size) {
6313 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6314 "MMIO read beyond last register,"
6315 " offset=0x%"PRIx64", returning 0", addr);
6316
6317 return 0;
6318 }
6319
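    /*
     * When PMRWBM bit 1 is set, a read from PMRSTS must ensure that prior
     * writes have made it to persistent media.
     */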
6325 if (addr == NVME_REG_PMRSTS &&
6326 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6327 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6328 }
6329
6330 return ldn_le_p(ptr + addr, size);
6331}
6332
6333static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6334{
6335 uint32_t qid;
6336
6337 if (unlikely(addr & ((1 << 2) - 1))) {
6338 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6339 "doorbell write not 32-bit aligned,"
6340 " offset=0x%"PRIx64", ignoring", addr);
6341 return;
6342 }
6343
6344 if (((addr - 0x1000) >> 2) & 1) {
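        /* Completion queue doorbell write */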
6346
6347 uint16_t new_head = val & 0xffff;
6348 int start_sqs;
6349 NvmeCQueue *cq;
6350
6351 qid = (addr - (0x1000 + (1 << 2))) >> 3;
6352 if (unlikely(nvme_check_cqid(n, qid))) {
6353 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6354 "completion queue doorbell write"
6355 " for nonexistent queue,"
6356 " sqid=%"PRIu32", ignoring", qid);
6357
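            /*
             * NVM Express v1.3d, Section 4.1 states: "If host software writes
             * an invalid value to the Submission Queue Tail Doorbell or
             * Completion Queue Head Doorbell register and an Asynchronous
             * Event Request command is outstanding, then an asynchronous
             * event is posted to the Admin Completion Queue with a status
             * code of Invalid Doorbell Write Value."
             *
             * Also note that the spec includes the "Invalid Doorbell Register"
             * status code, but nowhere does it specify when to use it.
             * However, it seems reasonable to use it in analogous situations
             * to the "Invalid Doorbell Value" status code, so we use it here
             * as well.
             */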
6371 if (n->outstanding_aers) {
6372 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6373 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6374 NVME_LOG_ERROR_INFO);
6375 }
6376
6377 return;
6378 }
6379
6380 cq = n->cq[qid];
6381 if (unlikely(new_head >= cq->size)) {
6382 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6383 "completion queue doorbell write value"
6384 " beyond queue size, sqid=%"PRIu32","
6385 " new_head=%"PRIu16", ignoring",
6386 qid, new_head);
6387
6388 if (n->outstanding_aers) {
6389 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6390 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6391 NVME_LOG_ERROR_INFO);
6392 }
6393
6394 return;
6395 }
6396
6397 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6398
6399 start_sqs = nvme_cq_full(cq) ? 1 : 0;
6400 cq->head = new_head;
6401 if (start_sqs) {
6402 NvmeSQueue *sq;
6403 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6404 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6405 }
6406 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6407 }
6408
6409 if (cq->tail == cq->head) {
6410 if (cq->irq_enabled) {
6411 n->cq_pending--;
6412 }
6413
6414 nvme_irq_deassert(n, cq);
6415 }
6416 } else {
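        /* Submission queue doorbell write */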
6418
6419 uint16_t new_tail = val & 0xffff;
6420 NvmeSQueue *sq;
6421
6422 qid = (addr - 0x1000) >> 3;
6423 if (unlikely(nvme_check_sqid(n, qid))) {
6424 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6425 "submission queue doorbell write"
6426 " for nonexistent queue,"
6427 " sqid=%"PRIu32", ignoring", qid);
6428
6429 if (n->outstanding_aers) {
6430 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6431 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6432 NVME_LOG_ERROR_INFO);
6433 }
6434
6435 return;
6436 }
6437
6438 sq = n->sq[qid];
6439 if (unlikely(new_tail >= sq->size)) {
6440 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6441 "submission queue doorbell write value"
6442 " beyond queue size, sqid=%"PRIu32","
6443 " new_tail=%"PRIu16", ignoring",
6444 qid, new_tail);
6445
6446 if (n->outstanding_aers) {
6447 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6448 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6449 NVME_LOG_ERROR_INFO);
6450 }
6451
6452 return;
6453 }
6454
6455 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6456
6457 sq->tail = new_tail;
6458 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6459 }
6460}
6461
6462static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6463 unsigned size)
6464{
6465 NvmeCtrl *n = (NvmeCtrl *)opaque;
6466
6467 trace_pci_nvme_mmio_write(addr, data, size);
6468
6469 if (addr < sizeof(n->bar)) {
6470 nvme_write_bar(n, addr, data, size);
6471 } else {
6472 nvme_process_db(n, addr, data);
6473 }
6474}
6475
6476static const MemoryRegionOps nvme_mmio_ops = {
6477 .read = nvme_mmio_read,
6478 .write = nvme_mmio_write,
6479 .endianness = DEVICE_LITTLE_ENDIAN,
6480 .impl = {
6481 .min_access_size = 2,
6482 .max_access_size = 8,
6483 },
6484};
6485
6486static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
6487 unsigned size)
6488{
6489 NvmeCtrl *n = (NvmeCtrl *)opaque;
6490 stn_le_p(&n->cmb.buf[addr], size, data);
6491}
6492
6493static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
6494{
6495 NvmeCtrl *n = (NvmeCtrl *)opaque;
6496 return ldn_le_p(&n->cmb.buf[addr], size);
6497}
6498
6499static const MemoryRegionOps nvme_cmb_ops = {
6500 .read = nvme_cmb_read,
6501 .write = nvme_cmb_write,
6502 .endianness = DEVICE_LITTLE_ENDIAN,
6503 .impl = {
6504 .min_access_size = 1,
6505 .max_access_size = 8,
6506 },
6507};
6508
6509static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
6510{
6511 NvmeParams *params = &n->params;
6512
6513 if (params->num_queues) {
6514 warn_report("num_queues is deprecated; please use max_ioqpairs "
6515 "instead");
6516
6517 params->max_ioqpairs = params->num_queues - 1;
6518 }
6519
6520 if (n->namespace.blkconf.blk && n->subsys) {
6521 error_setg(errp, "subsystem support is unavailable with legacy "
6522 "namespace ('drive' property)");
6523 return;
6524 }
6525
6526 if (params->max_ioqpairs < 1 ||
6527 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
6528 error_setg(errp, "max_ioqpairs must be between 1 and %d",
6529 NVME_MAX_IOQPAIRS);
6530 return;
6531 }
6532
6533 if (params->msix_qsize < 1 ||
6534 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
6535 error_setg(errp, "msix_qsize must be between 1 and %d",
6536 PCI_MSIX_FLAGS_QSIZE + 1);
6537 return;
6538 }
6539
6540 if (!params->serial) {
6541 error_setg(errp, "serial property not set");
6542 return;
6543 }
6544
6545 if (n->pmr.dev) {
6546 if (host_memory_backend_is_mapped(n->pmr.dev)) {
6547 error_setg(errp, "can't use already busy memdev: %s",
6548 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
6549 return;
6550 }
6551
6552 if (!is_power_of_2(n->pmr.dev->size)) {
            error_setg(errp, "pmr backend size needs to be a power of 2");
6554 return;
6555 }
6556
6557 host_memory_backend_set_mapped(n->pmr.dev, true);
6558 }
6559
6560 if (n->params.zasl > n->params.mdts) {
6561 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
6562 "than or equal to mdts (Maximum Data Transfer Size)");
6563 return;
6564 }
6565
6566 if (!n->params.vsl) {
6567 error_setg(errp, "vsl must be non-zero");
6568 return;
6569 }
6570}
6571
6572static void nvme_init_state(NvmeCtrl *n)
6573{
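    /* add one to max_ioqpairs to account for the admin queue pair */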
6575 n->reg_size = pow2ceil(sizeof(NvmeBar) +
6576 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
6577 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
6578 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
6579 n->temperature = NVME_TEMPERATURE;
6580 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
6581 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6582 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
6583}
6584
6585static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
6586{
6587 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
6588 uint64_t cap = ldq_le_p(&n->bar.cap);
6589
6590 n->cmb.buf = g_malloc0(cmb_size);
6591 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
6592 "nvme-cmb", cmb_size);
6593 pci_register_bar(pci_dev, NVME_CMB_BIR,
6594 PCI_BASE_ADDRESS_SPACE_MEMORY |
6595 PCI_BASE_ADDRESS_MEM_TYPE_64 |
6596 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
6597
6598 NVME_CAP_SET_CMBS(cap, 1);
6599 stq_le_p(&n->bar.cap, cap);
6600
6601 if (n->params.legacy_cmb) {
6602 nvme_cmb_enable_regs(n);
6603 n->cmb.cmse = true;
6604 }
6605}
6606
6607static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
6608{
6609 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
6610
6611 NVME_PMRCAP_SET_RDS(pmrcap, 1);
6612 NVME_PMRCAP_SET_WDS(pmrcap, 1);
6613 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
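    /* Turn on bit 1 support */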
6615 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
6616 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
6617 stl_le_p(&n->bar.pmrcap, pmrcap);
6618
6619 pci_register_bar(pci_dev, NVME_PMR_BIR,
6620 PCI_BASE_ADDRESS_SPACE_MEMORY |
6621 PCI_BASE_ADDRESS_MEM_TYPE_64 |
6622 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
6623
6624 memory_region_set_enabled(&n->pmr.dev->mr, false);
6625}
6626
6627static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
6628{
6629 uint8_t *pci_conf = pci_dev->config;
6630 uint64_t bar_size, msix_table_size, msix_pba_size;
6631 unsigned msix_table_offset, msix_pba_offset;
6632 int ret;
6633
6634 Error *err = NULL;
6635
6636 pci_conf[PCI_INTERRUPT_PIN] = 1;
6637 pci_config_set_prog_interface(pci_conf, 0x2);
6638
6639 if (n->params.use_intel_id) {
6640 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6641 pci_config_set_device_id(pci_conf, 0x5845);
6642 } else {
6643 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6644 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6645 }
6646
6647 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6648 pcie_endpoint_cap_init(pci_dev, 0x80);
6649
6650 bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6651 msix_table_offset = bar_size;
6652 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6653
6654 bar_size += msix_table_size;
6655 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6656 msix_pba_offset = bar_size;
6657 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6658
6659 bar_size += msix_pba_size;
6660 bar_size = pow2ceil(bar_size);
6661
6662 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6663 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6664 n->reg_size);
6665 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6666
6667 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6668 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6669 ret = msix_init(pci_dev, n->params.msix_qsize,
6670 &n->bar0, 0, msix_table_offset,
6671 &n->bar0, 0, msix_pba_offset, 0, &err);
6672 if (ret < 0) {
6673 if (ret == -ENOTSUP) {
6674 warn_report_err(err);
6675 } else {
6676 error_propagate(errp, err);
6677 return ret;
6678 }
6679 }
6680
6681 if (n->params.cmb_size_mb) {
6682 nvme_init_cmb(n, pci_dev);
6683 }
6684
6685 if (n->pmr.dev) {
6686 nvme_init_pmr(n, pci_dev);
6687 }
6688
6689 return 0;
6690}
6691
6692static void nvme_init_subnqn(NvmeCtrl *n)
6693{
6694 NvmeSubsystem *subsys = n->subsys;
6695 NvmeIdCtrl *id = &n->id_ctrl;
6696
6697 if (!subsys) {
6698 snprintf((char *)id->subnqn, sizeof(id->subnqn),
6699 "nqn.2019-08.org.qemu:%s", n->params.serial);
6700 } else {
6701 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6702 }
6703}
6704
6705static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6706{
6707 NvmeIdCtrl *id = &n->id_ctrl;
6708 uint8_t *pci_conf = pci_dev->config;
6709 uint64_t cap = ldq_le_p(&n->bar.cap);
6710
6711 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6712 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6713 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6714 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6715 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6716
6717 id->cntlid = cpu_to_le16(n->cntlid);
6718
6719 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6720 id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);
6721
6722 id->rab = 6;
6723
6724 if (n->params.use_intel_id) {
6725 id->ieee[0] = 0xb3;
6726 id->ieee[1] = 0x02;
6727 id->ieee[2] = 0x00;
6728 } else {
6729 id->ieee[0] = 0x00;
6730 id->ieee[1] = 0x54;
6731 id->ieee[2] = 0x52;
6732 }
6733
6734 id->mdts = n->params.mdts;
6735 id->ver = cpu_to_le32(NVME_SPEC_VER);
6736 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6737 id->cntrltype = 0x1;
6738
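    /*
     * Because the controller always completes the Abort command immediately,
     * there can never be more than one concurrently executing Abort command,
     * so this value is never used for anything. Note that there can easily be
     * many Abort commands in the queues, but they are not considered
     * "executing" until processed by nvme_abort.
     *
     * The specification recommends a value of 3 for Abort Command Limit (four
     * concurrently executing Abort commands), so let's use that.
     */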
6750 id->acl = 3;
6751 id->aerl = n->params.aerl;
6752 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6753 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6754
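    /* recommended default value (~70 C) */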
6756 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6757 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6758
    id->sqes = (0x6 << 4) | 0x6;
    id->cqes = (0x4 << 4) | 0x4;
    id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
                           NVME_ONCS_COMPARE | NVME_ONCS_COPY);

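    /*
     * A volatile write cache is always reported present; Flush with the
     * broadcast NSID (0xffffffff) is supported across all namespaces.
     */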
    id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;

    id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1);
    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
                           NVME_CTRL_SGLS_BITBUCKET);

    nvme_init_subnqn(n);

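    /*
     * A single power state descriptor: maximum power in centiwatts
     * (0x9c4 = 2500, i.e. 25 W), entry/exit latencies in microseconds.
     */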
    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);

    if (n->subsys) {
        id->cmic |= NVME_CMIC_MULTI_CTRL;
    }

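    /*
     * CAP: up to 2048 entries per queue (MQES is 0's based), physically
     * contiguous queues required, 7.5 s worst-case timeout (TO is in
     * 500 ms units), NVM / multiple command set / admin-only support, and
     * a maximum memory page size of 2^(12 + 4) = 64 KiB.
     */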
    NVME_CAP_SET_MQES(cap, 0x7ff);
    NVME_CAP_SET_CQR(cap, 1);
    NVME_CAP_SET_TO(cap, 0xf);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
    NVME_CAP_SET_MPSMAX(cap, 4);
    NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
    NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
    stq_le_p(&n->bar.cap, cap);

    stl_le_p(&n->bar.vs, NVME_SPEC_VER);
    n->bar.intmc = n->bar.intms = 0;
}

static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
{
    int cntlid;

    if (!n->subsys) {
        return 0;
    }

    cntlid = nvme_subsys_register_ctrl(n, errp);
    if (cntlid < 0) {
        return -1;
    }

    n->cntlid = cntlid;

    return 0;
}

void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
{
    uint32_t nsid = ns->params.nsid;
    assert(nsid && nsid <= NVME_MAX_NAMESPACES);

    n->namespaces[nsid] = ns;
    ns->attached++;

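    /*
     * Clamp DMRSL so that a single Dataset Management range never exceeds
     * the largest request the block layer will accept, expressed in
     * logical blocks of this namespace.
     */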
    n->dmrsl = MIN_NON_ZERO(n->dmrsl,
                            BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
}

static void nvme_realize(PCIDevice *pci_dev, Error **errp)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeNamespace *ns;
    Error *local_err = NULL;

    nvme_check_constraints(n, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
              &pci_dev->qdev, n->parent_obj.qdev.id);

    nvme_init_state(n);
    if (nvme_init_pci(n, pci_dev, errp)) {
        return;
    }

    if (nvme_init_subsys(n, errp)) {
        return;
    }
    nvme_init_ctrl(n, pci_dev);
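    /* set up an implicit namespace if a drive was attached directly */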
    if (n->namespace.blkconf.blk) {
        ns = &n->namespace;
        ns->params.nsid = 1;

        if (nvme_ns_setup(ns, errp)) {
            return;
        }

        nvme_attach_ns(n, ns);
    }
}

static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeNamespace *ns;
    int i;

    nvme_ctrl_reset(n);

    if (n->subsys) {
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (ns) {
                ns->attached--;
            }
        }

        nvme_subsys_unregister_ctrl(n->subsys, n);
    }

    g_free(n->cq);
    g_free(n->sq);
    g_free(n->aer_reqs);

    if (n->params.cmb_size_mb) {
        g_free(n->cmb.buf);
    }

    if (n->pmr.dev) {
        host_memory_backend_set_mapped(n->pmr.dev, false);
    }
    msix_uninit(pci_dev, &n->bar0, &n->bar0);
    memory_region_del_subregion(&n->bar0, &n->iomem);
}

static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
                     HostMemoryBackend *),
    DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
                     NvmeSubsystem *),
    DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
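    /* "num_queues" is deprecated; "max_ioqpairs" should be used instead */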
    DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
    DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
    DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
    DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                     params.auto_transition_zones, true),
    DEFINE_PROP_END_OF_LIST(),
};

static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value = n->smart_critical_warning;

    visit_type_uint8(v, name, &value, errp);
}

static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value, old_value, cap = 0, index, event;

    if (!visit_type_uint8(v, name, &value, errp)) {
        return;
    }

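    /*
     * Only bits covered by the SMART critical warning field may be
     * injected; the PMR unreliable bit is valid only when a PMR is
     * configured.
     */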
    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
    if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
        cap |= NVME_SMART_PMR_UNRELIABLE;
    }

    if ((value & cap) != value) {
        error_setg(errp, "unsupported smart critical warning bits: 0x%x",
                   value & ~cap);
        return;
    }

    old_value = n->smart_critical_warning;
    n->smart_critical_warning = value;
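    /* only inject events for bits that transition from 0 to 1 */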
    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
        event = 1 << index;
        if (value & ~old_value & event) {
            nvme_smart_event(n, event);
        }
    }
}

static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};

static void nvme_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);

    pc->realize = nvme_realize;
    pc->exit = nvme_exit;
    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
    pc->revision = 2;

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    dc->desc = "Non-Volatile Memory Express";
    device_class_set_props(dc, nvme_props);
    dc->vmsd = &nvme_vmstate;
}

static void nvme_instance_init(Object *obj)
{
    NvmeCtrl *n = NVME(obj);

    device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
                                  "bootindex", "/namespace@1,0",
                                  DEVICE(obj));

    object_property_add(obj, "smart_critical_warning", "uint8",
                        nvme_get_smart_warning,
                        nvme_set_smart_warning, NULL, NULL);
}

static const TypeInfo nvme_info = {
    .name = TYPE_NVME,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .instance_init = nvme_instance_init,
    .class_init = nvme_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_PCIE_DEVICE },
        { }
    },
};

static const TypeInfo nvme_bus_info = {
    .name = TYPE_NVME_BUS,
    .parent = TYPE_BUS,
    .instance_size = sizeof(NvmeBus),
};

static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
    type_register_static(&nvme_bus_info);
}

type_init(nvme_register_types)