/*
 * QEMU NVM Express Controller
 *
 * (Copyright notice and usage documentation elided from this extract.)
 */
#include "qemu/osdep.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include "hw/block/block.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "hw/qdev-properties.h"
#include "migration/vmstate.h"
#include "sysemu/sysemu.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/hostmem.h"
#include "sysemu/block-backend.h"
#include "exec/memory.h"
#include "qemu/log.h"
#include "qemu/module.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "nvme.h"
#include "nvme-ns.h"
#include "nvme-dif.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
            " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)

static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION]              = true,
    [NVME_POWER_MANAGEMENT]         = true,
    [NVME_TEMPERATURE_THRESHOLD]    = true,
    [NVME_ERROR_RECOVERY]           = true,
    [NVME_VOLATILE_WRITE_CACHE]     = true,
    [NVME_NUMBER_OF_QUEUES]         = true,
    [NVME_INTERRUPT_COALESCING]     = true,
    [NVME_INTERRUPT_VECTOR_CONF]    = true,
    [NVME_WRITE_ATOMICITY]          = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
    [NVME_TIMESTAMP]                = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
};

static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
};

static void nvme_process_sq(void *opaque);

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);
}

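/*
 * Move a zone to its new state: remove it from the queue that tracks its
 * current state (closed, implicitly/explicitly open or full), update the
 * state field itself, and insert the zone on the queue for the new state.
 */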
static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
{
    if (QTAILQ_IN_USE(zone, entry)) {
        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_CLOSED:
            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_FULL:
            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
            /* fall through */
        default:
            ;
        }
    }

    nvme_set_zone_state(zone, state);

    switch (state) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_CLOSED:
        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_FULL:
        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
        /* fall through */
    case NVME_ZONE_STATE_READ_ONLY:
        break;
    default:
        zone->d.za = 0;
    }
}

/*
 * Check if we can open a zone without exceeding open/active limits.
 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
 */
static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
{
    if (ns->params.max_active_zones != 0 &&
        ns->nr_active_zones + act > ns->params.max_active_zones) {
        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    }
    if (ns->params.max_open_zones != 0 &&
        ns->nr_open_zones + opn > ns->params.max_open_zones) {
        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    if (!n->cmb.cmse) {
        return false;
    }

    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    hi = lo + int128_get64(n->cmb.mem.size);

    return addr >= lo && addr < hi;
}

static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    return &n->cmb.buf[addr - base];
}

static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi;

    if (!n->pmr.cmse) {
        return false;
    }

    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);

    return addr >= n->pmr.cba && addr < hi;
}

static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
{
    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}

static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
        return 0;
    }

    return pci_dma_read(&n->parent_obj, addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(&n->parent_obj, addr, buf, size);
}

static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid && (nsid == NVME_NSID_BROADCAST || nsid <= n->num_namespaces);
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_irq_check(NvmeCtrl *n)
{
    if (msix_enabled(&(n->parent_obj))) {
        return;
    }
    if (~n->bar.intms & n->irq_status) {
        pci_irq_assert(&n->parent_obj);
    } else {
        pci_irq_deassert(&n->parent_obj);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            trace_pci_nvme_irq_msix(cq->vector);
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            trace_pci_nvme_irq_pin();
            assert(cq->vector < 32);
            n->irq_status |= 1 << cq->vector;
            nvme_irq_check(n);
        }
    } else {
        trace_pci_nvme_irq_masked();
    }
}

static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            return;
        } else {
            assert(cq->vector < 32);
            n->irq_status &= ~(1 << cq->vector);
            nvme_irq_check(n);
        }
    }
}

static void nvme_req_clear(NvmeRequest *req)
{
    req->ns = NULL;
    req->opaque = NULL;
    req->aiocb = NULL;
    memset(&req->cqe, 0x0, sizeof(req->cqe));
    req->status = NVME_SUCCESS;
}

static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
    if (dma) {
        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
        sg->flags = NVME_SG_DMA;
    } else {
        qemu_iovec_init(&sg->iov, 0);
    }

    sg->flags |= NVME_SG_ALLOC;
}

static inline void nvme_sg_unmap(NvmeSg *sg)
{
    if (!(sg->flags & NVME_SG_ALLOC)) {
        return;
    }

    if (sg->flags & NVME_SG_DMA) {
        qemu_sglist_destroy(&sg->qsg);
    } else {
        qemu_iovec_destroy(&sg->iov);
    }

    memset(sg, 0x0, sizeof(*sg));
}

/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
{
    NvmeSg *dst = data;
    size_t size = nvme_lsize(ns);
    size_t msize = nvme_msize(ns);
    uint32_t trans_len, count = size;
    uint64_t offset = 0;
    bool dma = sg->flags & NVME_SG_DMA;
    size_t sge_len;
    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    int sg_idx = 0;

    assert(sg->flags & NVME_SG_ALLOC);

    while (sg_len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        trans_len = MIN(sg_len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dst) {
            if (dma) {
                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
                                trans_len);
            } else {
                qemu_iovec_add(&dst->iov,
                               sg->iov.iov[sg_idx].iov_base + offset,
                               trans_len);
            }
        }

        sg_len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            dst = (dst == data) ? mdata : data;
            count = (dst == data) ? size : msize;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
        }
    }
}

static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr_cmb(addr, len);

    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
{
    bool cmb = false, pmr = false;

    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr(addr, len);

    if (nvme_addr_is_cmb(n, addr)) {
        cmb = true;
    } else if (nvme_addr_is_pmr(n, addr)) {
        pmr = true;
    }

    if (cmb || pmr) {
        if (sg->flags & NVME_SG_DMA) {
            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
        }

        if (cmb) {
            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
        } else {
            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
        }
    }

    if (!(sg->flags & NVME_SG_DMA)) {
        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    }

    qemu_sglist_add(&sg->qsg, addr, len);

    return NVME_SUCCESS;
}

static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
{
    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
}

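/*
 * Map a PRP pair. PRP1 may point anywhere within a memory page; PRP2 is
 * either a second data page or, when the transfer spans more than two pages,
 * the address of a (possibly chained) PRP list of page-aligned entries.
 */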
static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
                             uint64_t prp2, uint32_t len)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;
    uint16_t status;
    int ret;

    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));

    status = nvme_map_addr(n, sg, prp1, trans_len);
    if (status) {
        goto unmap;
    }

    len -= trans_len;
    if (len) {
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset; the number of entries that fit in the first page must
             * be calculated from that offset.
             */
            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
            if (ret) {
                trace_pci_nvme_err_addr_read(prp2);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == nents - 1 && len > n->page_size) {
                    if (unlikely(prp_ent & (n->page_size - 1))) {
                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    nents = MIN(nents, n->max_prp_ents);
                    prp_trans = nents * sizeof(uint64_t);
                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
                                         prp_trans);
                    if (ret) {
                        trace_pci_nvme_err_addr_read(prp_ent);
                        status = NVME_DATA_TRAS_ERROR;
                        goto unmap;
                    }
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (unlikely(prp_ent & (n->page_size - 1))) {
                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                status = nvme_map_addr(n, sg, prp_ent, trans_len);
                if (status) {
                    goto unmap;
                }

                len -= trans_len;
                i++;
            }
        } else {
            if (unlikely(prp2 & (n->page_size - 1))) {
                trace_pci_nvme_err_invalid_prp2_align(prp2);
                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                goto unmap;
            }
            status = nvme_map_addr(n, sg, prp2, len);
            if (status) {
                goto unmap;
            }
        }
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

/*
 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
 * number of bytes mapped in len.
 */
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                  NvmeSglDescriptor *segment, uint64_t nsgld,
                                  size_t *len, NvmeCmd *cmd)
{
    dma_addr_t addr, trans_len;
    uint32_t dlen;
    uint16_t status;

    for (int i = 0; i < nsgld; i++) {
        uint8_t type = NVME_SGL_TYPE(segment[i].type);

        switch (type) {
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            if (cmd->opcode == NVME_CMD_WRITE) {
                continue;
            }
            /* fall through */
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
            break;
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
        default:
            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
        }

        dlen = le32_to_cpu(segment[i].len);

        if (!dlen) {
            continue;
        }

        if (*len == 0) {
            /*
             * All data has been mapped, but the SGL contains additional
             * segments and/or descriptors. The controller might accept
             * ignoring the rest of the SGL.
             */
            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
                break;
            }

            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        trans_len = MIN(*len, dlen);

        if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
            goto next;
        }

        addr = le64_to_cpu(segment[i].addr);

        if (UINT64_MAX - addr < dlen) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        status = nvme_map_addr(n, sg, addr, trans_len);
        if (status) {
            return status;
        }

next:
        *len -= trans_len;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
                             size_t len, NvmeCmd *cmd)
{

    /*
     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
     * dynamically allocating a potentially huge SGL. The spec allows the SGL
     * to be larger (as in number of bytes required to describe the SGL
     * descriptors and segment chain) than the command transfer size, so it is
     * not bounded by MDTS.
     */
    const int SEG_CHUNK_SIZE = 256;

    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
    uint64_t nsgld;
    uint32_t seg_len;
    uint16_t status;
    hwaddr addr;
    int ret;

    sgld = &sgl;
    addr = le64_to_cpu(sgl.addr);

    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));

    /*
     * If the entire transfer can be described with a single data block it can
     * be mapped directly.
     */
    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
        if (status) {
            goto unmap;
        }

        goto out;
    }

    for (;;) {
        switch (NVME_SGL_TYPE(sgld->type)) {
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            break;
        default:
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        seg_len = le32_to_cpu(sgld->len);

        /* the length of the segment must be a multiple of 16 bytes */
        if ((!seg_len || seg_len & 0xf) &&
            (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        if (UINT64_MAX - addr < seg_len) {
            status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
            goto unmap;
        }

        nsgld = seg_len / sizeof(NvmeSglDescriptor);

        while (nsgld > SEG_CHUNK_SIZE) {
            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
                trace_pci_nvme_err_addr_read(addr);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }

            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
                                       &len, cmd);
            if (status) {
                goto unmap;
            }

            nsgld -= SEG_CHUNK_SIZE;
            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
        }

        ret = nvme_addr_read(n, addr, segment, nsgld *
                             sizeof(NvmeSglDescriptor));
        if (ret) {
            trace_pci_nvme_err_addr_read(addr);
            status = NVME_DATA_TRAS_ERROR;
            goto unmap;
        }

        last_sgld = &segment[nsgld - 1];

        /*
         * If the segment ends with a Data Block or Bit Bucket Descriptor
         * Type, then we are done.
         */
        switch (NVME_SGL_TYPE(last_sgld->type)) {
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
            if (status) {
                goto unmap;
            }

            goto out;

        default:
            break;
        }

        /*
         * If the last descriptor was not a Data Block or Bit Bucket, then the
         * current segment must not be a Last Segment.
         */
        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        sgld = last_sgld;
        addr = le64_to_cpu(sgld->addr);

        /*
         * Do not map the last descriptor; it will be a Segment or Last
         * Segment descriptor and is handled by the next iteration.
         */
        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
        if (status) {
            goto unmap;
        }
    }

out:
    /* if there is any residual left in len, the SGL was too short */
    if (len) {
        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        goto unmap;
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

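/*
 * Map the data pointer (DPTR) of a command, dispatching on the PSDT field:
 * PRPs or one of the two SGL modes.
 */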
uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd)
{
    uint64_t prp1, prp2;

    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
    case NVME_PSDT_PRP:
        prp1 = le64_to_cpu(cmd->dptr.prp1);
        prp2 = le64_to_cpu(cmd->dptr.prp2);

        return nvme_map_prp(n, sg, prp1, prp2, len);
    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
    case NVME_PSDT_SGL_MPTR_SGL:
        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
    default:
        return NVME_INVALID_FIELD;
    }
}

static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
{
    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    hwaddr mptr = le64_to_cpu(cmd->mptr);
    uint16_t status;

    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
        NvmeSglDescriptor sgl;

        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
            return NVME_DATA_TRAS_ERROR;
        }

        status = nvme_map_sgl(n, sg, sgl, len, cmd);
        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
        }

        return status;
    }

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
    status = nvme_map_addr(n, sg, mptr, len);
    if (status) {
        nvme_sg_unmap(sg);
    }

    return status;
}

static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint16_t ctrl = le16_to_cpu(rw->control);
    size_t len = nvme_l2b(ns, nlb);
    uint16_t status;

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
        (ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
        goto out;
    }

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_m2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, &req->sg, NULL);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

out:
    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    size_t len = nvme_m2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_l2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, NULL, &req->sg);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
}

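/*
 * Transfer `len` bytes to or from `ptr`, consuming `bytes` bytes of the
 * scatter/gather list at a time and then skipping `skip_bytes`; this is what
 * interleaves (or de-interleaves) data and metadata for extended-LBA formats.
 */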
static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
{
    hwaddr addr;
    uint32_t trans_len, count = bytes;
    bool dma = sg->flags & NVME_SG_DMA;
    int64_t sge_len;
    int sg_idx = 0;
    int ret;

    assert(sg->flags & NVME_SG_ALLOC);

    while (len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        if (sge_len - offset < 0) {
            offset -= sge_len;
            sg_idx++;
            continue;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
            continue;
        }

        trans_len = MIN(len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dma) {
            addr = sg->qsg.sg[sg_idx].base + offset;
        } else {
            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
        }

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            ret = nvme_addr_read(n, addr, ptr, trans_len);
        } else {
            ret = nvme_addr_write(n, addr, ptr, trans_len);
        }

        if (ret) {
            return NVME_DATA_TRAS_ERROR;
        }

        ptr += trans_len;
        len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            count = bytes;
            offset += skip_bytes;
        }
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
                        NvmeTxDirection dir)
{
    assert(sg->flags & NVME_SG_ALLOC);

    if (sg->flags & NVME_SG_DMA) {
        uint64_t residual;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            residual = dma_buf_write(ptr, len, &sg->qsg);
        } else {
            residual = dma_buf_read(ptr, len, &sg->qsg);
        }

        if (unlikely(residual)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    } else {
        size_t bytes;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
        } else {
            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
        }

        if (unlikely(bytes != len)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
}

static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
}

uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint16_t ctrl = le16_to_cpu(rw->control);

    if (nvme_ns_ext(ns) &&
        !(ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
        size_t lsize = nvme_lsize(ns);
        size_t msize = nvme_msize(ns);

        return nvme_tx_interleaved(n, &req->sg, ptr, len, lsize, msize, 0,
                                   dir);
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        size_t lsize = nvme_lsize(ns);
        size_t msize = nvme_msize(ns);

        return nvme_tx_interleaved(n, &req->sg, ptr, len, msize, lsize, lsize,
                                   dir);
    }

    nvme_sg_unmap(&req->sg);

    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                  cb, req);
    } else {
        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
                                  BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                   cb, req);
    } else {
        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

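/*
 * Drain finished requests into the completion queue: write the CQEs to guest
 * memory, raise the interrupt if the queue is non-empty and recycle the
 * request objects onto the submission queue free list.
 */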
static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;
    int ret;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
                            sizeof(req->cqe));
        if (ret) {
            trace_pci_nvme_err_addr_write(addr);
            trace_pci_nvme_err_cfs();
            n->bar.csts = NVME_CSTS_FAILED;
            break;
        }
        QTAILQ_REMOVE(&cq->req_list, req, entry);
        nvme_inc_cq_tail(cq);
        nvme_sg_unmap(&req->sg);
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    if (cq->tail != cq->head) {
        nvme_irq_assert(n, cq);
    }
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
                                          req->status);

    if (req->status) {
        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
                                      req->status, req->cmd.opcode);
    }

    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}

static void nvme_process_aers(void *opaque)
{
    NvmeCtrl *n = opaque;
    NvmeAsyncEvent *event, *next;

    trace_pci_nvme_process_aers(n->aer_queued);

    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
        NvmeRequest *req;
        NvmeAerResult *result;

        /* can't post cqe if there is nothing to complete */
        if (!n->outstanding_aers) {
            trace_pci_nvme_no_outstanding_aers();
            break;
        }

        /* ignore if masked (cqe posted, but event not cleared) */
        if (n->aer_mask & (1 << event->result.event_type)) {
            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
            continue;
        }

        QTAILQ_REMOVE(&n->aer_queue, event, entry);
        n->aer_queued--;

        n->aer_mask |= 1 << event->result.event_type;
        n->outstanding_aers--;

        req = n->aer_reqs[n->outstanding_aers];

        result = (NvmeAerResult *) &req->cqe.result;
        result->event_type = event->result.event_type;
        result->event_info = event->result.event_info;
        result->log_page = event->result.log_page;
        g_free(event);

        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
                                    result->log_page);

        nvme_enqueue_req_completion(&n->admin_cq, req);
    }
}

static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
                               uint8_t event_info, uint8_t log_page)
{
    NvmeAsyncEvent *event;

    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);

    if (n->aer_queued == n->params.aer_max_queued) {
        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
        return;
    }

    event = g_new(NvmeAsyncEvent, 1);
    event->result = (NvmeAerResult) {
        .event_type = event_type,
        .event_info = event_info,
        .log_page = log_page,
    };

    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
    n->aer_queued++;

    nvme_process_aers(n);
}

static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
{
    uint8_t aer_info;

    /* skip events the host has not enabled in the async event config */
    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
        return;
    }

    switch (event) {
    case NVME_SMART_SPARE:
        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
        break;
    case NVME_SMART_TEMPERATURE:
        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
        break;
    case NVME_SMART_RELIABILITY:
    case NVME_SMART_MEDIA_READ_ONLY:
    case NVME_SMART_FAILED_VOLATILE_MEDIA:
    case NVME_SMART_PMR_UNRELIABLE:
        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
        break;
    default:
        return;
    }

    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
}

static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
{
    n->aer_mask &= ~(1 << event_type);
    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }
}

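/* Check a transfer length against the Maximum Data Transfer Size (MDTS). */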
static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
{
    uint8_t mdts = n->params.mdts;

    if (mdts && len > n->page_size << mdts) {
        trace_pci_nvme_err_mdts(len);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
                                         uint32_t nlb)
{
    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);

    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
        return NVME_LBA_RANGE | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb)
{
    BlockDriverState *bs = blk_bs(ns->blkconf.blk);

    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    bool zeroed;
    int ret;

    Error *local_err = NULL;

    /*
     * `pnum` holds the number of bytes after offset that shares the same
     * allocation status as the byte at offset. If `pnum` is different from
     * `bytes`, we should check the allocation status of the next range and
     * the extent of the original range is considered fragmented.
     */
    do {
        bytes -= pnum;

        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
        if (ret < 0) {
            error_setg_errno(&local_err, -ret, "unable to get block status");
            error_report_err(local_err);

            return NVME_INTERNAL_DEV_ERROR;
        }

        zeroed = !!(ret & BDRV_BLOCK_ZERO);

        trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);

        if (zeroed) {
            return NVME_DULB;
        }

        offset += pnum;
    } while (pnum != bytes);

    return NVME_SUCCESS;
}

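/*
 * Map a failed AIO to an NVMe status code based on the opcode of the failed
 * command and record it on the request.
 */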
static void nvme_aio_err(NvmeRequest *req, int ret)
{
    uint16_t status = NVME_SUCCESS;
    Error *local_err = NULL;

    switch (req->cmd.opcode) {
    case NVME_CMD_READ:
        status = NVME_UNRECOVERED_READ;
        break;
    case NVME_CMD_FLUSH:
    case NVME_CMD_WRITE:
    case NVME_CMD_WRITE_ZEROES:
    case NVME_CMD_ZONE_APPEND:
        status = NVME_WRITE_FAULT;
        break;
    default:
        status = NVME_INTERNAL_DEV_ERROR;
        break;
    }

    trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);

    error_setg_errno(&local_err, -ret, "aio failed");
    error_report_err(local_err);

    /*
     * Set the command status code to the first encountered error but allow a
     * subsequent Internal Device Error to trump it.
     */
    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
        return;
    }

    req->status = status;
}

static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
{
    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
                                    slba / ns->zone_size;
}

static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
{
    uint32_t zone_idx = nvme_zone_idx(ns, slba);

    assert(zone_idx < ns->num_zones);
    return &ns->zone_array[zone_idx];
}

static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
{
    uint64_t zslba = zone->d.zslba;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_FULL:
        trace_pci_nvme_err_zone_is_full(zslba);
        return NVME_ZONE_FULL;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zslba);
        return NVME_ZONE_OFFLINE;
    case NVME_ZONE_STATE_READ_ONLY:
        trace_pci_nvme_err_zone_is_read_only(zslba);
        return NVME_ZONE_READ_ONLY;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
                                      uint64_t slba, uint32_t nlb)
{
    uint64_t zcap = nvme_zone_wr_boundary(zone);
    uint16_t status;

    status = nvme_check_zone_state_for_write(zone);
    if (status) {
        return status;
    }

    if (unlikely(slba != zone->w_ptr)) {
        trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
        return NVME_ZONE_INVALID_WRITE;
    }

    if (unlikely((slba + nlb) > zcap)) {
        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
        return NVME_ZONE_BOUNDARY_ERROR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_FULL:
    case NVME_ZONE_STATE_CLOSED:
    case NVME_ZONE_STATE_READ_ONLY:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
        return NVME_ZONE_OFFLINE;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
                                     uint32_t nlb)
{
    NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
    uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
    uint64_t end = slba + nlb;
    uint16_t status;

    status = nvme_check_zone_state_for_read(zone);
    if (status) {
        ;
    } else if (unlikely(end > bndry)) {
        if (!ns->params.cross_zone_read) {
            status = NVME_ZONE_BOUNDARY_ERROR;
        } else {
            /*
             * Read across zone boundary: check that all subsequent zones
             * that are being read have an appropriate state.
             */
            do {
                zone++;
                status = nvme_check_zone_state_for_read(zone);
                if (status) {
                    break;
                }
            } while (end > nvme_zone_rd_boundary(ns, zone));
        }
    }

    return status;
}

static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_FULL:
        return NVME_SUCCESS;

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fall through */
    case NVME_ZONE_STATE_EMPTY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
{
    NvmeZone *zone;

    if (ns->params.max_open_zones &&
        ns->nr_open_zones == ns->params.max_open_zones) {
        zone = QTAILQ_FIRST(&ns->imp_open_zones);
        if (zone) {
            /*
             * Automatically close this implicitly open zone.
             */
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            nvme_zrm_close(ns, zone);
        }
    }
}

static uint16_t __nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone,
                                bool implicit)
{
    int act = 0;
    uint16_t status;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
        act = 1;

        /* fall through */

    case NVME_ZONE_STATE_CLOSED:
        nvme_zrm_auto_transition_zone(ns);
        status = nvme_aor_check(ns, act, 1);
        if (status) {
            return status;
        }

        if (act) {
            nvme_aor_inc_active(ns);
        }

        nvme_aor_inc_open(ns);

        if (implicit) {
            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
            return NVME_SUCCESS;
        }

        /* fall through */

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        if (implicit) {
            return NVME_SUCCESS;
        }

        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);

        /* fall through */

    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone)
{
    return __nvme_zrm_open(ns, zone, true);
}

static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone)
{
    return __nvme_zrm_open(ns, zone, false);
}

static void __nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
                                   uint32_t nlb)
{
    zone->d.wp += nlb;

    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
        nvme_zrm_finish(ns, zone);
    }
}

static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeZone *zone;
    uint64_t slba;
    uint32_t nlb;

    slba = le64_to_cpu(rw->slba);
    nlb = le16_to_cpu(rw->nlb) + 1;
    zone = nvme_get_zone_by_slba(ns, slba);

    __nvme_advance_zone_wp(ns, zone, nlb);
}

static inline bool nvme_is_write(NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;

    return rw->opcode == NVME_CMD_WRITE ||
           rw->opcode == NVME_CMD_ZONE_APPEND ||
           rw->opcode == NVME_CMD_WRITE_ZEROES;
}

static void nvme_misc_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

void nvme_rw_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    if (ns->params.zoned && nvme_is_write(req)) {
        nvme_finalize_zoned_write(ns, req);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

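/*
 * First-stage read/write completion: if the namespace carries metadata in a
 * separate buffer, start the metadata transfer (or metadata zero-out for
 * Write Zeroes) and finish via nvme_rw_complete_cb; otherwise complete now.
 */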
static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    if (nvme_msize(ns)) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
        uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);

        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
            size_t mlen = nvme_m2b(ns, nlb);

            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
                                               BDRV_REQ_MAY_UNMAP,
                                               nvme_rw_complete_cb, req);
            return;
        }

        if (nvme_ns_ext(ns) || req->cmd.mptr) {
            uint16_t status;

            nvme_sg_unmap(&req->sg);
            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
            if (status) {
                ret = -EFAULT;
                goto out;
            }

            if (req->cmd.opcode == NVME_CMD_READ) {
                return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
            }

            return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
        }
    }

out:
    nvme_rw_complete_cb(req, ret);
}

struct nvme_aio_format_ctx {
    NvmeRequest *req;
    NvmeNamespace *ns;

    /* number of outstanding write zeroes for this namespace */
    int *count;
};

static void nvme_aio_format_cb(void *opaque, int ret)
{
    struct nvme_aio_format_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = ctx->ns;
    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
    int *count = ctx->count;

    g_free(ctx);

    if (ret) {
        nvme_aio_err(req, ret);
    }

    if (--(*count)) {
        return;
    }

    g_free(count);
    ns->status = 0x0;

    if (--(*num_formats)) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

struct nvme_aio_flush_ctx {
    NvmeRequest *req;
    NvmeNamespace *ns;
    BlockAcctCookie acct;
};

static void nvme_aio_flush_cb(void *opaque, int ret)
{
    struct nvme_aio_flush_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;

    BlockBackend *blk = ctx->ns->blkconf.blk;
    BlockAcctCookie *acct = &ctx->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));

    if (!ret) {
        block_acct_done(stats, acct);
    } else {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    }

    (*num_flushes)--;
    g_free(ctx);

    if (*num_flushes) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    uint16_t status;

    trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
                             appmask, reftag);

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(stats, acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
                                       ctx->mdata.iov.size, slba);
        if (status) {
            req->status = status;
            goto out;
        }

        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                     ctx->mdata.bounce, ctx->mdata.iov.size,
                                     ctrl, slba, apptag, appmask, reftag);
    }

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_mdata_in_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t mlen = nvme_m2b(ns, nlb);
    uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_reset(&ctx->mdata.iov);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                nvme_verify_cb, ctx);
    return;

out:
    nvme_verify_cb(ctx, ret);
}

static void nvme_aio_discard_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    uintptr_t *discards = (uintptr_t *)&req->opaque;

    trace_pci_nvme_aio_discard_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    (*discards)--;

    if (*discards) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

struct nvme_zone_reset_ctx {
    NvmeRequest *req;
    NvmeZone *zone;
};

static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret)
{
    struct nvme_zone_reset_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeZone *zone = ctx->zone;
    uintptr_t *resets = (uintptr_t *)&req->opaque;

    if (ret) {
        nvme_aio_err(req, ret);
        goto out;
    }

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fall through */
    case NVME_ZONE_STATE_FULL:
        zone->w_ptr = zone->d.zslba;
        zone->d.wp = zone->w_ptr;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        /* fall through */
    default:
        break;
    }

out:
    g_free(ctx);

    (*resets)--;

    if (*resets) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_aio_zone_reset_cb(void *opaque, int ret)
{
    struct nvme_zone_reset_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeZone *zone = ctx->zone;

    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);

    if (ret) {
        goto out;
    }

    if (nvme_msize(ns)) {
        int64_t offset = ns->mdata_offset + nvme_m2b(ns, zone->d.zslba);

        blk_aio_pwrite_zeroes(ns->blkconf.blk, offset,
                              nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
                              nvme_aio_zone_reset_complete_cb, ctx);
        return;
    }

out:
    nvme_aio_zone_reset_complete_cb(opaque, ret);
}

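/* Per-command Copy state: bounce buffers and the number of reads in flight. */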
struct nvme_copy_ctx {
    int copies;
    uint8_t *bounce;
    uint8_t *mbounce;
    uint32_t nlb;
    NvmeCopySourceRange *ranges;
};

struct nvme_copy_in_ctx {
    NvmeRequest *req;
    QEMUIOVector iov;
    NvmeCopySourceRange *range;
};

static void nvme_copy_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    struct nvme_copy_ctx *ctx = req->opaque;

    if (ret) {
        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);

out:
    if (ns->params.zoned) {
        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
        uint64_t sdlba = le64_to_cpu(copy->sdlba);
        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);

        __nvme_advance_zone_wp(ns, zone, ctx->nlb);
    }

    g_free(ctx->bounce);
    g_free(ctx->mbounce);
    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_copy_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    struct nvme_copy_ctx *ctx = req->opaque;

    trace_pci_nvme_copy_cb(nvme_cid(req));

    if (ret) {
        goto out;
    }

    if (nvme_msize(ns)) {
        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
        uint64_t sdlba = le64_to_cpu(copy->sdlba);
        int64_t offset = ns->mdata_offset + nvme_m2b(ns, sdlba);

        qemu_iovec_reset(&req->sg.iov);
        qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb));

        req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0,
                                     nvme_copy_complete_cb, req);
        return;
    }

out:
    nvme_copy_complete_cb(opaque, ret);
}

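/*
 * All source ranges have been read into the bounce buffers; verify end-to-end
 * protection information if enabled, re-check destination bounds and zone
 * state, and write the bounced data out to the destination LBA.
 */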
static void nvme_copy_in_complete(NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
    struct nvme_copy_ctx *ctx = req->opaque;
    uint64_t sdlba = le64_to_cpu(copy->sdlba);
    uint16_t status;

    trace_pci_nvme_copy_in_complete(nvme_cid(req));

    block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        uint16_t prinfor = (copy->control[0] >> 4) & 0xf;
        uint16_t prinfow = (copy->control[2] >> 2) & 0xf;
        uint16_t nr = copy->nr + 1;
        NvmeCopySourceRange *range;
        uint64_t slba;
        uint32_t nlb;
        uint16_t apptag, appmask;
        uint32_t reftag;
        uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce;
        size_t len, mlen;
        int i;

        /*
         * The DIF helpers expect prinfo to look like the control field of
         * the NvmeRwCmd, so shift by 10 to fake it.
         */
        prinfor = prinfor << 10;
        prinfow = prinfow << 10;

        for (i = 0; i < nr; i++) {
            range = &ctx->ranges[i];
            slba = le64_to_cpu(range->slba);
            nlb = le16_to_cpu(range->nlb) + 1;
            len = nvme_l2b(ns, nlb);
            mlen = nvme_m2b(ns, nlb);
            apptag = le16_to_cpu(range->apptag);
            appmask = le16_to_cpu(range->appmask);
            reftag = le32_to_cpu(range->reftag);

            status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba,
                                    apptag, appmask, reftag);
            if (status) {
                goto invalid;
            }

            buf += len;
            mbuf += mlen;
        }

        apptag = le16_to_cpu(copy->apptag);
        appmask = le16_to_cpu(copy->appmask);
        reftag = le32_to_cpu(copy->reftag);

        if (prinfow & NVME_RW_PRINFO_PRACT) {
            size_t len = nvme_l2b(ns, ctx->nlb);
            size_t mlen = nvme_m2b(ns, ctx->nlb);

            status = nvme_check_prinfo(ns, prinfow, sdlba, reftag);
            if (status) {
                goto invalid;
            }

            nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce,
                                        mlen, apptag, reftag);
        } else {
            status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen,
                                    prinfow, sdlba, apptag, appmask, reftag);
            if (status) {
                goto invalid;
            }
        }
    }

    status = nvme_check_bounds(ns, sdlba, ctx->nlb);
    if (status) {
        trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
        goto invalid;
    }

    if (ns->params.zoned) {
        NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);

        status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb);
        if (status) {
            goto invalid;
        }

        status = nvme_zrm_auto(ns, zone);
        if (status) {
            goto invalid;
        }

        zone->w_ptr += ctx->nlb;
    }

    qemu_iovec_init(&req->sg.iov, 1);
    qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));

    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
                     BLOCK_ACCT_WRITE);

    req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
                                 &req->sg.iov, 0, nvme_copy_cb, req);

    return;

invalid:
    req->status = status;

    g_free(ctx->bounce);
    g_free(ctx->mbounce);
    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_aio_copy_in_cb(void *opaque, int ret)
{
    struct nvme_copy_in_ctx *in_ctx = opaque;
    NvmeRequest *req = in_ctx->req;
    NvmeNamespace *ns = req->ns;
    struct nvme_copy_ctx *ctx = req->opaque;

    qemu_iovec_destroy(&in_ctx->iov);
    g_free(in_ctx);

    trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    ctx->copies--;

    if (ctx->copies) {
        return;
    }

    if (req->status) {
        block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);

        g_free(ctx->bounce);
        g_free(ctx->mbounce);
        g_free(ctx);

        nvme_enqueue_req_completion(nvme_cq(req), req);

        return;
    }

    nvme_copy_in_complete(req);
}

struct nvme_compare_ctx {
    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } mdata;
};

static void nvme_compare_mdata_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    uint16_t status = NVME_SUCCESS;

    trace_pci_nvme_compare_mdata_cb(nvme_cid(req));

    buf = g_malloc(ctx->mdata.iov.size);

    status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
                               NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        uint64_t slba = le64_to_cpu(rw->slba);
        uint8_t *bufp;
        uint8_t *mbufp = ctx->mdata.bounce;
        uint8_t *end = mbufp + ctx->mdata.iov.size;
        size_t msize = nvme_msize(ns);
        int16_t pil = 0;

        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
                                slba, apptag, appmask, reftag);
        if (status) {
            req->status = status;
            goto out;
        }
2400
2401
2402
2403
2404
2405 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2406 pil = nvme_msize(ns) - sizeof(NvmeDifTuple);
2407 }
2408
2409 for (bufp = buf; mbufp < end; bufp += msize, mbufp += msize) {
2410 if (memcmp(bufp + pil, mbufp + pil, msize - pil)) {
2411 req->status = NVME_CMP_FAILURE;
2412 goto out;
2413 }
2414 }
2415
2416 goto out;
2417 }
2418
2419 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2420 req->status = NVME_CMP_FAILURE;
2421 goto out;
2422 }
2423
2424out:
2425 qemu_iovec_destroy(&ctx->data.iov);
2426 g_free(ctx->data.bounce);
2427
2428 qemu_iovec_destroy(&ctx->mdata.iov);
2429 g_free(ctx->mdata.bounce);
2430
2431 g_free(ctx);
2432
2433 nvme_enqueue_req_completion(nvme_cq(req), req);
2434}
2435
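/*
 * Completion callback for the data portion of a Compare command; on a match,
 * any metadata is read in and checked in nvme_compare_mdata_cb().
 */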
static void nvme_compare_data_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    uint16_t status;

    trace_pci_nvme_compare_data_cb(nvme_cid(req));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->data.iov.size);

    status = nvme_bounce_data(n, buf, ctx->data.iov.size,
                              NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

    if (nvme_msize(ns)) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
        size_t mlen = nvme_m2b(ns, nlb);
        uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);

        ctx->mdata.bounce = g_malloc(mlen);

        qemu_iovec_init(&ctx->mdata.iov, 1);
        qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

        req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                    nvme_compare_mdata_cb, req);
        return;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);
    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

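/*
 * Dataset Management; only the Deallocate (AD) attribute has any effect,
 * other attributes are ignored.
 */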
static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;

    uint32_t attr = le32_to_cpu(dsm->attributes);
    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;

    uint16_t status = NVME_SUCCESS;

    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);

    if (attr & NVME_DSMGMT_AD) {
        int64_t offset;
        size_t len;
        NvmeDsmRange range[nr];
        uintptr_t *discards = (uintptr_t *)&req->opaque;

        status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
        if (status) {
            return status;
        }

        /*
         * The discard callbacks may be invoked immediately, so initialize
         * the counter to 1 to make sure the request is not completed before
         * all discards have been issued.
         */
        *discards = 1;

        for (int i = 0; i < nr; i++) {
            uint64_t slba = le64_to_cpu(range[i].slba);
            uint32_t nlb = le32_to_cpu(range[i].nlb);

            if (nvme_check_bounds(ns, slba, nlb)) {
                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
                                                     ns->id_ns.nsze);
                continue;
            }

            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
                                          nlb);

            if (nlb > n->dmrsl) {
                trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
            }

            offset = nvme_l2b(ns, slba);
            len = nvme_l2b(ns, nlb);

            while (len) {
                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);

                (*discards)++;

                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
                                 nvme_aio_discard_cb, req);

                offset += bytes;
                len -= bytes;
            }
        }

        /* account for the 1-initialization */
        (*discards)--;

        if (*discards) {
            status = NVME_NO_COMPLETE;
        } else {
            status = req->status;
        }
    }

    return status;
}

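/*
 * Verify; the data is read into a bounce buffer and checked in the
 * completion path (see nvme_verify_mdata_in_cb), but never transferred to
 * the host.
 */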
static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t len = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    NvmeBounceContext *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_check_prinfo(ns, ctrl, slba, reftag);
        if (status) {
            return status;
        }

        if (ctrl & NVME_RW_PRINFO_PRACT) {
            return NVME_INVALID_PROT_INFO | NVME_DNR;
        }
    }

    if (len > n->page_size << n->params.vsl) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    ctx = g_new0(NvmeBounceContext, 1);
    ctx->req = req;

    ctx->data.bounce = g_malloc(len);

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);

    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
                     BLOCK_ACCT_READ);

    req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
                                nvme_verify_mdata_in_cb, ctx);
    return NVME_NO_COMPLETE;
}

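/*
 * Simple Copy; the source ranges are read into a bounce buffer (one aio per
 * data and metadata range) and, once all reads have completed,
 * nvme_copy_in_complete() checks any protection information and writes the
 * bounce buffer to the destination.
 */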
static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;

    uint16_t nr = copy->nr + 1;
    uint8_t format = copy->control[0] & 0xf;

    /*
     * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the
     * NVME_RW_PRINFO constants.
     */
    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10;
    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10;

    uint32_t nlb = 0;
    uint8_t *bounce = NULL, *bouncep = NULL;
    uint8_t *mbounce = NULL, *mbouncep = NULL;
    struct nvme_copy_ctx *ctx;
    uint16_t status;
    int i;

    trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
        ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!(n->id_ctrl.ocfs & (1 << format))) {
        trace_pci_nvme_err_copy_invalid_format(format);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nr > ns->id_ns.msrc + 1) {
        return NVME_CMD_SIZE_LIMIT | NVME_DNR;
    }

    ctx = g_new(struct nvme_copy_ctx, 1);
    ctx->ranges = g_new(NvmeCopySourceRange, nr);

    status = nvme_h2c(n, (uint8_t *)ctx->ranges,
                      nr * sizeof(NvmeCopySourceRange), req);
    if (status) {
        goto out;
    }

    for (i = 0; i < nr; i++) {
        uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
        uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;

        if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
            status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
            goto out;
        }

        status = nvme_check_bounds(ns, slba, _nlb);
        if (status) {
            trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze);
            goto out;
        }

        if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
            status = nvme_check_dulbe(ns, slba, _nlb);
            if (status) {
                goto out;
            }
        }

        if (ns->params.zoned) {
            status = nvme_check_zone_read(ns, slba, _nlb);
            if (status) {
                goto out;
            }
        }

        nlb += _nlb;
    }

    if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
        goto out;
    }

    bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
    if (nvme_msize(ns)) {
        mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb));
    }

    block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
                     BLOCK_ACCT_READ);

    ctx->bounce = bounce;
    ctx->mbounce = mbounce;
    ctx->nlb = nlb;
    ctx->copies = 1;

    req->opaque = ctx;

    for (i = 0; i < nr; i++) {
        uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
        uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;

        size_t len = nvme_l2b(ns, nlb);
        int64_t offset = nvme_l2b(ns, slba);

        trace_pci_nvme_copy_source_range(slba, nlb);

        struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1);
        in_ctx->req = req;

        qemu_iovec_init(&in_ctx->iov, 1);
        qemu_iovec_add(&in_ctx->iov, bouncep, len);

        ctx->copies++;

        blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
                       nvme_aio_copy_in_cb, in_ctx);

        bouncep += len;

        if (nvme_msize(ns)) {
            len = nvme_m2b(ns, nlb);
            offset = ns->mdata_offset + nvme_m2b(ns, slba);

            in_ctx = g_new(struct nvme_copy_in_ctx, 1);
            in_ctx->req = req;

            qemu_iovec_init(&in_ctx->iov, 1);
            qemu_iovec_add(&in_ctx->iov, mbouncep, len);

            ctx->copies++;

            blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
                           nvme_aio_copy_in_cb, in_ctx);

            mbouncep += len;
        }
    }

    /* account for the 1-initialization */
    ctx->copies--;

    if (!ctx->copies) {
        nvme_copy_in_complete(req);
    }

    return NVME_NO_COMPLETE;

out:
    g_free(ctx->ranges);
    g_free(ctx);

    return status;
}

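/*
 * Compare; the logical block data is read into a bounce buffer and compared
 * with the host buffer in the completion callbacks.
 */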
static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    uint16_t ctrl = le16_to_cpu(rw->control);
    size_t data_len = nvme_l2b(ns, nlb);
    size_t len = data_len;
    int64_t offset = nvme_l2b(ns, slba);
    struct nvme_compare_ctx *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) {
        return NVME_INVALID_PROT_INFO | NVME_DNR;
    }

    if (nvme_ns_ext(ns)) {
        len += nvme_m2b(ns, nlb);
    }

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    ctx = g_new(struct nvme_compare_ctx, 1);
    ctx->data.bounce = g_malloc(data_len);

    req->opaque = ctx;

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);

    block_acct_start(blk_get_stats(blk), &req->acct, data_len,
                     BLOCK_ACCT_READ);
    req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
                                nvme_compare_data_cb, req);

    return NVME_NO_COMPLETE;
}

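/*
 * Flush; a flush to the broadcast NSID (0xffffffff) is fanned out to all
 * attached namespaces, using the same 1-initialized counter scheme as
 * nvme_dsm() to prevent early completion.
 */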
static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
    uint16_t status;
    struct nvme_aio_flush_ctx *ctx;
    NvmeNamespace *ns;

    trace_pci_nvme_flush(nvme_cid(req), nsid);

    if (nsid != NVME_NSID_BROADCAST) {
        req->ns = nvme_ns(n, nsid);
        if (unlikely(!req->ns)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
                         BLOCK_ACCT_FLUSH);
        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
        return NVME_NO_COMPLETE;
    }

    /* 1-initialize; see the comment in nvme_dsm */
    *num_flushes = 1;

    for (int i = 1; i <= n->num_namespaces; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            continue;
        }

        ctx = g_new(struct nvme_aio_flush_ctx, 1);
        ctx->req = req;
        ctx->ns = ns;

        (*num_flushes)++;

        block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
                         BLOCK_ACCT_FLUSH);
        blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
    }

    /* account for the 1-initialization */
    (*num_flushes)--;

    if (*num_flushes) {
        status = NVME_NO_COMPLETE;
    } else {
        status = req->status;
    }

    return status;
}

static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = ctrl & NVME_RW_PRINFO_PRACT;

            /*
             * With PRACT set and 8 bytes of metadata, the protection
             * information is stripped by the controller, so only the data
             * is mapped from the host.
             */
            if (pract && nvme_msize(ns) == 8) {
                mapped_size = data_size;
            }
        }
    }

    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);

    status = nvme_check_mdts(n, mapped_size);
    if (status) {
        goto invalid;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
        goto invalid;
    }

    if (ns->params.zoned) {
        status = nvme_check_zone_read(ns, slba, nlb);
        if (status) {
            trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
            goto invalid;
        }
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    status = nvme_map_data(n, nlb, req);
    if (status) {
        goto invalid;
    }

    data_offset = nvme_l2b(ns, slba);

    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                     BLOCK_ACCT_READ);
    nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
    return NVME_NO_COMPLETE;

invalid:
    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
    return status | NVME_DNR;
}

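/* Common handler for Write, Write Zeroes (wrz) and Zone Append (append). */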
static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
                              bool wrz)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    NvmeZone *zone;
    NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = ctrl & NVME_RW_PRINFO_PRACT;

            /*
             * With PRACT set and 8 bytes of metadata, the protection
             * information is generated by the controller, so only the data
             * is mapped from the host.
             */
            if (pract && nvme_msize(ns) == 8) {
                mapped_size -= nvme_m2b(ns, nlb);
            }
        }
    }

    trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
                         nvme_nsid(ns), nlb, mapped_size, slba);

    if (!wrz) {
        status = nvme_check_mdts(n, mapped_size);
        if (status) {
            goto invalid;
        }
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
        goto invalid;
    }

    if (ns->params.zoned) {
        zone = nvme_get_zone_by_slba(ns, slba);

        if (append) {
            bool piremap = !!(ctrl & NVME_RW_PIREMAP);

            if (unlikely(slba != zone->d.zslba)) {
                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
                status = NVME_INVALID_FIELD;
                goto invalid;
            }

            if (n->params.zasl &&
                data_size > (uint64_t)n->page_size << n->params.zasl) {
                trace_pci_nvme_err_zasl(data_size);
                return NVME_INVALID_FIELD | NVME_DNR;
            }

            slba = zone->w_ptr;
            rw->slba = cpu_to_le64(slba);
            res->slba = cpu_to_le64(slba);

            switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            case NVME_ID_NS_DPS_TYPE_1:
                if (!piremap) {
                    return NVME_INVALID_PROT_INFO | NVME_DNR;
                }

                /* fallthrough */

            case NVME_ID_NS_DPS_TYPE_2:
                if (piremap) {
                    uint32_t reftag = le32_to_cpu(rw->reftag);
                    rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
                }

                break;

            case NVME_ID_NS_DPS_TYPE_3:
                if (piremap) {
                    return NVME_INVALID_PROT_INFO | NVME_DNR;
                }

                break;
            }
        }

        status = nvme_check_zone_write(ns, zone, slba, nlb);
        if (status) {
            goto invalid;
        }

        status = nvme_zrm_auto(ns, zone);
        if (status) {
            goto invalid;
        }

        zone->w_ptr += nlb;
    }

    data_offset = nvme_l2b(ns, slba);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    if (!wrz) {
        status = nvme_map_data(n, nlb, req);
        if (status) {
            goto invalid;
        }

        block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                         BLOCK_ACCT_WRITE);
        nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
    } else {
        req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
                                           BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
                                           req);
    }

    return NVME_NO_COMPLETE;

invalid:
    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
    return status | NVME_DNR;
}

static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, false, false);
}

static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, false, true);
}

static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, true, false);
}

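/*
 * Decode the zone slba from dwords 10/11 of a zone management command and
 * resolve it to a zone index.
 */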
static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
                                            uint64_t *slba, uint32_t *zone_idx)
{
    uint32_t dw10 = le32_to_cpu(c->cdw10);
    uint32_t dw11 = le32_to_cpu(c->cdw11);

    if (!ns->params.zoned) {
        trace_pci_nvme_err_invalid_opc(c->opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    *slba = ((uint64_t)dw11) << 32 | dw10;
    if (unlikely(*slba >= ns->id_ns.nsze)) {
        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
        *slba = 0;
        return NVME_LBA_RANGE | NVME_DNR;
    }

    *zone_idx = nvme_zone_idx(ns, *slba);
    assert(*zone_idx < ns->num_zones);

    return NVME_SUCCESS;
}

typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
                                 NvmeRequest *);

enum NvmeZoneProcessingMask {
    NVME_PROC_CURRENT_ZONE    = 0,
    NVME_PROC_OPENED_ZONES    = 1 << 0,
    NVME_PROC_CLOSED_ZONES    = 1 << 1,
    NVME_PROC_READ_ONLY_ZONES = 1 << 2,
    NVME_PROC_FULL_ZONES      = 1 << 3,
};

static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
                               NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_open(ns, zone);
}

static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
                                NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_close(ns, zone);
}

static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
                                 NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_finish(ns, zone);
}

static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
                                NvmeZoneState state, NvmeRequest *req)
{
    uintptr_t *resets = (uintptr_t *)&req->opaque;
    struct nvme_zone_reset_ctx *ctx;

    switch (state) {
    case NVME_ZONE_STATE_EMPTY:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
    case NVME_ZONE_STATE_FULL:
        break;
    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }

    /*
     * The zone reset aio callback needs to know the zone that is being
     * reset in order to transition the zone on completion.
     */
    ctx = g_new(struct nvme_zone_reset_ctx, 1);
    ctx->req = req;
    ctx->zone = zone;

    (*resets)++;

    blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
                          nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
                          nvme_aio_zone_reset_cb, ctx);

    return NVME_NO_COMPLETE;
}

static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
                                  NvmeZoneState state, NvmeRequest *req)
{
    switch (state) {
    case NVME_ZONE_STATE_READ_ONLY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
        /* fallthrough */
    case NVME_ZONE_STATE_OFFLINE:
        return NVME_SUCCESS;
    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
{
    uint16_t status;
    uint8_t state = nvme_get_zone_state(zone);

    if (state == NVME_ZONE_STATE_EMPTY) {
        status = nvme_aor_check(ns, 1, 0);
        if (status) {
            return status;
        }
        nvme_aor_inc_active(ns);
        zone->d.za |= NVME_ZA_ZD_EXT_VALID;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        return NVME_SUCCESS;
    }

    return NVME_ZONE_INVAL_TRANSITION;
}

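/* Apply op_hndlr to a zone if its current state is selected by proc_mask. */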
static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
                                    enum NvmeZoneProcessingMask proc_mask,
                                    op_handler_t op_hndlr, NvmeRequest *req)
{
    uint16_t status = NVME_SUCCESS;
    NvmeZoneState zs = nvme_get_zone_state(zone);
    bool proc_zone;

    switch (zs) {
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
        break;
    case NVME_ZONE_STATE_CLOSED:
        proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
        break;
    case NVME_ZONE_STATE_READ_ONLY:
        proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
        break;
    case NVME_ZONE_STATE_FULL:
        proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
        break;
    default:
        proc_zone = false;
    }

    if (proc_zone) {
        status = op_hndlr(ns, zone, zs, req);
    }

    return status;
}

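/*
 * Apply op_hndlr to the given zone or, if a processing mask is set (the
 * "Select All" case), to every zone in a state selected by proc_mask. Read
 * only zones are not kept on a dedicated list, so they require a scan of the
 * zone array.
 */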
static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
                                enum NvmeZoneProcessingMask proc_mask,
                                op_handler_t op_hndlr, NvmeRequest *req)
{
    NvmeZone *next;
    uint16_t status = NVME_SUCCESS;
    int i;

    if (!proc_mask) {
        status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
    } else {
        if (proc_mask & NVME_PROC_CLOSED_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
        if (proc_mask & NVME_PROC_OPENED_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }

            QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
        if (proc_mask & NVME_PROC_FULL_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }

        if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
            for (i = 0; i < ns->num_zones; i++, zone++) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
    }

out:
    return status;
}

static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    NvmeZone *zone;
    uintptr_t *resets;
    uint8_t *zd_ext;
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint64_t slba = 0;
    uint32_t zone_idx = 0;
    uint16_t status;
    uint8_t action;
    bool all;
    enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;

    action = dw13 & 0xff;
    all = dw13 & 0x100;

    req->status = NVME_SUCCESS;

    if (!all) {
        status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
        if (status) {
            return status;
        }
    }

    zone = &ns->zone_array[zone_idx];
    if (slba != zone->d.zslba) {
        trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    switch (action) {

    case NVME_ZONE_ACTION_OPEN:
        if (all) {
            proc_mask = NVME_PROC_CLOSED_ZONES;
        }
        trace_pci_nvme_open_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
        break;

    case NVME_ZONE_ACTION_CLOSE:
        if (all) {
            proc_mask = NVME_PROC_OPENED_ZONES;
        }
        trace_pci_nvme_close_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
        break;

    case NVME_ZONE_ACTION_FINISH:
        if (all) {
            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
        }
        trace_pci_nvme_finish_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
        break;

    case NVME_ZONE_ACTION_RESET:
        resets = (uintptr_t *)&req->opaque;

        if (all) {
            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
                NVME_PROC_FULL_ZONES;
        }
        trace_pci_nvme_reset_zone(slba, zone_idx, all);

        *resets = 1;

        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);

        /* account for the 1-initialization */
        (*resets)--;

        return *resets ? NVME_NO_COMPLETE : req->status;

    case NVME_ZONE_ACTION_OFFLINE:
        if (all) {
            proc_mask = NVME_PROC_READ_ONLY_ZONES;
        }
        trace_pci_nvme_offline_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
        break;

    case NVME_ZONE_ACTION_SET_ZD_EXT:
        trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
        if (all || !ns->params.zd_extension_size) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
        zd_ext = nvme_get_zd_extension(ns, zone_idx);
        status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
        if (status) {
            trace_pci_nvme_err_zd_extension_map_error(zone_idx);
            return status;
        }

        status = nvme_set_zd_ext(ns, zone);
        if (status == NVME_SUCCESS) {
            trace_pci_nvme_zd_extension_set(zone_idx);
            return status;
        }
        break;

    default:
        trace_pci_nvme_err_invalid_mgmt_action(action);
        status = NVME_INVALID_FIELD;
    }

    if (status == NVME_ZONE_INVAL_TRANSITION) {
        trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
                                                         zone->d.za);
    }
    if (status) {
        status |= NVME_DNR;
    }

    return status;
}

static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
{
    NvmeZoneState zs = nvme_get_zone_state(zl);

    switch (zafs) {
    case NVME_ZONE_REPORT_ALL:
        return true;
    case NVME_ZONE_REPORT_EMPTY:
        return zs == NVME_ZONE_STATE_EMPTY;
    case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
        return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
    case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
        return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
    case NVME_ZONE_REPORT_CLOSED:
        return zs == NVME_ZONE_STATE_CLOSED;
    case NVME_ZONE_REPORT_FULL:
        return zs == NVME_ZONE_STATE_FULL;
    case NVME_ZONE_REPORT_READ_ONLY:
        return zs == NVME_ZONE_STATE_READ_ONLY;
    case NVME_ZONE_REPORT_OFFLINE:
        return zs == NVME_ZONE_STATE_OFFLINE;
    default:
        return false;
    }
}

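/* Zone Management Receive (Report Zones / Extended Report Zones). */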
static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;

    uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint32_t zone_idx, zra, zrasf, partial;
    uint64_t max_zones, nr_zones = 0;
    uint16_t status;
    uint64_t slba;
    NvmeZoneDescr *z;
    NvmeZone *zone;
    NvmeZoneReportHeader *header;
    void *buf, *buf_p;
    size_t zone_entry_sz;
    int i;

    req->status = NVME_SUCCESS;

    status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
    if (status) {
        return status;
    }

    zra = dw13 & 0xff;
    if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    zrasf = (dw13 >> 8) & 0xff;
    if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (data_size < sizeof(NvmeZoneReportHeader)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    status = nvme_check_mdts(n, data_size);
    if (status) {
        return status;
    }

    partial = (dw13 >> 16) & 0x01;

    zone_entry_sz = sizeof(NvmeZoneDescr);
    if (zra == NVME_ZONE_REPORT_EXTENDED) {
        zone_entry_sz += ns->params.zd_extension_size;
    }

    max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
    buf = g_malloc0(data_size);

    zone = &ns->zone_array[zone_idx];
    for (i = zone_idx; i < ns->num_zones; i++) {
        if (partial && nr_zones >= max_zones) {
            break;
        }
        if (nvme_zone_matches_filter(zrasf, zone++)) {
            nr_zones++;
        }
    }
    header = (NvmeZoneReportHeader *)buf;
    header->nr_zones = cpu_to_le64(nr_zones);

    buf_p = buf + sizeof(NvmeZoneReportHeader);
    for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
        zone = &ns->zone_array[zone_idx];
        if (nvme_zone_matches_filter(zrasf, zone)) {
            z = (NvmeZoneDescr *)buf_p;
            buf_p += sizeof(NvmeZoneDescr);

            z->zt = zone->d.zt;
            z->zs = zone->d.zs;
            z->zcap = cpu_to_le64(zone->d.zcap);
            z->zslba = cpu_to_le64(zone->d.zslba);
            z->za = zone->d.za;

            if (nvme_wp_is_valid(zone)) {
                z->wp = cpu_to_le64(zone->d.wp);
            } else {
                z->wp = cpu_to_le64(~0ULL);
            }

            if (zra == NVME_ZONE_REPORT_EXTENDED) {
                if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
                    memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
                           ns->params.zd_extension_size);
                }
                buf_p += ns->params.zd_extension_size;
            }

            max_zones--;
        }
    }

    status = nvme_c2h(n, (uint8_t *)buf, data_size, req);

    g_free(buf);

    return status;
}

static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint16_t status;

    trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
                          req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));

    if (!nvme_nsid_valid(n, nsid)) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    /*
     * Flush is handled up front since, with the broadcast NSID
     * (0xffffffff), it may apply to all attached namespaces; all other
     * commands must resolve to a single valid namespace below.
     */
    if (req->cmd.opcode == NVME_CMD_FLUSH) {
        return nvme_flush(n, req);
    }

    req->ns = nvme_ns(n, nsid);
    if (unlikely(!req->ns)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    status = nvme_ns_status(req->ns);
    if (unlikely(status)) {
        return status;
    }

    switch (req->cmd.opcode) {
    case NVME_CMD_WRITE_ZEROES:
        return nvme_write_zeroes(n, req);
    case NVME_CMD_ZONE_APPEND:
        return nvme_zone_append(n, req);
    case NVME_CMD_WRITE:
        return nvme_write(n, req);
    case NVME_CMD_READ:
        return nvme_read(n, req);
    case NVME_CMD_COMPARE:
        return nvme_compare(n, req);
    case NVME_CMD_DSM:
        return nvme_dsm(n, req);
    case NVME_CMD_VERIFY:
        return nvme_verify(n, req);
    case NVME_CMD_COPY:
        return nvme_copy(n, req);
    case NVME_CMD_ZONE_MGMT_SEND:
        return nvme_zone_mgmt_send(n, req);
    case NVME_CMD_ZONE_MGMT_RECV:
        return nvme_zone_mgmt_recv(n, req);
    default:
        assert(false);
    }

    return NVME_INVALID_OPCODE | NVME_DNR;
}

static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
    n->sq[sq->sqid] = NULL;
    timer_free(sq->timer);
    g_free(sq->io_req);
    if (sq->sqid) {
        g_free(sq);
    }
}

static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
    NvmeRequest *r, *next;
    NvmeSQueue *sq;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);
    uint32_t nsid;

    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
        trace_pci_nvme_err_invalid_del_sq(qid);
        return NVME_INVALID_QID | NVME_DNR;
    }

    trace_pci_nvme_del_sq(qid);

    sq = n->sq[qid];
    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
        r = QTAILQ_FIRST(&sq->out_req_list);
        if (r->aiocb) {
            blk_aio_cancel(r->aiocb);
        }
    }

    /*
     * Drain all namespaces if there are still outstanding requests that we
     * could not cancel explicitly.
     */
    if (!QTAILQ_EMPTY(&sq->out_req_list)) {
        for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
            NvmeNamespace *ns = nvme_ns(n, nsid);
            if (ns) {
                nvme_ns_drain(ns);
            }
        }
    }

    assert(QTAILQ_EMPTY(&sq->out_req_list));

    if (!nvme_check_cqid(n, sq->cqid)) {
        cq = n->cq[sq->cqid];
        QTAILQ_REMOVE(&cq->sq_list, sq, entry);

        nvme_post_cqes(cq);
        QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
            if (r->sq == sq) {
                QTAILQ_REMOVE(&cq->req_list, r, entry);
                QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
            }
        }
    }

    nvme_free_sq(sq, n);
    return NVME_SUCCESS;
}

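/* Initialize a submission queue and link it to its completion queue. */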
static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t sqid, uint16_t cqid, uint16_t size)
{
    int i;
    NvmeCQueue *cq;

    sq->ctrl = n;
    sq->dma_addr = dma_addr;
    sq->sqid = sqid;
    sq->size = size;
    sq->cqid = cqid;
    sq->head = sq->tail = 0;
    sq->io_req = g_new0(NvmeRequest, sq->size);

    QTAILQ_INIT(&sq->req_list);
    QTAILQ_INIT(&sq->out_req_list);
    for (i = 0; i < sq->size; i++) {
        sq->io_req[i].sq = sq;
        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
    }
    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);

    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
    n->sq[sqid] = sq;
}

static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeSQueue *sq;
    NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;

    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t sqid = le16_to_cpu(c->sqid);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->sq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);

    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
        trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
        n->sq[sqid] != NULL)) {
        trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
        trace_pci_nvme_err_invalid_create_sq_size(qsize);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (unlikely(prp1 & (n->page_size - 1))) {
        trace_pci_nvme_err_invalid_create_sq_addr(prp1);
        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
    }
    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
        trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    sq = g_malloc0(sizeof(*sq));
    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
    return NVME_SUCCESS;
}

struct nvme_stats {
    uint64_t units_read;
    uint64_t units_written;
    uint64_t read_commands;
    uint64_t write_commands;
};

static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
{
    BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);

    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
    stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
    stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
}

static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    struct nvme_stats stats = { 0 };
    NvmeSmartLog smart = { 0 };
    uint32_t trans_len;
    NvmeNamespace *ns;
    time_t current_ms;

    if (off >= sizeof(smart)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nsid != 0xffffffff) {
        ns = nvme_ns(n, nsid);
        if (!ns) {
            return NVME_INVALID_NSID | NVME_DNR;
        }
        nvme_set_blk_stats(ns, &stats);
    } else {
        int i;

        for (i = 1; i <= n->num_namespaces; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }
            nvme_set_blk_stats(ns, &stats);
        }
    }

    trans_len = MIN(sizeof(smart) - off, buf_len);
    smart.critical_warning = n->smart_critical_warning;

    /* the data units fields are reported in thousands of 512 byte units */
    smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
                                                        1000));
    smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
                                                           1000));
    smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
    smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);

    smart.temperature = cpu_to_le16(n->temperature);

    if ((n->temperature >= n->features.temp_thresh_hi) ||
        (n->temperature <= n->features.temp_thresh_low)) {
        smart.critical_warning |= NVME_SMART_TEMPERATURE;
    }

    current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    smart.power_on_hours[0] =
        cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_SMART);
    }

    return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
}

static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
                                 NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeFwSlotInfoLog fw_log = {
        .afi = 0x1,
    };

    if (off >= sizeof(fw_log)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
    trans_len = MIN(sizeof(fw_log) - off, buf_len);

    return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
}

static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeErrorLog errlog;

    if (off >= sizeof(errlog)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
    }

    memset(&errlog, 0x0, sizeof(errlog));
    trans_len = MIN(sizeof(errlog) - off, buf_len);

    return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
}

static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                    uint64_t off, NvmeRequest *req)
{
    uint32_t nslist[1024];
    uint32_t trans_len;
    int i = 0;
    uint32_t nsid;

    memset(nslist, 0x0, sizeof(nslist));
    trans_len = MIN(sizeof(nslist) - off, buf_len);

    while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
            NVME_CHANGED_NSID_SIZE) {
        /*
         * If more than 1024 namespaces have changed, the first entry in the
         * log page shall be set to 0xffffffff and the remainder zeroed.
         */
        if (i == ARRAY_SIZE(nslist)) {
            memset(nslist, 0x0, sizeof(nslist));
            nslist[0] = 0xffffffff;
            break;
        }

        nslist[i++] = nsid;
        clear_bit(nsid, n->changed_nsids);
    }

    /*
     * If the list was truncated above, clear any remaining changed
     * namespace bits since they have all been reported as 0xffffffff.
     */
    if (nslist[0] == 0xffffffff) {
        bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
    }

    return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
}

static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
                                 uint64_t off, NvmeRequest *req)
{
    NvmeEffectsLog log = {};
    const uint32_t *src_iocs = NULL;
    uint32_t trans_len;

    if (off >= sizeof(log)) {
        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    switch (NVME_CC_CSS(n->bar.cc)) {
    case NVME_CC_CSS_NVM:
        src_iocs = nvme_cse_iocs_nvm;
        /* fallthrough */
    case NVME_CC_CSS_ADMIN_ONLY:
        break;
    case NVME_CC_CSS_CSI:
        switch (csi) {
        case NVME_CSI_NVM:
            src_iocs = nvme_cse_iocs_nvm;
            break;
        case NVME_CSI_ZONED:
            src_iocs = nvme_cse_iocs_zoned;
            break;
        }
    }

    memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));

    if (src_iocs) {
        memcpy(log.iocs, src_iocs, sizeof(log.iocs));
    }

    trans_len = MIN(sizeof(log) - off, buf_len);

    return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
}

static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;

    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t dw12 = le32_to_cpu(cmd->cdw12);
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint8_t lid = dw10 & 0xff;
    uint8_t lsp = (dw10 >> 8) & 0xf;
    uint8_t rae = (dw10 >> 15) & 0x1;
    uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
    uint32_t numdl, numdu;
    uint64_t off, lpol, lpou;
    size_t len;
    uint16_t status;

    numdl = (dw10 >> 16);
    numdu = (dw11 & 0xffff);
    lpol = dw12;
    lpou = dw13;

    len = (((numdu << 16) | numdl) + 1) << 2;
    off = (lpou << 32ULL) | lpol;

    if (off & 0x3) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;
    }

    switch (lid) {
    case NVME_LOG_ERROR_INFO:
        return nvme_error_info(n, rae, len, off, req);
    case NVME_LOG_SMART_INFO:
        return nvme_smart_info(n, rae, len, off, req);
    case NVME_LOG_FW_SLOT_INFO:
        return nvme_fw_log_info(n, len, off, req);
    case NVME_LOG_CHANGED_NSLIST:
        return nvme_changed_nslist(n, rae, len, off, req);
    case NVME_LOG_CMD_EFFECTS:
        return nvme_cmd_effects(n, csi, len, off, req);
    default:
        trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
        return NVME_INVALID_FIELD | NVME_DNR;
    }
}

static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
    n->cq[cq->cqid] = NULL;
    timer_free(cq->timer);
    if (msix_enabled(&n->parent_obj)) {
        msix_vector_unuse(&n->parent_obj, cq->vector);
    }
    if (cq->cqid) {
        g_free(cq);
    }
}

static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
        trace_pci_nvme_err_invalid_del_cq_cqid(qid);
        return NVME_INVALID_CQID | NVME_DNR;
    }

    cq = n->cq[qid];
    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
        trace_pci_nvme_err_invalid_del_cq_notempty(qid);
        return NVME_INVALID_QUEUE_DEL;
    }
    nvme_irq_deassert(n, cq);
    trace_pci_nvme_del_cq(qid);
    nvme_free_cq(cq, n);
    return NVME_SUCCESS;
}

static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t cqid, uint16_t vector, uint16_t size,
                         uint16_t irq_enabled)
{
    int ret;

    if (msix_enabled(&n->parent_obj)) {
        ret = msix_vector_use(&n->parent_obj, vector);
        assert(ret == 0);
    }
    cq->ctrl = n;
    cq->cqid = cqid;
    cq->size = size;
    cq->dma_addr = dma_addr;
    cq->phase = 1;
    cq->irq_enabled = irq_enabled;
    cq->vector = vector;
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
    n->cq[cqid] = cq;
    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}

static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCQueue *cq;
    NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t vector = le16_to_cpu(c->irq_vector);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->cq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
                             NVME_CQ_FLAGS_IEN(qflags) != 0);

    if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
        n->cq[cqid] != NULL)) {
        trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
        trace_pci_nvme_err_invalid_create_cq_size(qsize);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (unlikely(prp1 & (n->page_size - 1))) {
        trace_pci_nvme_err_invalid_create_cq_addr(prp1);
        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
    }
    if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
        trace_pci_nvme_err_invalid_create_cq_vector(vector);
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (unlikely(vector >= n->params.msix_qsize)) {
        trace_pci_nvme_err_invalid_create_cq_vector(vector);
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
        trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    cq = g_malloc0(sizeof(*cq));
    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
                 NVME_CQ_FLAGS_IEN(qflags));

    /*
     * It is only required to set qs_created when creating a completion
     * queue; creating a submission queue without a matching completion
     * queue will fail.
     */
    n->qs_created = true;
    return NVME_SUCCESS;
}

static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};

    return nvme_c2h(n, id, sizeof(id), req);
}

static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
{
    switch (ns->csi) {
    case NVME_CSI_NVM:
    case NVME_CSI_ZONED:
        return true;
    }
    return false;
}

static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_identify_ctrl();

    return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
}

static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
    NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;

    trace_pci_nvme_identify_ctrl_csi(c->csi);

    switch (c->csi) {
    case NVME_CSI_NVM:
        id_nvm->vsl = n->params.vsl;
        id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
        break;

    case NVME_CSI_ZONED:
        ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
        break;

    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return nvme_c2h(n, id, sizeof(id), req);
}

static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);

    trace_pci_nvme_identify_ns(nsid);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        if (!active) {
            ns = nvme_subsys_ns(n->subsys, nsid);
            if (!ns) {
                return nvme_rpt_empty_id_struct(n, req);
            }
        } else {
            return nvme_rpt_empty_id_struct(n, req);
        }
    }

    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
        return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
    }

    return NVME_INVALID_CMD_SET | NVME_DNR;
}

static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint16_t min_id = le16_to_cpu(c->ctrlid);
    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
    uint16_t *ids = &list[1];
    NvmeNamespace *ns;
    NvmeCtrl *ctrl;
    int cntlid, nr_ids = 0;

    trace_pci_nvme_identify_ns_attached_list(min_id);

    if (c->nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    ns = nvme_subsys_ns(n->subsys, c->nsid);
    if (!ns) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
        ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
        if (!ctrl) {
            continue;
        }

        if (!nvme_ns(ctrl, c->nsid)) {
            continue;
        }

        ids[nr_ids++] = cntlid;
    }

    list[0] = nr_ids;

    return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
}

static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
                                     bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);

    trace_pci_nvme_identify_ns_csi(nsid, c->csi);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        if (!active) {
            ns = nvme_subsys_ns(n->subsys, nsid);
            if (!ns) {
                return nvme_rpt_empty_id_struct(n, req);
            }
        } else {
            return nvme_rpt_empty_id_struct(n, req);
        }
    }

    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
        return nvme_rpt_empty_id_struct(n, req);
    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
        return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
                        req);
    }

    return NVME_INVALID_FIELD | NVME_DNR;
}

static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
                                     bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t min_nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);
    uint32_t *list_ptr = (uint32_t *)list;
    int i, j = 0;

    trace_pci_nvme_identify_nslist(min_nsid);

    /*
     * Both 0xffffffff (NVME_NSID_BROADCAST) and 0xfffffffe are invalid
     * values since the Active Namespace ID List should return namespaces
     * with ids strictly greater than the given nsid, so these can never be
     * valid starting values.
     */
    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    for (i = 1; i <= n->num_namespaces; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            if (!active) {
                ns = nvme_subsys_ns(n->subsys, i);
                if (!ns) {
                    continue;
                }
            } else {
                continue;
            }
        }
        if (ns->params.nsid <= min_nsid) {
            continue;
        }
        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
        if (j == data_len / sizeof(uint32_t)) {
            break;
        }
    }

    return nvme_c2h(n, list, data_len, req);
}

static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
                                         bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t min_nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);
    uint32_t *list_ptr = (uint32_t *)list;
    int i, j = 0;

    trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);

    /* same restrictions on the minimum nsid as in nvme_identify_nslist() */
    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    for (i = 1; i <= n->num_namespaces; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            if (!active) {
                ns = nvme_subsys_ns(n->subsys, i);
                if (!ns) {
                    continue;
                }
            } else {
                continue;
            }
        }
        if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
            continue;
        }
        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
        if (j == data_len / sizeof(uint32_t)) {
            break;
        }
    }

    return nvme_c2h(n, list, data_len, req);
}

static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};

    struct data {
        struct {
            NvmeIdNsDescr hdr;
            uint8_t v[NVME_NIDL_UUID];
        } uuid;
        struct {
            NvmeIdNsDescr hdr;
            uint8_t v;
        } csi;
    };

    struct data *ns_descrs = (struct data *)list;

    trace_pci_nvme_identify_ns_descr_list(nsid);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    /*
     * The Identify Namespace data structure reports zeroed NGUID and EUI64
     * fields, so a Namespace UUID descriptor (NVME_NIDT_UUID) is reported
     * here instead, followed by the Command Set Identifier descriptor.
     */
    ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
    ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
    memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);

    ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI;
    ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI;
    ns_descrs->csi.v = ns->csi;

    return nvme_c2h(n, list, sizeof(list), req);
}

static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);

    trace_pci_nvme_identify_cmd_set();

    NVME_SET_CSI(*list, NVME_CSI_NVM);
    NVME_SET_CSI(*list, NVME_CSI_ZONED);

    return nvme_c2h(n, list, data_len, req);
}

static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;

    trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
                            c->csi);

    switch (c->cns) {
    case NVME_ID_CNS_NS:
        return nvme_identify_ns(n, req, true);
    case NVME_ID_CNS_NS_PRESENT:
        return nvme_identify_ns(n, req, false);
    case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
        return nvme_identify_ns_attached_list(n, req);
    case NVME_ID_CNS_CS_NS:
        return nvme_identify_ns_csi(n, req, true);
    case NVME_ID_CNS_CS_NS_PRESENT:
        return nvme_identify_ns_csi(n, req, false);
    case NVME_ID_CNS_CTRL:
        return nvme_identify_ctrl(n, req);
    case NVME_ID_CNS_CS_CTRL:
        return nvme_identify_ctrl_csi(n, req);
    case NVME_ID_CNS_NS_ACTIVE_LIST:
        return nvme_identify_nslist(n, req, true);
    case NVME_ID_CNS_NS_PRESENT_LIST:
        return nvme_identify_nslist(n, req, false);
    case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
        return nvme_identify_nslist_csi(n, req, true);
    case NVME_ID_CNS_CS_NS_PRESENT_LIST:
        return nvme_identify_nslist_csi(n, req, false);
    case NVME_ID_CNS_NS_DESCR_LIST:
        return nvme_identify_ns_descr_list(n, req);
    case NVME_ID_CNS_IO_COMMAND_SET:
        return nvme_identify_cmd_set(n, req);
    default:
        trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
        return NVME_INVALID_FIELD | NVME_DNR;
    }
}

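/*
 * Abort is implemented as a best-effort no-op: bit 0 of Dword 0 is set to
 * indicate that the specified command was not aborted.
 */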
4516static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4517{
4518 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4519
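    /*
     * Bit 0 of Dword 0 of the completion entry is set to 1 to indicate that
     * the command to abort was not aborted; this controller treats Abort as
     * purely advisory and never aborts anything.
     */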
4520 req->cqe.result = 1;
4521 if (nvme_check_sqid(n, sqid)) {
4522 return NVME_INVALID_FIELD | NVME_DNR;
4523 }
4524
4525 return NVME_SUCCESS;
4526}
4527
4528static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4529{
4530 trace_pci_nvme_setfeat_timestamp(ts);
4531
4532 n->host_timestamp = le64_to_cpu(ts);
4533 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4534}
4535
4536static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4537{
4538 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4539 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4540
4541 union nvme_timestamp {
4542 struct {
4543 uint64_t timestamp:48;
4544 uint64_t sync:1;
4545 uint64_t origin:3;
4546 uint64_t rsvd1:12;
4547 };
4548 uint64_t all;
4549 };
4550
4551 union nvme_timestamp ts;
4552 ts.all = 0;
4553 ts.timestamp = n->host_timestamp + elapsed_time;
4554
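    /* If the host timestamp is non-zero, set the timestamp origin */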
4556 ts.origin = n->host_timestamp ? 0x01 : 0x00;
4557
4558 trace_pci_nvme_getfeat_timestamp(ts.all);
4559
4560 return cpu_to_le64(ts.all);
4561}
4562
4563static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4564{
4565 uint64_t timestamp = nvme_get_timestamp(n);
4566
    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4568}
4569
4570static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4571{
4572 NvmeCmd *cmd = &req->cmd;
4573 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4574 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4575 uint32_t nsid = le32_to_cpu(cmd->nsid);
4576 uint32_t result;
4577 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4578 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4579 uint16_t iv;
4580 NvmeNamespace *ns;
4581 int i;
4582
4583 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4584 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4585 };
4586
4587 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4588
4589 if (!nvme_feature_support[fid]) {
4590 return NVME_INVALID_FIELD | NVME_DNR;
4591 }
4592
4593 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4594 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
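            /*
             * The Reservation Notification Mask and Reservation Persistence
             * features require a status code of Invalid Field in Command when
             * NSID is FFFFFFFFh. Since the device does not support the
             * Reservation Notification Mask and Reservation Persistence
             * features, we can always return Invalid Namespace or Format as
             * we do for all other features.
             */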
4602 return NVME_INVALID_NSID | NVME_DNR;
4603 }
4604
4605 if (!nvme_ns(n, nsid)) {
4606 return NVME_INVALID_FIELD | NVME_DNR;
4607 }
4608 }
4609
4610 switch (sel) {
4611 case NVME_GETFEAT_SELECT_CURRENT:
4612 break;
4613 case NVME_GETFEAT_SELECT_SAVED:
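        /* no features are saveable by the controller; fallthrough */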
4615 case NVME_GETFEAT_SELECT_DEFAULT:
4616 goto defaults;
4617 case NVME_GETFEAT_SELECT_CAP:
4618 result = nvme_feature_cap[fid];
4619 goto out;
4620 }
4621
4622 switch (fid) {
4623 case NVME_TEMPERATURE_THRESHOLD:
4624 result = 0;
4625
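        /*
         * The controller only implements the Composite Temperature sensor, so
         * return 0 for all other sensors.
         */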
4630 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4631 goto out;
4632 }
4633
4634 switch (NVME_TEMP_THSEL(dw11)) {
4635 case NVME_TEMP_THSEL_OVER:
4636 result = n->features.temp_thresh_hi;
4637 goto out;
4638 case NVME_TEMP_THSEL_UNDER:
4639 result = n->features.temp_thresh_low;
4640 goto out;
4641 }
4642
4643 return NVME_INVALID_FIELD | NVME_DNR;
4644 case NVME_ERROR_RECOVERY:
4645 if (!nvme_nsid_valid(n, nsid)) {
4646 return NVME_INVALID_NSID | NVME_DNR;
4647 }
4648
4649 ns = nvme_ns(n, nsid);
4650 if (unlikely(!ns)) {
4651 return NVME_INVALID_FIELD | NVME_DNR;
4652 }
4653
4654 result = ns->features.err_rec;
4655 goto out;
4656 case NVME_VOLATILE_WRITE_CACHE:
4657 result = 0;
4658 for (i = 1; i <= n->num_namespaces; i++) {
4659 ns = nvme_ns(n, i);
4660 if (!ns) {
4661 continue;
4662 }
4663
4664 result = blk_enable_write_cache(ns->blkconf.blk);
4665 if (result) {
4666 break;
4667 }
4668 }
4669 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4670 goto out;
4671 case NVME_ASYNCHRONOUS_EVENT_CONF:
4672 result = n->features.async_config;
4673 goto out;
4674 case NVME_TIMESTAMP:
4675 return nvme_get_feature_timestamp(n, req);
4676 default:
4677 break;
4678 }
4679
4680defaults:
4681 switch (fid) {
4682 case NVME_TEMPERATURE_THRESHOLD:
4683 result = 0;
4684
4685 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4686 break;
4687 }
4688
4689 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4690 result = NVME_TEMPERATURE_WARNING;
4691 }
4692
4693 break;
4694 case NVME_NUMBER_OF_QUEUES:
4695 result = (n->params.max_ioqpairs - 1) |
4696 ((n->params.max_ioqpairs - 1) << 16);
4697 trace_pci_nvme_getfeat_numq(result);
4698 break;
4699 case NVME_INTERRUPT_VECTOR_CONF:
4700 iv = dw11 & 0xffff;
4701 if (iv >= n->params.max_ioqpairs + 1) {
4702 return NVME_INVALID_FIELD | NVME_DNR;
4703 }
4704
4705 result = iv;
4706 if (iv == n->admin_cq.vector) {
4707 result |= NVME_INTVC_NOCOALESCING;
4708 }
4709 break;
4710 case NVME_COMMAND_SET_PROFILE:
4711 result = 0;
4712 break;
4713 default:
4714 result = nvme_feature_default[fid];
4715 break;
4716 }
4717
4718out:
4719 req->cqe.result = cpu_to_le32(result);
4720 return NVME_SUCCESS;
4721}
4722
4723static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4724{
4725 uint16_t ret;
4726 uint64_t timestamp;
4727
    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4729 if (ret) {
4730 return ret;
4731 }
4732
4733 nvme_set_timestamp(n, timestamp);
4734
4735 return NVME_SUCCESS;
4736}
4737
4738static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4739{
4740 NvmeNamespace *ns = NULL;
4741
4742 NvmeCmd *cmd = &req->cmd;
4743 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4744 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4745 uint32_t nsid = le32_to_cpu(cmd->nsid);
4746 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4747 uint8_t save = NVME_SETFEAT_SAVE(dw10);
4748 int i;
4749
4750 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
4751
4752 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
4753 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
4754 }
4755
4756 if (!nvme_feature_support[fid]) {
4757 return NVME_INVALID_FIELD | NVME_DNR;
4758 }
4759
4760 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4761 if (nsid != NVME_NSID_BROADCAST) {
4762 if (!nvme_nsid_valid(n, nsid)) {
4763 return NVME_INVALID_NSID | NVME_DNR;
4764 }
4765
4766 ns = nvme_ns(n, nsid);
4767 if (unlikely(!ns)) {
4768 return NVME_INVALID_FIELD | NVME_DNR;
4769 }
4770 }
4771 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
4772 if (!nvme_nsid_valid(n, nsid)) {
4773 return NVME_INVALID_NSID | NVME_DNR;
4774 }
4775
4776 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
4777 }
4778
4779 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
4780 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
4781 }
4782
4783 switch (fid) {
4784 case NVME_TEMPERATURE_THRESHOLD:
4785 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4786 break;
4787 }
4788
4789 switch (NVME_TEMP_THSEL(dw11)) {
4790 case NVME_TEMP_THSEL_OVER:
4791 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
4792 break;
4793 case NVME_TEMP_THSEL_UNDER:
4794 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
4795 break;
4796 default:
4797 return NVME_INVALID_FIELD | NVME_DNR;
4798 }
4799
4800 if ((n->temperature >= n->features.temp_thresh_hi) ||
4801 (n->temperature <= n->features.temp_thresh_low)) {
4802 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
4803 }
4804
4805 break;
4806 case NVME_ERROR_RECOVERY:
4807 if (nsid == NVME_NSID_BROADCAST) {
4808 for (i = 1; i <= n->num_namespaces; i++) {
4809 ns = nvme_ns(n, i);
4810
4811 if (!ns) {
4812 continue;
4813 }
4814
4815 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
4816 ns->features.err_rec = dw11;
4817 }
4818 }
4819
4820 break;
4821 }
4822
4823 assert(ns);
4824 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
4825 ns->features.err_rec = dw11;
4826 }
4827 break;
4828 case NVME_VOLATILE_WRITE_CACHE:
4829 for (i = 1; i <= n->num_namespaces; i++) {
4830 ns = nvme_ns(n, i);
4831 if (!ns) {
4832 continue;
4833 }
4834
4835 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
4836 blk_flush(ns->blkconf.blk);
4837 }
4838
4839 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
4840 }
4841
4842 break;
4843
4844 case NVME_NUMBER_OF_QUEUES:
4845 if (n->qs_created) {
4846 return NVME_CMD_SEQ_ERROR | NVME_DNR;
4847 }
4848
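        /*
         * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for
         * NCQR and NSQR.
         */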
4853 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
4854 return NVME_INVALID_FIELD | NVME_DNR;
4855 }
4856
4857 trace_pci_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
4858 ((dw11 >> 16) & 0xFFFF) + 1,
4859 n->params.max_ioqpairs,
4860 n->params.max_ioqpairs);
4861 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
4862 ((n->params.max_ioqpairs - 1) << 16));
4863 break;
4864 case NVME_ASYNCHRONOUS_EVENT_CONF:
4865 n->features.async_config = dw11;
4866 break;
4867 case NVME_TIMESTAMP:
4868 return nvme_set_feature_timestamp(n, req);
4869 case NVME_COMMAND_SET_PROFILE:
4870 if (dw11 & 0x1ff) {
4871 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
4872 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
4873 }
4874 break;
4875 default:
4876 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
4877 }
4878 return NVME_SUCCESS;
4879}
4880
4881static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
4882{
4883 trace_pci_nvme_aer(nvme_cid(req));
4884
4885 if (n->outstanding_aers > n->params.aerl) {
4886 trace_pci_nvme_aer_aerl_exceeded();
4887 return NVME_AER_LIMIT_EXCEEDED;
4888 }
4889
4890 n->aer_reqs[n->outstanding_aers] = req;
4891 n->outstanding_aers++;
4892
4893 if (!QTAILQ_EMPTY(&n->aer_queue)) {
4894 nvme_process_aers(n);
4895 }
4896
4897 return NVME_NO_COMPLETE;
4898}
4899
4900static void nvme_update_dmrsl(NvmeCtrl *n)
4901{
4902 int nsid;
4903
4904 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
4905 NvmeNamespace *ns = nvme_ns(n, nsid);
4906 if (!ns) {
4907 continue;
4908 }
4909
4910 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
4911 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
4912 }
4913}
4914
4915static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns);
4916static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
4917{
4918 NvmeNamespace *ns;
4919 NvmeCtrl *ctrl;
4920 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4921 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4922 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
4923 bool attach = !(dw10 & 0xf);
4924 uint16_t *nr_ids = &list[0];
4925 uint16_t *ids = &list[1];
4926 uint16_t ret;
4927 int i;
4928
4929 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
4930
4931 if (!nvme_nsid_valid(n, nsid)) {
4932 return NVME_INVALID_NSID | NVME_DNR;
4933 }
4934
4935 ns = nvme_subsys_ns(n->subsys, nsid);
4936 if (!ns) {
4937 return NVME_INVALID_FIELD | NVME_DNR;
4938 }
4939
4940 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
4941 if (ret) {
4942 return ret;
4943 }
4944
4945 if (!*nr_ids) {
4946 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4947 }
4948
4949 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
4950 for (i = 0; i < *nr_ids; i++) {
4951 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
4952 if (!ctrl) {
4953 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4954 }
4955
4956 if (attach) {
4957 if (nvme_ns(ctrl, nsid)) {
4958 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
4959 }
4960
4961 if (ns->attached && !ns->params.shared) {
4962 return NVME_NS_PRIVATE | NVME_DNR;
4963 }
4964
4965 nvme_attach_ns(ctrl, ns);
4966 __nvme_select_ns_iocs(ctrl, ns);
4967 } else {
4968 if (!nvme_ns(ctrl, nsid)) {
4969 return NVME_NS_NOT_ATTACHED | NVME_DNR;
4970 }
4971
4972 ctrl->namespaces[nsid - 1] = NULL;
4973 ns->attached--;
4974
4975 nvme_update_dmrsl(ctrl);
4976 }
4977
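        /*
         * Add namespace id to the changed namespace id list for event
         * clearing via Get Log Page command.
         */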
4982 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
4983 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
4984 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
4985 NVME_LOG_CHANGED_NSLIST);
4986 }
4987 }
4988
4989 return NVME_SUCCESS;
4990}
4991
4992static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf,
4993 uint8_t mset, uint8_t pi, uint8_t pil,
4994 NvmeRequest *req)
4995{
4996 int64_t len, offset;
4997 struct nvme_aio_format_ctx *ctx;
4998 BlockBackend *blk = ns->blkconf.blk;
4999 uint16_t ms;
5000 uintptr_t *num_formats = (uintptr_t *)&req->opaque;
5001 int *count;
5002
5003 if (ns->params.zoned) {
5004 return NVME_INVALID_FORMAT | NVME_DNR;
5005 }
5006
5007 trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil);
5008
5009 if (lbaf > ns->id_ns.nlbaf) {
5010 return NVME_INVALID_FORMAT | NVME_DNR;
5011 }
5012
5013 ms = ns->id_ns.lbaf[lbaf].ms;
5014
5015 if (pi && (ms < sizeof(NvmeDifTuple))) {
5016 return NVME_INVALID_FORMAT | NVME_DNR;
5017 }
5018
5019 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5020 return NVME_INVALID_FIELD | NVME_DNR;
5021 }
5022
5023 nvme_ns_drain(ns);
5024 nvme_ns_shutdown(ns);
5025 nvme_ns_cleanup(ns);
5026
5027 ns->id_ns.dps = (pil << 3) | pi;
5028 ns->id_ns.flbas = lbaf | (mset << 4);
5029
5030 nvme_ns_init_format(ns);
5031
5032 ns->status = NVME_FORMAT_IN_PROGRESS;
5033
5034 len = ns->size;
5035 offset = 0;
5036
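    /*
     * count is 1-initialized to hold a reference for this function scope;
     * each blk_aio_pwrite_zeroes submitted below takes another reference.
     * The scope reference is dropped after the loop, so the format only
     * completes here if no zeroing AIOs remain in flight.
     */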
5037 count = g_new(int, 1);
5038 *count = 1;
5039
5040 (*num_formats)++;
5041
5042 while (len) {
5043 ctx = g_new(struct nvme_aio_format_ctx, 1);
5044 ctx->req = req;
5045 ctx->ns = ns;
5046 ctx->count = count;
5047
5048 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
5049
5050 (*count)++;
5051
5052 blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP,
5053 nvme_aio_format_cb, ctx);
5054
5055 offset += bytes;
5056 len -= bytes;
    }
5059
5060 if (--(*count)) {
5061 return NVME_NO_COMPLETE;
5062 }
5063
5064 g_free(count);
5065 ns->status = 0x0;
5066 (*num_formats)--;
5067
5068 return NVME_SUCCESS;
5069}
5070
5071static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5072{
5073 NvmeNamespace *ns;
5074 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5075 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5076 uint8_t lbaf = dw10 & 0xf;
5077 uint8_t mset = (dw10 >> 4) & 0x1;
5078 uint8_t pi = (dw10 >> 5) & 0x7;
5079 uint8_t pil = (dw10 >> 8) & 0x1;
5080 uintptr_t *num_formats = (uintptr_t *)&req->opaque;
5081 uint16_t status;
5082 int i;
5083
5084 trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil);
5085
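    /* 1-initialize; see the reference counting comment in nvme_format_ns */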
5087 *num_formats = 1;
5088
5089 if (nsid != NVME_NSID_BROADCAST) {
5090 if (!nvme_nsid_valid(n, nsid)) {
5091 return NVME_INVALID_NSID | NVME_DNR;
5092 }
5093
5094 ns = nvme_ns(n, nsid);
5095 if (!ns) {
5096 return NVME_INVALID_FIELD | NVME_DNR;
5097 }
5098
5099 status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
5100 if (status && status != NVME_NO_COMPLETE) {
5101 req->status = status;
5102 }
5103 } else {
5104 for (i = 1; i <= n->num_namespaces; i++) {
5105 ns = nvme_ns(n, i);
5106 if (!ns) {
5107 continue;
5108 }
5109
5110 status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
5111 if (status && status != NVME_NO_COMPLETE) {
5112 req->status = status;
5113 break;
5114 }
5115 }
5116 }
5117
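    /* account for the 1-initialization */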
5119 if (--(*num_formats)) {
5120 return NVME_NO_COMPLETE;
5121 }
5122
5123 return req->status;
5124}
5125
5126static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5127{
5128 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5129 nvme_adm_opc_str(req->cmd.opcode));
5130
5131 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5132 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5133 return NVME_INVALID_OPCODE | NVME_DNR;
5134 }
5135
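    /* SGLs shall not be used for Admin commands in NVMe over PCIe */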
5137 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5138 return NVME_INVALID_FIELD | NVME_DNR;
5139 }
5140
5141 switch (req->cmd.opcode) {
5142 case NVME_ADM_CMD_DELETE_SQ:
5143 return nvme_del_sq(n, req);
5144 case NVME_ADM_CMD_CREATE_SQ:
5145 return nvme_create_sq(n, req);
5146 case NVME_ADM_CMD_GET_LOG_PAGE:
5147 return nvme_get_log(n, req);
5148 case NVME_ADM_CMD_DELETE_CQ:
5149 return nvme_del_cq(n, req);
5150 case NVME_ADM_CMD_CREATE_CQ:
5151 return nvme_create_cq(n, req);
5152 case NVME_ADM_CMD_IDENTIFY:
5153 return nvme_identify(n, req);
5154 case NVME_ADM_CMD_ABORT:
5155 return nvme_abort(n, req);
5156 case NVME_ADM_CMD_SET_FEATURES:
5157 return nvme_set_feature(n, req);
5158 case NVME_ADM_CMD_GET_FEATURES:
5159 return nvme_get_feature(n, req);
5160 case NVME_ADM_CMD_ASYNC_EV_REQ:
5161 return nvme_aer(n, req);
5162 case NVME_ADM_CMD_NS_ATTACHMENT:
5163 return nvme_ns_attachment(n, req);
5164 case NVME_ADM_CMD_FORMAT_NVM:
5165 return nvme_format(n, req);
5166 default:
5167 assert(false);
5168 }
5169
5170 return NVME_INVALID_OPCODE | NVME_DNR;
5171}
5172
5173static void nvme_process_sq(void *opaque)
5174{
5175 NvmeSQueue *sq = opaque;
5176 NvmeCtrl *n = sq->ctrl;
5177 NvmeCQueue *cq = n->cq[sq->cqid];
5178
5179 uint16_t status;
5180 hwaddr addr;
5181 NvmeCmd cmd;
5182 NvmeRequest *req;
5183
5184 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5185 addr = sq->dma_addr + sq->head * n->sqe_size;
5186 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5187 trace_pci_nvme_err_addr_read(addr);
5188 trace_pci_nvme_err_cfs();
5189 n->bar.csts = NVME_CSTS_FAILED;
5190 break;
5191 }
5192 nvme_inc_sq_head(sq);
5193
5194 req = QTAILQ_FIRST(&sq->req_list);
5195 QTAILQ_REMOVE(&sq->req_list, req, entry);
5196 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5197 nvme_req_clear(req);
5198 req->cqe.cid = cmd.cid;
5199 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5200
5201 status = sq->sqid ? nvme_io_cmd(n, req) :
5202 nvme_admin_cmd(n, req);
5203 if (status != NVME_NO_COMPLETE) {
5204 req->status = status;
5205 nvme_enqueue_req_completion(cq, req);
5206 }
5207 }
5208}
5209
5210static void nvme_ctrl_reset(NvmeCtrl *n)
5211{
5212 NvmeNamespace *ns;
5213 int i;
5214
5215 for (i = 1; i <= n->num_namespaces; i++) {
5216 ns = nvme_ns(n, i);
5217 if (!ns) {
5218 continue;
5219 }
5220
5221 nvme_ns_drain(ns);
5222 }
5223
5224 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5225 if (n->sq[i] != NULL) {
5226 nvme_free_sq(n->sq[i], n);
5227 }
5228 }
5229 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5230 if (n->cq[i] != NULL) {
5231 nvme_free_cq(n->cq[i], n);
5232 }
5233 }
5234
5235 while (!QTAILQ_EMPTY(&n->aer_queue)) {
5236 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5237 QTAILQ_REMOVE(&n->aer_queue, event, entry);
5238 g_free(event);
5239 }
5240
5241 n->aer_queued = 0;
5242 n->outstanding_aers = 0;
5243 n->qs_created = false;
5244
5245 n->bar.cc = 0;
5246}
5247
5248static void nvme_ctrl_shutdown(NvmeCtrl *n)
5249{
5250 NvmeNamespace *ns;
5251 int i;
5252
5253 if (n->pmr.dev) {
5254 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5255 }
5256
5257 for (i = 1; i <= n->num_namespaces; i++) {
5258 ns = nvme_ns(n, i);
5259 if (!ns) {
5260 continue;
5261 }
5262
5263 nvme_ns_shutdown(ns);
5264 }
5265}
5266
5267static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns)
5268{
5269 ns->iocs = nvme_cse_iocs_none;
5270 switch (ns->csi) {
5271 case NVME_CSI_NVM:
5272 if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
5273 ns->iocs = nvme_cse_iocs_nvm;
5274 }
5275 break;
5276 case NVME_CSI_ZONED:
5277 if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
5278 ns->iocs = nvme_cse_iocs_zoned;
5279 } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
5280 ns->iocs = nvme_cse_iocs_nvm;
5281 }
5282 break;
5283 }
5284}
5285
5286static void nvme_select_ns_iocs(NvmeCtrl *n)
5287{
5288 NvmeNamespace *ns;
5289 int i;
5290
5291 for (i = 1; i <= n->num_namespaces; i++) {
5292 ns = nvme_ns(n, i);
5293 if (!ns) {
5294 continue;
5295 }
5296
5297 __nvme_select_ns_iocs(n, ns);
5298 }
5299}
5300
5301static int nvme_start_ctrl(NvmeCtrl *n)
5302{
5303 uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
5304 uint32_t page_size = 1 << page_bits;
5305
5306 if (unlikely(n->cq[0])) {
5307 trace_pci_nvme_err_startfail_cq();
5308 return -1;
5309 }
5310 if (unlikely(n->sq[0])) {
5311 trace_pci_nvme_err_startfail_sq();
5312 return -1;
5313 }
5314 if (unlikely(!n->bar.asq)) {
5315 trace_pci_nvme_err_startfail_nbarasq();
5316 return -1;
5317 }
5318 if (unlikely(!n->bar.acq)) {
5319 trace_pci_nvme_err_startfail_nbaracq();
5320 return -1;
5321 }
5322 if (unlikely(n->bar.asq & (page_size - 1))) {
5323 trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq);
5324 return -1;
5325 }
5326 if (unlikely(n->bar.acq & (page_size - 1))) {
5327 trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq);
5328 return -1;
5329 }
5330 if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) {
5331 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc));
5332 return -1;
5333 }
5334 if (unlikely(NVME_CC_MPS(n->bar.cc) <
5335 NVME_CAP_MPSMIN(n->bar.cap))) {
5336 trace_pci_nvme_err_startfail_page_too_small(
5337 NVME_CC_MPS(n->bar.cc),
5338 NVME_CAP_MPSMIN(n->bar.cap));
5339 return -1;
5340 }
5341 if (unlikely(NVME_CC_MPS(n->bar.cc) >
5342 NVME_CAP_MPSMAX(n->bar.cap))) {
5343 trace_pci_nvme_err_startfail_page_too_large(
5344 NVME_CC_MPS(n->bar.cc),
5345 NVME_CAP_MPSMAX(n->bar.cap));
5346 return -1;
5347 }
5348 if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
5349 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5350 trace_pci_nvme_err_startfail_cqent_too_small(
5351 NVME_CC_IOCQES(n->bar.cc),
                    NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
5353 return -1;
5354 }
5355 if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
5356 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5357 trace_pci_nvme_err_startfail_cqent_too_large(
5358 NVME_CC_IOCQES(n->bar.cc),
                    NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
5360 return -1;
5361 }
5362 if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
5363 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5364 trace_pci_nvme_err_startfail_sqent_too_small(
5365 NVME_CC_IOSQES(n->bar.cc),
                    NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
5367 return -1;
5368 }
5369 if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
5370 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5371 trace_pci_nvme_err_startfail_sqent_too_large(
5372 NVME_CC_IOSQES(n->bar.cc),
                    NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
5374 return -1;
5375 }
5376 if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
5377 trace_pci_nvme_err_startfail_asqent_sz_zero();
5378 return -1;
5379 }
5380 if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
5381 trace_pci_nvme_err_startfail_acqent_sz_zero();
5382 return -1;
5383 }
5384
5385 n->page_bits = page_bits;
5386 n->page_size = page_size;
5387 n->max_prp_ents = n->page_size / sizeof(uint64_t);
5388 n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
5389 n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
5390 nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
5391 NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
5392 nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
5393 NVME_AQA_ASQS(n->bar.aqa) + 1);
5394
5395 nvme_set_timestamp(n, 0ULL);
5396
5397 QTAILQ_INIT(&n->aer_queue);
5398
5399 nvme_select_ns_iocs(n);
5400
5401 return 0;
5402}
5403
5404static void nvme_cmb_enable_regs(NvmeCtrl *n)
5405{
5406 NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
5407 NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
5408 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
5409
5410 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
5411 NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
5412 NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
5413 NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
5414 NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
    NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
5416 NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
5417}
5418
5419static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5420 unsigned size)
5421{
5422 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5423 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5424 "MMIO write not 32-bit aligned,"
5425 " offset=0x%"PRIx64"", offset);
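        /* should be ignored, fall through for now */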
5427 }
5428
5429 if (unlikely(size < sizeof(uint32_t))) {
5430 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5431 "MMIO write smaller than 32-bits,"
5432 " offset=0x%"PRIx64", size=%u",
5433 offset, size);
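        /* should be ignored, fall through for now */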
5435 }
5436
5437 switch (offset) {
    case 0xc:   /* INTMS */
5439 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5440 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5441 "undefined access to interrupt mask set"
5442 " when MSI-X is enabled");
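            /* should be ignored, fall through for now */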
5444 }
5445 n->bar.intms |= data & 0xffffffff;
5446 n->bar.intmc = n->bar.intms;
5447 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc);
5448 nvme_irq_check(n);
5449 break;
    case 0x10:  /* INTMC */
5451 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5452 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5453 "undefined access to interrupt mask clr"
5454 " when MSI-X is enabled");
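            /* should be ignored, fall through for now */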
5456 }
5457 n->bar.intms &= ~(data & 0xffffffff);
5458 n->bar.intmc = n->bar.intms;
5459 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc);
5460 nvme_irq_check(n);
5461 break;
    case 0x14:  /* CC */
5463 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
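        /* Windows first sends data, then sends enable bit */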
5465 if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
5466 !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
5467 {
5468 n->bar.cc = data;
5469 }
5470
5471 if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
5472 n->bar.cc = data;
5473 if (unlikely(nvme_start_ctrl(n))) {
5474 trace_pci_nvme_err_startfail();
5475 n->bar.csts = NVME_CSTS_FAILED;
5476 } else {
5477 trace_pci_nvme_mmio_start_success();
5478 n->bar.csts = NVME_CSTS_READY;
5479 }
5480 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
5481 trace_pci_nvme_mmio_stopped();
5482 nvme_ctrl_reset(n);
5483 n->bar.csts &= ~NVME_CSTS_READY;
5484 }
5485 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
5486 trace_pci_nvme_mmio_shutdown_set();
5487 nvme_ctrl_shutdown(n);
5488 n->bar.cc = data;
5489 n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
5490 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
5491 trace_pci_nvme_mmio_shutdown_cleared();
5492 n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
5493 n->bar.cc = data;
5494 }
5495 break;
    case 0x1C:  /* CSTS */
5497 if (data & (1 << 4)) {
5498 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5499 "attempted to W1C CSTS.NSSRO"
5500 " but CAP.NSSRS is zero (not supported)");
5501 } else if (data != 0) {
5502 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5503 "attempted to set a read only bit"
5504 " of controller status");
5505 }
5506 break;
    case 0x20:  /* NSSR */
5508 if (data == 0x4E564D65) {
5509 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5510 } else {
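            /* The spec says that writes of other values have no effect */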
5512 return;
5513 }
5514 break;
    case 0x24:  /* AQA */
5516 n->bar.aqa = data & 0xffffffff;
5517 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5518 break;
    case 0x28:  /* ASQ */
5520 n->bar.asq = size == 8 ? data :
5521 (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff);
5522 trace_pci_nvme_mmio_asqaddr(data);
5523 break;
    case 0x2c:  /* ASQ hi */
5525 n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
5526 trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
5527 break;
    case 0x30:  /* ACQ */
5529 trace_pci_nvme_mmio_acqaddr(data);
5530 n->bar.acq = size == 8 ? data :
5531 (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff);
5532 break;
    case 0x34:  /* ACQ hi */
5534 n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
5535 trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
5536 break;
    case 0x38:  /* CMBLOC */
5538 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5539 "invalid write to reserved CMBLOC"
5540 " when CMBSZ is zero, ignored");
5541 return;
    case 0x3C:  /* CMBSZ */
5543 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5544 "invalid write to read only CMBSZ, ignored");
5545 return;
    case 0x50:  /* CMBMSC */
5547 if (!NVME_CAP_CMBS(n->bar.cap)) {
5548 return;
5549 }
5550
5551 n->bar.cmbmsc = size == 8 ? data :
5552 (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff);
5553 n->cmb.cmse = false;
5554
5555 if (NVME_CMBMSC_CRE(data)) {
5556 nvme_cmb_enable_regs(n);
5557
5558 if (NVME_CMBMSC_CMSE(data)) {
5559 hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT;
5560 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5561 NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1);
5562 return;
5563 }
5564
5565 n->cmb.cba = cba;
5566 n->cmb.cmse = true;
5567 }
5568 } else {
5569 n->bar.cmbsz = 0;
5570 n->bar.cmbloc = 0;
5571 }
5572
5573 return;
    case 0x54:  /* CMBMSC hi */
5575 n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32);
5576 return;
5577
    case 0xE00: /* PMRCAP */
5579 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5580 "invalid write to PMRCAP register, ignored");
5581 return;
    case 0xE04: /* PMRCTL */
5583 n->bar.pmrctl = data;
5584 if (NVME_PMRCTL_EN(data)) {
5585 memory_region_set_enabled(&n->pmr.dev->mr, true);
5586 n->bar.pmrsts = 0;
5587 } else {
5588 memory_region_set_enabled(&n->pmr.dev->mr, false);
5589 NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
5590 n->pmr.cmse = false;
5591 }
5592 return;
    case 0xE08: /* PMRSTS */
5594 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5595 "invalid write to PMRSTS register, ignored");
5596 return;
    case 0xE0C: /* PMREBS */
5598 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5599 "invalid write to PMREBS register, ignored");
5600 return;
    case 0xE10: /* PMRSWTP */
5602 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5603 "invalid write to PMRSWTP register, ignored");
5604 return;
    case 0xE14: /* PMRMSC lower */
5606 if (!NVME_CAP_PMRS(n->bar.cap)) {
5607 return;
5608 }
5609
5610 n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff);
5611 n->pmr.cmse = false;
5612
5613 if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) {
5614 hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT;
5615 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5616 NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1);
5617 return;
5618 }
5619
5620 n->pmr.cmse = true;
5621 n->pmr.cba = cba;
5622 }
5623
5624 return;
    case 0xE18: /* PMRMSC upper */
5626 if (!NVME_CAP_PMRS(n->bar.cap)) {
5627 return;
5628 }
5629
5630 n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32);
5631 return;
5632 default:
5633 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5634 "invalid MMIO write,"
5635 " offset=0x%"PRIx64", data=%"PRIx64"",
5636 offset, data);
5637 break;
5638 }
5639}
5640
5641static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5642{
5643 NvmeCtrl *n = (NvmeCtrl *)opaque;
5644 uint8_t *ptr = (uint8_t *)&n->bar;
5645 uint64_t val = 0;
5646
5647 trace_pci_nvme_mmio_read(addr, size);
5648
5649 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5650 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5651 "MMIO read not 32-bit aligned,"
5652 " offset=0x%"PRIx64"", addr);
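        /* should RAZ, fall through for now */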
5654 } else if (unlikely(size < sizeof(uint32_t))) {
5655 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
5656 "MMIO read smaller than 32-bits,"
5657 " offset=0x%"PRIx64"", addr);
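        /* should RAZ, fall through for now */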
5659 }
5660
5661 if (addr < sizeof(n->bar)) {
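        /*
         * When PMRWBM bit 1 is set, a read of PMRSTS ensures that prior
         * writes to the PMR have made it to persistent media.
         */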
5667 if (addr == 0xE08 &&
5668 (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
5669 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5670 }
5671 memcpy(&val, ptr + addr, size);
5672 } else {
5673 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
5674 "MMIO read beyond last register,"
5675 " offset=0x%"PRIx64", returning 0", addr);
5676 }
5677
5678 return val;
5679}
5680
5681static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
5682{
5683 uint32_t qid;
5684
5685 if (unlikely(addr & ((1 << 2) - 1))) {
5686 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
5687 "doorbell write not 32-bit aligned,"
5688 " offset=0x%"PRIx64", ignoring", addr);
5689 return;
5690 }
5691
5692 if (((addr - 0x1000) >> 2) & 1) {
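        /* Completion queue doorbell write */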
5695 uint16_t new_head = val & 0xffff;
5696 int start_sqs;
5697 NvmeCQueue *cq;
5698
5699 qid = (addr - (0x1000 + (1 << 2))) >> 3;
5700 if (unlikely(nvme_check_cqid(n, qid))) {
5701 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
5702 "completion queue doorbell write"
5703 " for nonexistent queue,"
5704 " sqid=%"PRIu32", ignoring", qid);
5705
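            /*
             * NVM Express v1.3d, Section 4.1 states: "If host software writes
             * an invalid value to the Submission Queue Tail Doorbell or
             * Completion Queue Head Doorbell register and an Asynchronous
             * Event Request command is outstanding, then an asynchronous
             * event is posted to the Admin Completion Queue with a status
             * code of Invalid Doorbell Write Value."
             */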
5719 if (n->outstanding_aers) {
5720 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5721 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
5722 NVME_LOG_ERROR_INFO);
5723 }
5724
5725 return;
5726 }
5727
5728 cq = n->cq[qid];
5729 if (unlikely(new_head >= cq->size)) {
5730 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
5731 "completion queue doorbell write value"
5732 " beyond queue size, sqid=%"PRIu32","
5733 " new_head=%"PRIu16", ignoring",
5734 qid, new_head);
5735
5736 if (n->outstanding_aers) {
5737 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5738 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
5739 NVME_LOG_ERROR_INFO);
5740 }
5741
5742 return;
5743 }
5744
5745 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
5746
5747 start_sqs = nvme_cq_full(cq) ? 1 : 0;
5748 cq->head = new_head;
5749 if (start_sqs) {
5750 NvmeSQueue *sq;
5751 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
5752 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5753 }
5754 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5755 }
5756
5757 if (cq->tail == cq->head) {
5758 nvme_irq_deassert(n, cq);
5759 }
5760 } else {
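        /* Submission queue doorbell write */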
5763 uint16_t new_tail = val & 0xffff;
5764 NvmeSQueue *sq;
5765
5766 qid = (addr - 0x1000) >> 3;
5767 if (unlikely(nvme_check_sqid(n, qid))) {
5768 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
5769 "submission queue doorbell write"
5770 " for nonexistent queue,"
5771 " sqid=%"PRIu32", ignoring", qid);
5772
5773 if (n->outstanding_aers) {
5774 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5775 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
5776 NVME_LOG_ERROR_INFO);
5777 }
5778
5779 return;
5780 }
5781
5782 sq = n->sq[qid];
5783 if (unlikely(new_tail >= sq->size)) {
5784 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
5785 "submission queue doorbell write value"
5786 " beyond queue size, sqid=%"PRIu32","
5787 " new_tail=%"PRIu16", ignoring",
5788 qid, new_tail);
5789
5790 if (n->outstanding_aers) {
5791 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5792 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
5793 NVME_LOG_ERROR_INFO);
5794 }
5795
5796 return;
5797 }
5798
5799 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
5800
5801 sq->tail = new_tail;
5802 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5803 }
5804}
5805
5806static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
5807 unsigned size)
5808{
5809 NvmeCtrl *n = (NvmeCtrl *)opaque;
5810
5811 trace_pci_nvme_mmio_write(addr, data, size);
5812
5813 if (addr < sizeof(n->bar)) {
5814 nvme_write_bar(n, addr, data, size);
5815 } else {
5816 nvme_process_db(n, addr, data);
5817 }
5818}
5819
5820static const MemoryRegionOps nvme_mmio_ops = {
5821 .read = nvme_mmio_read,
5822 .write = nvme_mmio_write,
5823 .endianness = DEVICE_LITTLE_ENDIAN,
5824 .impl = {
5825 .min_access_size = 2,
5826 .max_access_size = 8,
5827 },
5828};
5829
5830static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
5831 unsigned size)
5832{
5833 NvmeCtrl *n = (NvmeCtrl *)opaque;
5834 stn_le_p(&n->cmb.buf[addr], size, data);
5835}
5836
5837static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
5838{
5839 NvmeCtrl *n = (NvmeCtrl *)opaque;
5840 return ldn_le_p(&n->cmb.buf[addr], size);
5841}
5842
5843static const MemoryRegionOps nvme_cmb_ops = {
5844 .read = nvme_cmb_read,
5845 .write = nvme_cmb_write,
5846 .endianness = DEVICE_LITTLE_ENDIAN,
5847 .impl = {
5848 .min_access_size = 1,
5849 .max_access_size = 8,
5850 },
5851};
5852
5853static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
5854{
5855 NvmeParams *params = &n->params;
5856
5857 if (params->num_queues) {
5858 warn_report("num_queues is deprecated; please use max_ioqpairs "
5859 "instead");
5860
5861 params->max_ioqpairs = params->num_queues - 1;
5862 }
5863
5864 if (n->namespace.blkconf.blk && n->subsys) {
5865 error_setg(errp, "subsystem support is unavailable with legacy "
5866 "namespace ('drive' property)");
5867 return;
5868 }
5869
5870 if (params->max_ioqpairs < 1 ||
5871 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
5872 error_setg(errp, "max_ioqpairs must be between 1 and %d",
5873 NVME_MAX_IOQPAIRS);
5874 return;
5875 }
5876
5877 if (params->msix_qsize < 1 ||
5878 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
5879 error_setg(errp, "msix_qsize must be between 1 and %d",
5880 PCI_MSIX_FLAGS_QSIZE + 1);
5881 return;
5882 }
5883
5884 if (!params->serial) {
5885 error_setg(errp, "serial property not set");
5886 return;
5887 }
5888
5889 if (n->pmr.dev) {
5890 if (host_memory_backend_is_mapped(n->pmr.dev)) {
5891 error_setg(errp, "can't use already busy memdev: %s",
5892 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
5893 return;
5894 }
5895
5896 if (!is_power_of_2(n->pmr.dev->size)) {
            error_setg(errp, "pmr backend size needs to be a power of 2");
5898 return;
5899 }
5900
5901 host_memory_backend_set_mapped(n->pmr.dev, true);
5902 }
5903
5904 if (n->params.zasl > n->params.mdts) {
5905 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
5906 "than or equal to mdts (Maximum Data Transfer Size)");
5907 return;
5908 }
5909
5910 if (!n->params.vsl) {
5911 error_setg(errp, "vsl must be non-zero");
5912 return;
5913 }
5914}
5915
5916static void nvme_init_state(NvmeCtrl *n)
5917{
5918 n->num_namespaces = NVME_MAX_NAMESPACES;
5919
5920 n->reg_size = pow2ceil(sizeof(NvmeBar) +
5921 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
5922 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
5923 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
5924 n->temperature = NVME_TEMPERATURE;
5925 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
5926 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5927 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
5928}
5929
5930static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
5931{
5932 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
5933
5934 n->cmb.buf = g_malloc0(cmb_size);
5935 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
5936 "nvme-cmb", cmb_size);
5937 pci_register_bar(pci_dev, NVME_CMB_BIR,
5938 PCI_BASE_ADDRESS_SPACE_MEMORY |
5939 PCI_BASE_ADDRESS_MEM_TYPE_64 |
5940 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
5941
5942 NVME_CAP_SET_CMBS(n->bar.cap, 1);
5943
5944 if (n->params.legacy_cmb) {
5945 nvme_cmb_enable_regs(n);
5946 n->cmb.cmse = true;
5947 }
5948}
5949
5950static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
5951{
5952 NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1);
5953 NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1);
5954 NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
5955
5956 NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
5957 NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1);
5958
5959 pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
5960 PCI_BASE_ADDRESS_SPACE_MEMORY |
5961 PCI_BASE_ADDRESS_MEM_TYPE_64 |
5962 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
5963
5964 memory_region_set_enabled(&n->pmr.dev->mr, false);
5965}
5966
5967static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
5968{
5969 uint8_t *pci_conf = pci_dev->config;
5970 uint64_t bar_size, msix_table_size, msix_pba_size;
5971 unsigned msix_table_offset, msix_pba_offset;
5972 int ret;
5973
5974 Error *err = NULL;
5975
5976 pci_conf[PCI_INTERRUPT_PIN] = 1;
5977 pci_config_set_prog_interface(pci_conf, 0x2);
5978
5979 if (n->params.use_intel_id) {
5980 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
5981 pci_config_set_device_id(pci_conf, 0x5845);
5982 } else {
5983 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
5984 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
5985 }
5986
5987 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
5988 pcie_endpoint_cap_init(pci_dev, 0x80);
5989
5990 bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
5991 msix_table_offset = bar_size;
5992 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
5993
5994 bar_size += msix_table_size;
5995 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
5996 msix_pba_offset = bar_size;
5997 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
5998
5999 bar_size += msix_pba_size;
6000 bar_size = pow2ceil(bar_size);
6001
6002 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6003 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6004 n->reg_size);
6005 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6006
6007 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6008 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6009 ret = msix_init(pci_dev, n->params.msix_qsize,
6010 &n->bar0, 0, msix_table_offset,
6011 &n->bar0, 0, msix_pba_offset, 0, &err);
6012 if (ret < 0) {
6013 if (ret == -ENOTSUP) {
6014 warn_report_err(err);
6015 } else {
6016 error_propagate(errp, err);
6017 return ret;
6018 }
6019 }
6020
6021 if (n->params.cmb_size_mb) {
6022 nvme_init_cmb(n, pci_dev);
6023 }
6024
6025 if (n->pmr.dev) {
6026 nvme_init_pmr(n, pci_dev);
6027 }
6028
6029 return 0;
6030}
6031
6032static void nvme_init_subnqn(NvmeCtrl *n)
6033{
6034 NvmeSubsystem *subsys = n->subsys;
6035 NvmeIdCtrl *id = &n->id_ctrl;
6036
6037 if (!subsys) {
6038 snprintf((char *)id->subnqn, sizeof(id->subnqn),
6039 "nqn.2019-08.org.qemu:%s", n->params.serial);
6040 } else {
6041 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6042 }
6043}
6044
6045static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6046{
6047 NvmeIdCtrl *id = &n->id_ctrl;
6048 uint8_t *pci_conf = pci_dev->config;
6049
6050 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6051 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6052 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6053 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6054 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6055
6056 id->cntlid = cpu_to_le16(n->cntlid);
6057
6058 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6059
6060 id->rab = 6;
6061
6062 if (n->params.use_intel_id) {
6063 id->ieee[0] = 0xb3;
6064 id->ieee[1] = 0x02;
6065 id->ieee[2] = 0x00;
6066 } else {
6067 id->ieee[0] = 0x00;
6068 id->ieee[1] = 0x54;
6069 id->ieee[2] = 0x52;
6070 }
6071
6072 id->mdts = n->params.mdts;
6073 id->ver = cpu_to_le32(NVME_SPEC_VER);
6074 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
    id->cntrltype = 0x1; /* I/O controller */
6076
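    /*
     * Because the controller always completes the Abort command immediately,
     * there can never be more than one concurrently executing Abort command,
     * so this value is never used for anything. Note that Abort is purely
     * advisory here (see nvme_abort), so ACL is informational only.
     */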
6088 id->acl = 3;
6089 id->aerl = n->params.aerl;
6090 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6091 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6092
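    /* recommended default value (~70 C) */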
6094 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6095 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6096
    id->sqes = (0x6 << 4) | 0x6; /* 64 byte SQ entries, min == max == 2^6 */
    id->cqes = (0x4 << 4) | 0x4; /* 16 byte CQ entries, min == max == 2^4 */
6099 id->nn = cpu_to_le32(n->num_namespaces);
6100 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6101 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6102 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6103
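    /*
     * NOTE: If this device ever supports a command set that does NOT use 0x0
     * as a Flush-equivalent operation, support for the broadcast NSID in
     * Flush should probably be removed.
     */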
6111 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6112
6113 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6114 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6115 NVME_CTRL_SGLS_BITBUCKET);
6116
6117 nvme_init_subnqn(n);
6118
    id->psd[0].mp = cpu_to_le16(0x9c4); /* 25 W, in units of 0.01 W */
6120 id->psd[0].enlat = cpu_to_le32(0x10);
6121 id->psd[0].exlat = cpu_to_le32(0x4);
6122
6123 if (n->subsys) {
6124 id->cmic |= NVME_CMIC_MULTI_CTRL;
6125 }
6126
6127 NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
6128 NVME_CAP_SET_CQR(n->bar.cap, 1);
6129 NVME_CAP_SET_TO(n->bar.cap, 0xf);
6130 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM);
6131 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
6132 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
6133 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
6134 NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
6135 NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0);
6136
6137 n->bar.vs = NVME_SPEC_VER;
6138 n->bar.intmc = n->bar.intms = 0;
6139}
6140
6141static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6142{
6143 int cntlid;
6144
6145 if (!n->subsys) {
6146 return 0;
6147 }
6148
6149 cntlid = nvme_subsys_register_ctrl(n, errp);
6150 if (cntlid < 0) {
6151 return -1;
6152 }
6153
6154 n->cntlid = cntlid;
6155
6156 return 0;
6157}
6158
6159void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6160{
6161 uint32_t nsid = ns->params.nsid;
6162 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6163
6164 n->namespaces[nsid - 1] = ns;
6165 ns->attached++;
6166
6167 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6168 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6169}
6170
6171static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6172{
6173 NvmeCtrl *n = NVME(pci_dev);
6174 NvmeNamespace *ns;
6175 Error *local_err = NULL;
6176
6177 nvme_check_constraints(n, &local_err);
6178 if (local_err) {
6179 error_propagate(errp, local_err);
6180 return;
6181 }
6182
6183 qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6184 &pci_dev->qdev, n->parent_obj.qdev.id);
6185
6186 nvme_init_state(n);
6187 if (nvme_init_pci(n, pci_dev, errp)) {
6188 return;
6189 }
6190
6191 if (nvme_init_subsys(n, errp)) {
6193 return;
6194 }
6195 nvme_init_ctrl(n, pci_dev);
6196
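    /* setup a namespace if the controller drive property was given */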
6198 if (n->namespace.blkconf.blk) {
6199 ns = &n->namespace;
6200 ns->params.nsid = 1;
6201
6202 if (nvme_ns_setup(n, ns, errp)) {
6203 return;
6204 }
6205
6206 nvme_attach_ns(n, ns);
6207 }
6208}
6209
6210static void nvme_exit(PCIDevice *pci_dev)
6211{
6212 NvmeCtrl *n = NVME(pci_dev);
6213 NvmeNamespace *ns;
6214 int i;
6215
6216 nvme_ctrl_reset(n);
6217
6218 for (i = 1; i <= n->num_namespaces; i++) {
6219 ns = nvme_ns(n, i);
6220 if (!ns) {
6221 continue;
6222 }
6223
6224 nvme_ns_cleanup(ns);
6225 }
6226
6227 g_free(n->cq);
6228 g_free(n->sq);
6229 g_free(n->aer_reqs);
6230
6231 if (n->params.cmb_size_mb) {
6232 g_free(n->cmb.buf);
6233 }
6234
6235 if (n->pmr.dev) {
6236 host_memory_backend_set_mapped(n->pmr.dev, false);
6237 }
6238 msix_uninit(pci_dev, &n->bar0, &n->bar0);
6239 memory_region_del_subregion(&n->bar0, &n->iomem);
6240}
6241
6242static Property nvme_props[] = {
6243 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6244 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6245 HostMemoryBackend *),
6246 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6247 NvmeSubsystem *),
6248 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6249 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6250 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6251 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6252 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6253 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6254 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6255 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6256 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6257 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6258 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6259 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6260 DEFINE_PROP_END_OF_LIST(),
6261};
6262
6263static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6264 void *opaque, Error **errp)
6265{
6266 NvmeCtrl *n = NVME(obj);
6267 uint8_t value = n->smart_critical_warning;
6268
6269 visit_type_uint8(v, name, &value, errp);
6270}
6271
6272static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6273 void *opaque, Error **errp)
6274{
6275 NvmeCtrl *n = NVME(obj);
6276 uint8_t value, old_value, cap = 0, index, event;
6277
6278 if (!visit_type_uint8(v, name, &value, errp)) {
6279 return;
6280 }
6281
6282 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6283 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6284 if (NVME_CAP_PMRS(n->bar.cap)) {
6285 cap |= NVME_SMART_PMR_UNRELIABLE;
6286 }
6287
6288 if ((value & cap) != value) {
6289 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6290 value & ~cap);
6291 return;
6292 }
6293
6294 old_value = n->smart_critical_warning;
6295 n->smart_critical_warning = value;
6296
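    /* only inject new bits of smart critical warning */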
    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
        event = 1 << index;
        if (value & ~old_value & event) {
            nvme_smart_event(n, event);
        }
    }
6303}
6304
6305static const VMStateDescription nvme_vmstate = {
6306 .name = "nvme",
6307 .unmigratable = 1,
6308};
6309
6310static void nvme_class_init(ObjectClass *oc, void *data)
6311{
6312 DeviceClass *dc = DEVICE_CLASS(oc);
6313 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6314
6315 pc->realize = nvme_realize;
6316 pc->exit = nvme_exit;
6317 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6318 pc->revision = 2;
6319
6320 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6321 dc->desc = "Non-Volatile Memory Express";
6322 device_class_set_props(dc, nvme_props);
6323 dc->vmsd = &nvme_vmstate;
6324}
6325
6326static void nvme_instance_init(Object *obj)
6327{
6328 NvmeCtrl *n = NVME(obj);
6329
6330 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6331 "bootindex", "/namespace@1,0",
6332 DEVICE(obj));
6333
6334 object_property_add(obj, "smart_critical_warning", "uint8",
6335 nvme_get_smart_warning,
6336 nvme_set_smart_warning, NULL, NULL);
6337}
6338
6339static const TypeInfo nvme_info = {
6340 .name = TYPE_NVME,
6341 .parent = TYPE_PCI_DEVICE,
6342 .instance_size = sizeof(NvmeCtrl),
6343 .instance_init = nvme_instance_init,
6344 .class_init = nvme_class_init,
6345 .interfaces = (InterfaceInfo[]) {
6346 { INTERFACE_PCIE_DEVICE },
6347 { }
6348 },
6349};
6350
6351static const TypeInfo nvme_bus_info = {
6352 .name = TYPE_NVME_BUS,
6353 .parent = TYPE_BUS,
6354 .instance_size = sizeof(NvmeBus),
6355};
6356
6357static void nvme_register_types(void)
6358{
6359 type_register_static(&nvme_info);
6360 type_register_static(&nvme_bus_info);
6361}
6362
6363type_init(nvme_register_types)
6364