/*
 * QEMU NVM Express Controller
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
#include "migration/vmstate.h"

#include "nvme.h"
#include "trace.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)

static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION] = true,
    [NVME_POWER_MANAGEMENT] = true,
    [NVME_TEMPERATURE_THRESHOLD] = true,
    [NVME_ERROR_RECOVERY] = true,
    [NVME_VOLATILE_WRITE_CACHE] = true,
    [NVME_NUMBER_OF_QUEUES] = true,
    [NVME_INTERRUPT_COALESCING] = true,
    [NVME_INTERRUPT_VECTOR_CONF] = true,
    [NVME_WRITE_ATOMICITY] = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
    [NVME_TIMESTAMP] = true,
    [NVME_COMMAND_SET_PROFILE] = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
    [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
};

static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
};
252
253static void nvme_process_sq(void *opaque);
254
255static uint16_t nvme_sqid(NvmeRequest *req)
256{
257 return le16_to_cpu(req->sq->sqid);
258}
259
260static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
261 NvmeZoneState state)
262{
263 if (QTAILQ_IN_USE(zone, entry)) {
264 switch (nvme_get_zone_state(zone)) {
265 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
266 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
267 break;
268 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
269 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
270 break;
271 case NVME_ZONE_STATE_CLOSED:
272 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
273 break;
274 case NVME_ZONE_STATE_FULL:
275 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
276 default:
277 ;
278 }
279 }
280
281 nvme_set_zone_state(zone, state);
282
283 switch (state) {
284 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
285 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
286 break;
287 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
288 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
289 break;
290 case NVME_ZONE_STATE_CLOSED:
291 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
292 break;
293 case NVME_ZONE_STATE_FULL:
294 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
295 case NVME_ZONE_STATE_READ_ONLY:
296 break;
297 default:
298 zone->d.za = 0;
299 }
300}
301
/*
 * Check that activating `act` zones and opening `opn` zones does not exceed
 * the Active and Open Resources (AOR) limits of the namespace.
 */
306static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
307{
308 if (ns->params.max_active_zones != 0 &&
309 ns->nr_active_zones + act > ns->params.max_active_zones) {
310 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
311 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
312 }
313 if (ns->params.max_open_zones != 0 &&
314 ns->nr_open_zones + opn > ns->params.max_open_zones) {
315 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
316 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
317 }
318
319 return NVME_SUCCESS;
320}
321
322static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
323{
324 hwaddr hi, lo;
325
326 if (!n->cmb.cmse) {
327 return false;
328 }
329
330 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
331 hi = lo + int128_get64(n->cmb.mem.size);
332
333 return addr >= lo && addr < hi;
334}
335
336static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
337{
338 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
339 return &n->cmb.buf[addr - base];
340}
341
342static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
343{
344 hwaddr hi;
345
346 if (!n->pmr.cmse) {
347 return false;
348 }
349
350 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
351
352 return addr >= n->pmr.cba && addr < hi;
353}
354
355static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
356{
357 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
358}
359
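/*
 * Read `size` bytes at `addr` into `buf`. If the address range lies entirely
 * within the CMB or the PMR, copy directly from the controller-local buffer;
 * otherwise fall back to a DMA read from guest memory.
 */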
360static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
361{
362 hwaddr hi = addr + size - 1;
363 if (hi < addr) {
364 return 1;
365 }
366
367 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
368 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
369 return 0;
370 }
371
372 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
373 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
374 return 0;
375 }
376
377 return pci_dma_read(&n->parent_obj, addr, buf, size);
378}
379
380static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
381{
382 hwaddr hi = addr + size - 1;
383 if (hi < addr) {
384 return 1;
385 }
386
387 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
388 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
389 return 0;
390 }
391
392 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
393 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
394 return 0;
395 }
396
397 return pci_dma_write(&n->parent_obj, addr, buf, size);
398}
399
400static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
401{
402 return nsid &&
403 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
404}
405
406static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
407{
408 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
409}
410
411static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
412{
413 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
414}
415
416static void nvme_inc_cq_tail(NvmeCQueue *cq)
417{
418 cq->tail++;
419 if (cq->tail >= cq->size) {
420 cq->tail = 0;
421 cq->phase = !cq->phase;
422 }
423}
424
425static void nvme_inc_sq_head(NvmeSQueue *sq)
426{
427 sq->head = (sq->head + 1) % sq->size;
428}
429
430static uint8_t nvme_cq_full(NvmeCQueue *cq)
431{
432 return (cq->tail + 1) % cq->size == cq->head;
433}
434
435static uint8_t nvme_sq_empty(NvmeSQueue *sq)
436{
437 return sq->head == sq->tail;
438}
439
440static void nvme_irq_check(NvmeCtrl *n)
441{
442 uint32_t intms = ldl_le_p(&n->bar.intms);
443
444 if (msix_enabled(&(n->parent_obj))) {
445 return;
446 }
447 if (~intms & n->irq_status) {
448 pci_irq_assert(&n->parent_obj);
449 } else {
450 pci_irq_deassert(&n->parent_obj);
451 }
452}
453
454static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
455{
456 if (cq->irq_enabled) {
457 if (msix_enabled(&(n->parent_obj))) {
458 trace_pci_nvme_irq_msix(cq->vector);
459 msix_notify(&(n->parent_obj), cq->vector);
460 } else {
461 trace_pci_nvme_irq_pin();
462 assert(cq->vector < 32);
463 n->irq_status |= 1 << cq->vector;
464 nvme_irq_check(n);
465 }
466 } else {
467 trace_pci_nvme_irq_masked();
468 }
469}
470
471static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
472{
473 if (cq->irq_enabled) {
474 if (msix_enabled(&(n->parent_obj))) {
475 return;
476 } else {
477 assert(cq->vector < 32);
478 if (!n->cq_pending) {
479 n->irq_status &= ~(1 << cq->vector);
480 }
481 nvme_irq_check(n);
482 }
483 }
484}
485
486static void nvme_req_clear(NvmeRequest *req)
487{
488 req->ns = NULL;
489 req->opaque = NULL;
490 req->aiocb = NULL;
491 memset(&req->cqe, 0x0, sizeof(req->cqe));
492 req->status = NVME_SUCCESS;
493}
494
495static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
496{
497 if (dma) {
498 pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
499 sg->flags = NVME_SG_DMA;
500 } else {
501 qemu_iovec_init(&sg->iov, 0);
502 }
503
504 sg->flags |= NVME_SG_ALLOC;
505}
506
507static inline void nvme_sg_unmap(NvmeSg *sg)
508{
509 if (!(sg->flags & NVME_SG_ALLOC)) {
510 return;
511 }
512
513 if (sg->flags & NVME_SG_DMA) {
514 qemu_sglist_destroy(&sg->qsg);
515 } else {
516 qemu_iovec_destroy(&sg->iov);
517 }
518
519 memset(sg, 0x0, sizeof(*sg));
520}
521
/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
 */
527static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
528 NvmeSg *mdata)
529{
530 NvmeSg *dst = data;
531 uint32_t trans_len, count = ns->lbasz;
532 uint64_t offset = 0;
533 bool dma = sg->flags & NVME_SG_DMA;
534 size_t sge_len;
535 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
536 int sg_idx = 0;
537
538 assert(sg->flags & NVME_SG_ALLOC);
539
540 while (sg_len) {
541 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
542
543 trans_len = MIN(sg_len, count);
544 trans_len = MIN(trans_len, sge_len - offset);
545
546 if (dst) {
547 if (dma) {
548 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
549 trans_len);
550 } else {
551 qemu_iovec_add(&dst->iov,
552 sg->iov.iov[sg_idx].iov_base + offset,
553 trans_len);
554 }
555 }
556
557 sg_len -= trans_len;
558 count -= trans_len;
559 offset += trans_len;
560
561 if (count == 0) {
562 dst = (dst == data) ? mdata : data;
563 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
564 }
565
566 if (sge_len == offset) {
567 offset = 0;
568 sg_idx++;
569 }
570 }
571}
572
573static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
574 size_t len)
575{
576 if (!len) {
577 return NVME_SUCCESS;
578 }
579
580 trace_pci_nvme_map_addr_cmb(addr, len);
581
582 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
583 return NVME_DATA_TRAS_ERROR;
584 }
585
586 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
587
588 return NVME_SUCCESS;
589}
590
591static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
592 size_t len)
593{
594 if (!len) {
595 return NVME_SUCCESS;
596 }
597
598 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
599 return NVME_DATA_TRAS_ERROR;
600 }
601
602 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
603
604 return NVME_SUCCESS;
605}
606
607static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
608{
609 bool cmb = false, pmr = false;
610
611 if (!len) {
612 return NVME_SUCCESS;
613 }
614
615 trace_pci_nvme_map_addr(addr, len);
616
617 if (nvme_addr_is_cmb(n, addr)) {
618 cmb = true;
619 } else if (nvme_addr_is_pmr(n, addr)) {
620 pmr = true;
621 }
622
623 if (cmb || pmr) {
624 if (sg->flags & NVME_SG_DMA) {
625 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
626 }
627
628 if (sg->iov.niov + 1 > IOV_MAX) {
629 goto max_mappings_exceeded;
630 }
631
632 if (cmb) {
633 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
634 } else {
635 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
636 }
637 }
638
639 if (!(sg->flags & NVME_SG_DMA)) {
640 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
641 }
642
643 if (sg->qsg.nsg + 1 > IOV_MAX) {
644 goto max_mappings_exceeded;
645 }
646
647 qemu_sglist_add(&sg->qsg, addr, len);
648
649 return NVME_SUCCESS;
650
651max_mappings_exceeded:
652 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
653 "number of mappings exceed 1024");
654 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
655}
656
657static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
658{
659 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
660}
661
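/*
 * Map a PRP1/PRP2 pair into an NvmeSg. PRP1 always points at the first (page
 * or partial page) of the transfer; depending on the remaining length, PRP2
 * is either a second data pointer or the address of a PRP list.
 */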
662static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
663 uint64_t prp2, uint32_t len)
664{
665 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
666 trans_len = MIN(len, trans_len);
667 int num_prps = (len >> n->page_bits) + 1;
668 uint16_t status;
669 int ret;
670
671 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
672
673 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
674
675 status = nvme_map_addr(n, sg, prp1, trans_len);
676 if (status) {
677 goto unmap;
678 }
679
680 len -= trans_len;
681 if (len) {
682 if (len > n->page_size) {
683 uint64_t prp_list[n->max_prp_ents];
684 uint32_t nents, prp_trans;
685 int i = 0;
686
            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset. Hence, we need to calculate the number of entries in
             * the first page.
             */
692 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
693 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
694 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
695 if (ret) {
696 trace_pci_nvme_err_addr_read(prp2);
697 status = NVME_DATA_TRAS_ERROR;
698 goto unmap;
699 }
700 while (len != 0) {
701 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
702
703 if (i == nents - 1 && len > n->page_size) {
704 if (unlikely(prp_ent & (n->page_size - 1))) {
705 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
706 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
707 goto unmap;
708 }
709
710 i = 0;
711 nents = (len + n->page_size - 1) >> n->page_bits;
712 nents = MIN(nents, n->max_prp_ents);
713 prp_trans = nents * sizeof(uint64_t);
714 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
715 prp_trans);
716 if (ret) {
717 trace_pci_nvme_err_addr_read(prp_ent);
718 status = NVME_DATA_TRAS_ERROR;
719 goto unmap;
720 }
721 prp_ent = le64_to_cpu(prp_list[i]);
722 }
723
724 if (unlikely(prp_ent & (n->page_size - 1))) {
725 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
726 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
727 goto unmap;
728 }
729
730 trans_len = MIN(len, n->page_size);
731 status = nvme_map_addr(n, sg, prp_ent, trans_len);
732 if (status) {
733 goto unmap;
734 }
735
736 len -= trans_len;
737 i++;
738 }
739 } else {
740 if (unlikely(prp2 & (n->page_size - 1))) {
741 trace_pci_nvme_err_invalid_prp2_align(prp2);
742 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
743 goto unmap;
744 }
745 status = nvme_map_addr(n, sg, prp2, len);
746 if (status) {
747 goto unmap;
748 }
749 }
750 }
751
752 return NVME_SUCCESS;
753
754unmap:
755 nvme_sg_unmap(sg);
756 return status;
757}
758
/*
 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
 * number of bytes mapped in 'len'.
 */
763static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
764 NvmeSglDescriptor *segment, uint64_t nsgld,
765 size_t *len, NvmeCmd *cmd)
766{
767 dma_addr_t addr, trans_len;
768 uint32_t dlen;
769 uint16_t status;
770
771 for (int i = 0; i < nsgld; i++) {
772 uint8_t type = NVME_SGL_TYPE(segment[i].type);
773
774 switch (type) {
775 case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
776 if (cmd->opcode == NVME_CMD_WRITE) {
777 continue;
778 }
779 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
780 break;
781 case NVME_SGL_DESCR_TYPE_SEGMENT:
782 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
783 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
784 default:
785 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
786 }
787
788 dlen = le32_to_cpu(segment[i].len);
789
790 if (!dlen) {
791 continue;
792 }
793
794 if (*len == 0) {
            /*
             * All data has been mapped, but the SGL contains additional
             * segments and/or descriptors. The controller might accept
             * ignoring the rest of the SGL.
             */
800 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
801 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
802 break;
803 }
804
805 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
806 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
807 }
808
809 trans_len = MIN(*len, dlen);
810
811 if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
812 goto next;
813 }
814
815 addr = le64_to_cpu(segment[i].addr);
816
817 if (UINT64_MAX - addr < dlen) {
818 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
819 }
820
821 status = nvme_map_addr(n, sg, addr, trans_len);
822 if (status) {
823 return status;
824 }
825
826next:
827 *len -= trans_len;
828 }
829
830 return NVME_SUCCESS;
831}
832
833static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
834 size_t len, NvmeCmd *cmd)
835{
    /*
     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
     * dynamically allocating a potentially huge SGL. The spec allows the SGL
     * to be larger (as in number of bytes required to describe the SGL
     * descriptors and segment chain) than the command transfer size, so it is
     * not bounded by MDTS.
     */
843 const int SEG_CHUNK_SIZE = 256;
844
845 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
846 uint64_t nsgld;
847 uint32_t seg_len;
848 uint16_t status;
849 hwaddr addr;
850 int ret;
851
852 sgld = &sgl;
853 addr = le64_to_cpu(sgl.addr);
854
855 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
856
857 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
858
    /*
     * If the entire transfer can be described with a single data block it can
     * be mapped directly.
     */
863 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
864 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
865 if (status) {
866 goto unmap;
867 }
868
869 goto out;
870 }
871
872 for (;;) {
873 switch (NVME_SGL_TYPE(sgld->type)) {
874 case NVME_SGL_DESCR_TYPE_SEGMENT:
875 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
876 break;
877 default:
878 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
879 }
880
881 seg_len = le32_to_cpu(sgld->len);
882
        /* check the length of the (Last) Segment descriptor */
884 if ((!seg_len || seg_len & 0xf) &&
885 (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
886 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
887 }
888
889 if (UINT64_MAX - addr < seg_len) {
890 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
891 }
892
893 nsgld = seg_len / sizeof(NvmeSglDescriptor);
894
895 while (nsgld > SEG_CHUNK_SIZE) {
896 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
897 trace_pci_nvme_err_addr_read(addr);
898 status = NVME_DATA_TRAS_ERROR;
899 goto unmap;
900 }
901
902 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
903 &len, cmd);
904 if (status) {
905 goto unmap;
906 }
907
908 nsgld -= SEG_CHUNK_SIZE;
909 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
910 }
911
912 ret = nvme_addr_read(n, addr, segment, nsgld *
913 sizeof(NvmeSglDescriptor));
914 if (ret) {
915 trace_pci_nvme_err_addr_read(addr);
916 status = NVME_DATA_TRAS_ERROR;
917 goto unmap;
918 }
919
920 last_sgld = &segment[nsgld - 1];
921
        /*
         * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
         * then we are done.
         */
926 switch (NVME_SGL_TYPE(last_sgld->type)) {
927 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
928 case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
929 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
930 if (status) {
931 goto unmap;
932 }
933
934 goto out;
935
936 default:
937 break;
938 }
939
        /*
         * If the last descriptor was not a Data Block or Bit Bucket, then the
         * current segment must not be a Last Segment.
         */
944 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
945 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
946 goto unmap;
947 }
948
949 sgld = last_sgld;
950 addr = le64_to_cpu(sgld->addr);
951
        /*
         * Do not map the last descriptor; it is a Segment or Last Segment
         * descriptor and is handled by the next iteration.
         */
956 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
957 if (status) {
958 goto unmap;
959 }
960 }
961
962out:
    /* if there is any residual left in len, the SGL was too short */
964 if (len) {
965 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
966 goto unmap;
967 }
968
969 return NVME_SUCCESS;
970
971unmap:
972 nvme_sg_unmap(sg);
973 return status;
974}
975
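/*
 * Map the data pointer (DPTR) of a command, dispatching on the PSDT field to
 * either PRP or SGL mapping.
 */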
976uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
977 NvmeCmd *cmd)
978{
979 uint64_t prp1, prp2;
980
981 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
982 case NVME_PSDT_PRP:
983 prp1 = le64_to_cpu(cmd->dptr.prp1);
984 prp2 = le64_to_cpu(cmd->dptr.prp2);
985
986 return nvme_map_prp(n, sg, prp1, prp2, len);
987 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
988 case NVME_PSDT_SGL_MPTR_SGL:
989 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
990 default:
991 return NVME_INVALID_FIELD;
992 }
993}
994
995static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
996 NvmeCmd *cmd)
997{
998 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
999 hwaddr mptr = le64_to_cpu(cmd->mptr);
1000 uint16_t status;
1001
1002 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1003 NvmeSglDescriptor sgl;
1004
1005 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1006 return NVME_DATA_TRAS_ERROR;
1007 }
1008
1009 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1010 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1011 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1012 }
1013
1014 return status;
1015 }
1016
1017 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1018 status = nvme_map_addr(n, sg, mptr, len);
1019 if (status) {
1020 nvme_sg_unmap(sg);
1021 }
1022
1023 return status;
1024}
1025
1026static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1027{
1028 NvmeNamespace *ns = req->ns;
1029 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1030 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1031 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1032 size_t len = nvme_l2b(ns, nlb);
1033 uint16_t status;
1034
1035 if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1036 NvmeSg sg;
1037
1038 len += nvme_m2b(ns, nlb);
1039
1040 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1041 if (status) {
1042 return status;
1043 }
1044
1045 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1046 nvme_sg_split(&sg, ns, &req->sg, NULL);
1047 nvme_sg_unmap(&sg);
1048
1049 return NVME_SUCCESS;
1050 }
1051
1052 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1053}
1054
1055static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1056{
1057 NvmeNamespace *ns = req->ns;
1058 size_t len = nvme_m2b(ns, nlb);
1059 uint16_t status;
1060
1061 if (nvme_ns_ext(ns)) {
1062 NvmeSg sg;
1063
1064 len += nvme_l2b(ns, nlb);
1065
1066 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1067 if (status) {
1068 return status;
1069 }
1070
1071 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1072 nvme_sg_split(&sg, ns, NULL, &req->sg);
1073 nvme_sg_unmap(&sg);
1074
1075 return NVME_SUCCESS;
1076 }
1077
1078 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1079}
1080
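/*
 * Transfer `len` bytes to or from an interleaved (extended LBA) mapping,
 * copying `bytes` bytes at a time and then skipping `skip_bytes` (i.e. the
 * data or metadata portion of each logical block).
 */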
1081static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1082 uint32_t len, uint32_t bytes,
1083 int32_t skip_bytes, int64_t offset,
1084 NvmeTxDirection dir)
1085{
1086 hwaddr addr;
1087 uint32_t trans_len, count = bytes;
1088 bool dma = sg->flags & NVME_SG_DMA;
1089 int64_t sge_len;
1090 int sg_idx = 0;
1091 int ret;
1092
1093 assert(sg->flags & NVME_SG_ALLOC);
1094
1095 while (len) {
1096 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1097
1098 if (sge_len - offset < 0) {
1099 offset -= sge_len;
1100 sg_idx++;
1101 continue;
1102 }
1103
1104 if (sge_len == offset) {
1105 offset = 0;
1106 sg_idx++;
1107 continue;
1108 }
1109
1110 trans_len = MIN(len, count);
1111 trans_len = MIN(trans_len, sge_len - offset);
1112
1113 if (dma) {
1114 addr = sg->qsg.sg[sg_idx].base + offset;
1115 } else {
1116 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1117 }
1118
1119 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1120 ret = nvme_addr_read(n, addr, ptr, trans_len);
1121 } else {
1122 ret = nvme_addr_write(n, addr, ptr, trans_len);
1123 }
1124
1125 if (ret) {
1126 return NVME_DATA_TRAS_ERROR;
1127 }
1128
1129 ptr += trans_len;
1130 len -= trans_len;
1131 count -= trans_len;
1132 offset += trans_len;
1133
1134 if (count == 0) {
1135 count = bytes;
1136 offset += skip_bytes;
1137 }
1138 }
1139
1140 return NVME_SUCCESS;
1141}
1142
1143static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
1144 NvmeTxDirection dir)
1145{
1146 assert(sg->flags & NVME_SG_ALLOC);
1147
1148 if (sg->flags & NVME_SG_DMA) {
1149 uint64_t residual;
1150
1151 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1152 residual = dma_buf_write(ptr, len, &sg->qsg);
1153 } else {
1154 residual = dma_buf_read(ptr, len, &sg->qsg);
1155 }
1156
1157 if (unlikely(residual)) {
1158 trace_pci_nvme_err_invalid_dma();
1159 return NVME_INVALID_FIELD | NVME_DNR;
1160 }
1161 } else {
1162 size_t bytes;
1163
1164 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1165 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1166 } else {
1167 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1168 }
1169
1170 if (unlikely(bytes != len)) {
1171 trace_pci_nvme_err_invalid_dma();
1172 return NVME_INVALID_FIELD | NVME_DNR;
1173 }
1174 }
1175
1176 return NVME_SUCCESS;
1177}
1178
1179static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1180 NvmeRequest *req)
1181{
1182 uint16_t status;
1183
1184 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1185 if (status) {
1186 return status;
1187 }
1188
1189 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1190}
1191
1192static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1193 NvmeRequest *req)
1194{
1195 uint16_t status;
1196
1197 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1198 if (status) {
1199 return status;
1200 }
1201
1202 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1203}
1204
1205uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1206 NvmeTxDirection dir, NvmeRequest *req)
1207{
1208 NvmeNamespace *ns = req->ns;
1209 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1210 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1211 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1212
1213 if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1214 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1215 ns->lbaf.ms, 0, dir);
1216 }
1217
1218 return nvme_tx(n, &req->sg, ptr, len, dir);
1219}
1220
1221uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1222 NvmeTxDirection dir, NvmeRequest *req)
1223{
1224 NvmeNamespace *ns = req->ns;
1225 uint16_t status;
1226
1227 if (nvme_ns_ext(ns)) {
1228 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1229 ns->lbasz, ns->lbasz, dir);
1230 }
1231
1232 nvme_sg_unmap(&req->sg);
1233
1234 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1235 if (status) {
1236 return status;
1237 }
1238
1239 return nvme_tx(n, &req->sg, ptr, len, dir);
1240}
1241
1242static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1243 BlockCompletionFunc *cb, NvmeRequest *req)
1244{
1245 assert(req->sg.flags & NVME_SG_ALLOC);
1246
1247 if (req->sg.flags & NVME_SG_DMA) {
1248 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1249 cb, req);
1250 } else {
1251 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1252 }
1253}
1254
1255static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1256 BlockCompletionFunc *cb, NvmeRequest *req)
1257{
1258 assert(req->sg.flags & NVME_SG_ALLOC);
1259
1260 if (req->sg.flags & NVME_SG_DMA) {
1261 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1262 cb, req);
1263 } else {
1264 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1265 }
1266}
1267
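/*
 * Post completion queue entries for finished requests to guest memory and
 * raise the completion queue interrupt if anything was posted.
 */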
1268static void nvme_post_cqes(void *opaque)
1269{
1270 NvmeCQueue *cq = opaque;
1271 NvmeCtrl *n = cq->ctrl;
1272 NvmeRequest *req, *next;
1273 bool pending = cq->head != cq->tail;
1274 int ret;
1275
1276 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1277 NvmeSQueue *sq;
1278 hwaddr addr;
1279
1280 if (nvme_cq_full(cq)) {
1281 break;
1282 }
1283
1284 sq = req->sq;
1285 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1286 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1287 req->cqe.sq_head = cpu_to_le16(sq->head);
1288 addr = cq->dma_addr + cq->tail * n->cqe_size;
1289 ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1290 sizeof(req->cqe));
1291 if (ret) {
1292 trace_pci_nvme_err_addr_write(addr);
1293 trace_pci_nvme_err_cfs();
1294 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1295 break;
1296 }
1297 QTAILQ_REMOVE(&cq->req_list, req, entry);
1298 nvme_inc_cq_tail(cq);
1299 nvme_sg_unmap(&req->sg);
1300 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1301 }
1302 if (cq->tail != cq->head) {
1303 if (cq->irq_enabled && !pending) {
1304 n->cq_pending++;
1305 }
1306
1307 nvme_irq_assert(n, cq);
1308 }
1309}
1310
1311static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1312{
1313 assert(cq->cqid == req->sq->cqid);
1314 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1315 le32_to_cpu(req->cqe.result),
1316 le32_to_cpu(req->cqe.dw1),
1317 req->status);
1318
1319 if (req->status) {
1320 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1321 req->status, req->cmd.opcode);
1322 }
1323
1324 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1325 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1326 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1327}
1328
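/*
 * Complete outstanding Asynchronous Event Requests with any queued events
 * that are not currently masked.
 */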
1329static void nvme_process_aers(void *opaque)
1330{
1331 NvmeCtrl *n = opaque;
1332 NvmeAsyncEvent *event, *next;
1333
1334 trace_pci_nvme_process_aers(n->aer_queued);
1335
1336 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1337 NvmeRequest *req;
1338 NvmeAerResult *result;
1339
        /* can't post cqe if there is nothing to complete */
1341 if (!n->outstanding_aers) {
1342 trace_pci_nvme_no_outstanding_aers();
1343 break;
1344 }
1345
        /* ignore if masked (cqe posted, but event not cleared) */
1347 if (n->aer_mask & (1 << event->result.event_type)) {
1348 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1349 continue;
1350 }
1351
1352 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1353 n->aer_queued--;
1354
1355 n->aer_mask |= 1 << event->result.event_type;
1356 n->outstanding_aers--;
1357
1358 req = n->aer_reqs[n->outstanding_aers];
1359
1360 result = (NvmeAerResult *) &req->cqe.result;
1361 result->event_type = event->result.event_type;
1362 result->event_info = event->result.event_info;
1363 result->log_page = event->result.log_page;
1364 g_free(event);
1365
1366 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1367 result->log_page);
1368
1369 nvme_enqueue_req_completion(&n->admin_cq, req);
1370 }
1371}
1372
1373static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1374 uint8_t event_info, uint8_t log_page)
1375{
1376 NvmeAsyncEvent *event;
1377
1378 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1379
1380 if (n->aer_queued == n->params.aer_max_queued) {
1381 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1382 return;
1383 }
1384
1385 event = g_new(NvmeAsyncEvent, 1);
1386 event->result = (NvmeAerResult) {
1387 .event_type = event_type,
1388 .event_info = event_info,
1389 .log_page = log_page,
1390 };
1391
1392 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1393 n->aer_queued++;
1394
1395 nvme_process_aers(n);
1396}
1397
1398static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1399{
1400 uint8_t aer_info;
1401
    /* ignore events that are masked in the SMART / Health async event config */
1403 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1404 return;
1405 }
1406
1407 switch (event) {
1408 case NVME_SMART_SPARE:
1409 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1410 break;
1411 case NVME_SMART_TEMPERATURE:
1412 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1413 break;
1414 case NVME_SMART_RELIABILITY:
1415 case NVME_SMART_MEDIA_READ_ONLY:
1416 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1417 case NVME_SMART_PMR_UNRELIABLE:
1418 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1419 break;
1420 default:
1421 return;
1422 }
1423
1424 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1425}
1426
1427static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1428{
1429 n->aer_mask &= ~(1 << event_type);
1430 if (!QTAILQ_EMPTY(&n->aer_queue)) {
1431 nvme_process_aers(n);
1432 }
1433}
1434
1435static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1436{
1437 uint8_t mdts = n->params.mdts;
1438
1439 if (mdts && len > n->page_size << mdts) {
1440 trace_pci_nvme_err_mdts(len);
1441 return NVME_INVALID_FIELD | NVME_DNR;
1442 }
1443
1444 return NVME_SUCCESS;
1445}
1446
1447static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1448 uint32_t nlb)
1449{
1450 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1451
1452 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1453 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1454 return NVME_LBA_RANGE | NVME_DNR;
1455 }
1456
1457 return NVME_SUCCESS;
1458}
1459
1460static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1461 uint32_t nlb, int flags)
1462{
1463 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1464
1465 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1466 int64_t offset = nvme_l2b(ns, slba);
1467 int ret;
1468
    /*
     * Check the allocation/zero status of the range in chunks;
     * bdrv_block_status() may only cover part of the remaining range in each
     * call (pnum). Return 0 if all blocks have the requested status flags,
     * 1 if at least one block does not, and a negative errno on error.
     */
1475 do {
1476 bytes -= pnum;
1477
1478 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1479 if (ret < 0) {
1480 return ret;
1481 }
1482
1483
1484 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1485 !!(ret & BDRV_BLOCK_ZERO));
1486
1487 if (!(ret & flags)) {
1488 return 1;
1489 }
1490
1491 offset += pnum;
1492 } while (pnum != bytes);
1493
1494 return 0;
1495}
1496
1497static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1498 uint32_t nlb)
1499{
1500 int ret;
1501 Error *err = NULL;
1502
1503 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1504 if (ret) {
1505 if (ret < 0) {
1506 error_setg_errno(&err, -ret, "unable to get block status");
1507 error_report_err(err);
1508
1509 return NVME_INTERNAL_DEV_ERROR;
1510 }
1511
1512 return NVME_DULB;
1513 }
1514
1515 return NVME_SUCCESS;
1516}
1517
1518static void nvme_aio_err(NvmeRequest *req, int ret)
1519{
1520 uint16_t status = NVME_SUCCESS;
1521 Error *local_err = NULL;
1522
1523 switch (req->cmd.opcode) {
1524 case NVME_CMD_READ:
1525 status = NVME_UNRECOVERED_READ;
1526 break;
1527 case NVME_CMD_FLUSH:
1528 case NVME_CMD_WRITE:
1529 case NVME_CMD_WRITE_ZEROES:
1530 case NVME_CMD_ZONE_APPEND:
1531 status = NVME_WRITE_FAULT;
1532 break;
1533 default:
1534 status = NVME_INTERNAL_DEV_ERROR;
1535 break;
1536 }
1537
1538 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1539
1540 error_setg_errno(&local_err, -ret, "aio failed");
1541 error_report_err(local_err);
1542
    /*
     * Set the command status code to the first encountered error but allow a
     * subsequent Internal Device Error to trump it.
     */
1547 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1548 return;
1549 }
1550
1551 req->status = status;
1552}
1553
1554static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1555{
1556 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1557 slba / ns->zone_size;
1558}
1559
1560static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1561{
1562 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1563
1564 if (zone_idx >= ns->num_zones) {
1565 return NULL;
1566 }
1567
1568 return &ns->zone_array[zone_idx];
1569}
1570
1571static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1572{
1573 uint64_t zslba = zone->d.zslba;
1574
1575 switch (nvme_get_zone_state(zone)) {
1576 case NVME_ZONE_STATE_EMPTY:
1577 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1578 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1579 case NVME_ZONE_STATE_CLOSED:
1580 return NVME_SUCCESS;
1581 case NVME_ZONE_STATE_FULL:
1582 trace_pci_nvme_err_zone_is_full(zslba);
1583 return NVME_ZONE_FULL;
1584 case NVME_ZONE_STATE_OFFLINE:
1585 trace_pci_nvme_err_zone_is_offline(zslba);
1586 return NVME_ZONE_OFFLINE;
1587 case NVME_ZONE_STATE_READ_ONLY:
1588 trace_pci_nvme_err_zone_is_read_only(zslba);
1589 return NVME_ZONE_READ_ONLY;
1590 default:
1591 assert(false);
1592 }
1593
1594 return NVME_INTERNAL_DEV_ERROR;
1595}
1596
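/*
 * Check that a write of `nlb` blocks starting at `slba` is valid for the
 * given zone: the zone must be writeable, the write must start at the zone
 * write pointer and must not cross the zone write boundary.
 */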
1597static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1598 uint64_t slba, uint32_t nlb)
1599{
1600 uint64_t zcap = nvme_zone_wr_boundary(zone);
1601 uint16_t status;
1602
1603 status = nvme_check_zone_state_for_write(zone);
1604 if (status) {
1605 return status;
1606 }
1607
1608 if (unlikely(slba != zone->w_ptr)) {
1609 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1610 return NVME_ZONE_INVALID_WRITE;
1611 }
1612
1613 if (unlikely((slba + nlb) > zcap)) {
1614 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1615 return NVME_ZONE_BOUNDARY_ERROR;
1616 }
1617
1618 return NVME_SUCCESS;
1619}
1620
1621static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1622{
1623 switch (nvme_get_zone_state(zone)) {
1624 case NVME_ZONE_STATE_EMPTY:
1625 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1626 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1627 case NVME_ZONE_STATE_FULL:
1628 case NVME_ZONE_STATE_CLOSED:
1629 case NVME_ZONE_STATE_READ_ONLY:
1630 return NVME_SUCCESS;
1631 case NVME_ZONE_STATE_OFFLINE:
1632 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1633 return NVME_ZONE_OFFLINE;
1634 default:
1635 assert(false);
1636 }
1637
1638 return NVME_INTERNAL_DEV_ERROR;
1639}
1640
1641static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1642 uint32_t nlb)
1643{
1644 NvmeZone *zone;
1645 uint64_t bndry, end;
1646 uint16_t status;
1647
1648 zone = nvme_get_zone_by_slba(ns, slba);
1649 assert(zone);
1650
1651 bndry = nvme_zone_rd_boundary(ns, zone);
1652 end = slba + nlb;
1653
1654 status = nvme_check_zone_state_for_read(zone);
1655 if (status) {
1656 ;
1657 } else if (unlikely(end > bndry)) {
1658 if (!ns->params.cross_zone_read) {
1659 status = NVME_ZONE_BOUNDARY_ERROR;
1660 } else {
            /*
             * Read across zone boundary - check that all subsequent
             * zones that are being read have an appropriate state.
             */
1665 do {
1666 zone++;
1667 status = nvme_check_zone_state_for_read(zone);
1668 if (status) {
1669 break;
1670 }
1671 } while (end > nvme_zone_rd_boundary(ns, zone));
1672 }
1673 }
1674
1675 return status;
1676}
1677
1678static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1679{
1680 switch (nvme_get_zone_state(zone)) {
1681 case NVME_ZONE_STATE_FULL:
1682 return NVME_SUCCESS;
1683
1684 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1685 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1686 nvme_aor_dec_open(ns);
        /* fallthrough */
1688 case NVME_ZONE_STATE_CLOSED:
1689 nvme_aor_dec_active(ns);
        /* fallthrough */
1691 case NVME_ZONE_STATE_EMPTY:
1692 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1693 return NVME_SUCCESS;
1694
1695 default:
1696 return NVME_ZONE_INVAL_TRANSITION;
1697 }
1698}
1699
1700static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1701{
1702 switch (nvme_get_zone_state(zone)) {
1703 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1704 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1705 nvme_aor_dec_open(ns);
1706 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        /* fallthrough */
1708 case NVME_ZONE_STATE_CLOSED:
1709 return NVME_SUCCESS;
1710
1711 default:
1712 return NVME_ZONE_INVAL_TRANSITION;
1713 }
1714}
1715
1716static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1717{
1718 switch (nvme_get_zone_state(zone)) {
1719 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1720 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1721 nvme_aor_dec_open(ns);
        /* fallthrough */
1723 case NVME_ZONE_STATE_CLOSED:
1724 nvme_aor_dec_active(ns);
        /* fallthrough */
1726 case NVME_ZONE_STATE_FULL:
1727 zone->w_ptr = zone->d.zslba;
1728 zone->d.wp = zone->w_ptr;
1729 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        /* fallthrough */
1731 case NVME_ZONE_STATE_EMPTY:
1732 return NVME_SUCCESS;
1733
1734 default:
1735 return NVME_ZONE_INVAL_TRANSITION;
1736 }
1737}
1738
1739static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1740{
1741 NvmeZone *zone;
1742
1743 if (ns->params.max_open_zones &&
1744 ns->nr_open_zones == ns->params.max_open_zones) {
1745 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1746 if (zone) {
            /*
             * Automatically close this implicitly open zone.
             */
1750 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1751 nvme_zrm_close(ns, zone);
1752 }
1753 }
1754}
1755
1756enum {
1757 NVME_ZRM_AUTO = 1 << 0,
1758};
1759
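/*
 * Zone Resource Management: transition a zone to the (Implicitly or
 * Explicitly) Opened state, activating it first if necessary and enforcing
 * the open/active resource limits.
 */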
1760static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1761 NvmeZone *zone, int flags)
1762{
1763 int act = 0;
1764 uint16_t status;
1765
1766 switch (nvme_get_zone_state(zone)) {
1767 case NVME_ZONE_STATE_EMPTY:
1768 act = 1;
1769
        /* fallthrough */
1771
1772 case NVME_ZONE_STATE_CLOSED:
1773 if (n->params.auto_transition_zones) {
1774 nvme_zrm_auto_transition_zone(ns);
1775 }
1776 status = nvme_aor_check(ns, act, 1);
1777 if (status) {
1778 return status;
1779 }
1780
1781 if (act) {
1782 nvme_aor_inc_active(ns);
1783 }
1784
1785 nvme_aor_inc_open(ns);
1786
1787 if (flags & NVME_ZRM_AUTO) {
1788 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1789 return NVME_SUCCESS;
1790 }
1791
        /* fallthrough */
1793
1794 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1795 if (flags & NVME_ZRM_AUTO) {
1796 return NVME_SUCCESS;
1797 }
1798
1799 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1800
        /* fallthrough */
1802
1803 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1804 return NVME_SUCCESS;
1805
1806 default:
1807 return NVME_ZONE_INVAL_TRANSITION;
1808 }
1809}
1810
1811static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
1812 NvmeZone *zone)
1813{
1814 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
1815}
1816
1817static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
1818 NvmeZone *zone)
1819{
1820 return nvme_zrm_open_flags(n, ns, zone, 0);
1821}
1822
1823static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1824 uint32_t nlb)
1825{
1826 zone->d.wp += nlb;
1827
1828 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1829 nvme_zrm_finish(ns, zone);
1830 }
1831}
1832
1833static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1834{
1835 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1836 NvmeZone *zone;
1837 uint64_t slba;
1838 uint32_t nlb;
1839
1840 slba = le64_to_cpu(rw->slba);
1841 nlb = le16_to_cpu(rw->nlb) + 1;
1842 zone = nvme_get_zone_by_slba(ns, slba);
1843 assert(zone);
1844
1845 nvme_advance_zone_wp(ns, zone, nlb);
1846}
1847
1848static inline bool nvme_is_write(NvmeRequest *req)
1849{
1850 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1851
1852 return rw->opcode == NVME_CMD_WRITE ||
1853 rw->opcode == NVME_CMD_ZONE_APPEND ||
1854 rw->opcode == NVME_CMD_WRITE_ZEROES;
1855}
1856
1857static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
1858{
1859 return qemu_get_aio_context();
1860}
1861
1862static void nvme_misc_cb(void *opaque, int ret)
1863{
1864 NvmeRequest *req = opaque;
1865
1866 trace_pci_nvme_misc_cb(nvme_cid(req));
1867
1868 if (ret) {
1869 nvme_aio_err(req, ret);
1870 }
1871
1872 nvme_enqueue_req_completion(nvme_cq(req), req);
1873}
1874
1875void nvme_rw_complete_cb(void *opaque, int ret)
1876{
1877 NvmeRequest *req = opaque;
1878 NvmeNamespace *ns = req->ns;
1879 BlockBackend *blk = ns->blkconf.blk;
1880 BlockAcctCookie *acct = &req->acct;
1881 BlockAcctStats *stats = blk_get_stats(blk);
1882
1883 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1884
1885 if (ret) {
1886 block_acct_failed(stats, acct);
1887 nvme_aio_err(req, ret);
1888 } else {
1889 block_acct_done(stats, acct);
1890 }
1891
1892 if (ns->params.zoned && nvme_is_write(req)) {
1893 nvme_finalize_zoned_write(ns, req);
1894 }
1895
1896 nvme_enqueue_req_completion(nvme_cq(req), req);
1897}
1898
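/*
 * Completion callback for the data portion of a read/write; if the namespace
 * has metadata, kick off the separate metadata transfer before completing the
 * request via nvme_rw_complete_cb().
 */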
1899static void nvme_rw_cb(void *opaque, int ret)
1900{
1901 NvmeRequest *req = opaque;
1902 NvmeNamespace *ns = req->ns;
1903
1904 BlockBackend *blk = ns->blkconf.blk;
1905
1906 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1907
1908 if (ret) {
1909 goto out;
1910 }
1911
1912 if (ns->lbaf.ms) {
1913 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1914 uint64_t slba = le64_to_cpu(rw->slba);
1915 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
1916 uint64_t offset = nvme_moff(ns, slba);
1917
1918 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
1919 size_t mlen = nvme_m2b(ns, nlb);
1920
1921 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
1922 BDRV_REQ_MAY_UNMAP,
1923 nvme_rw_complete_cb, req);
1924 return;
1925 }
1926
1927 if (nvme_ns_ext(ns) || req->cmd.mptr) {
1928 uint16_t status;
1929
1930 nvme_sg_unmap(&req->sg);
1931 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
1932 if (status) {
1933 ret = -EFAULT;
1934 goto out;
1935 }
1936
1937 if (req->cmd.opcode == NVME_CMD_READ) {
1938 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
1939 }
1940
1941 return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
1942 }
1943 }
1944
1945out:
1946 nvme_rw_complete_cb(req, ret);
1947}
1948
1949static void nvme_verify_cb(void *opaque, int ret)
1950{
1951 NvmeBounceContext *ctx = opaque;
1952 NvmeRequest *req = ctx->req;
1953 NvmeNamespace *ns = req->ns;
1954 BlockBackend *blk = ns->blkconf.blk;
1955 BlockAcctCookie *acct = &req->acct;
1956 BlockAcctStats *stats = blk_get_stats(blk);
1957 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1958 uint64_t slba = le64_to_cpu(rw->slba);
1959 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
1960 uint16_t apptag = le16_to_cpu(rw->apptag);
1961 uint16_t appmask = le16_to_cpu(rw->appmask);
1962 uint32_t reftag = le32_to_cpu(rw->reftag);
1963 uint16_t status;
1964
1965 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
1966
1967 if (ret) {
1968 block_acct_failed(stats, acct);
1969 nvme_aio_err(req, ret);
1970 goto out;
1971 }
1972
1973 block_acct_done(stats, acct);
1974
1975 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
1976 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
1977 ctx->mdata.iov.size, slba);
1978 if (status) {
1979 req->status = status;
1980 goto out;
1981 }
1982
1983 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
1984 ctx->mdata.bounce, ctx->mdata.iov.size,
1985 prinfo, slba, apptag, appmask, &reftag);
1986 }
1987
1988out:
1989 qemu_iovec_destroy(&ctx->data.iov);
1990 g_free(ctx->data.bounce);
1991
1992 qemu_iovec_destroy(&ctx->mdata.iov);
1993 g_free(ctx->mdata.bounce);
1994
1995 g_free(ctx);
1996
1997 nvme_enqueue_req_completion(nvme_cq(req), req);
1998}
1999
2000
2001static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2002{
2003 NvmeBounceContext *ctx = opaque;
2004 NvmeRequest *req = ctx->req;
2005 NvmeNamespace *ns = req->ns;
2006 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2007 uint64_t slba = le64_to_cpu(rw->slba);
2008 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2009 size_t mlen = nvme_m2b(ns, nlb);
2010 uint64_t offset = nvme_moff(ns, slba);
2011 BlockBackend *blk = ns->blkconf.blk;
2012
2013 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2014
2015 if (ret) {
2016 goto out;
2017 }
2018
2019 ctx->mdata.bounce = g_malloc(mlen);
2020
2021 qemu_iovec_reset(&ctx->mdata.iov);
2022 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2023
2024 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2025 nvme_verify_cb, ctx);
2026 return;
2027
2028out:
2029 nvme_verify_cb(ctx, ret);
2030}
2031
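/* bounce buffers for the data and metadata portions of a Compare command */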
2032struct nvme_compare_ctx {
2033 struct {
2034 QEMUIOVector iov;
2035 uint8_t *bounce;
2036 } data;
2037
2038 struct {
2039 QEMUIOVector iov;
2040 uint8_t *bounce;
2041 } mdata;
2042};
2043
2044static void nvme_compare_mdata_cb(void *opaque, int ret)
2045{
2046 NvmeRequest *req = opaque;
2047 NvmeNamespace *ns = req->ns;
2048 NvmeCtrl *n = nvme_ctrl(req);
2049 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2050 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2051 uint16_t apptag = le16_to_cpu(rw->apptag);
2052 uint16_t appmask = le16_to_cpu(rw->appmask);
2053 uint32_t reftag = le32_to_cpu(rw->reftag);
2054 struct nvme_compare_ctx *ctx = req->opaque;
2055 g_autofree uint8_t *buf = NULL;
2056 BlockBackend *blk = ns->blkconf.blk;
2057 BlockAcctCookie *acct = &req->acct;
2058 BlockAcctStats *stats = blk_get_stats(blk);
2059 uint16_t status = NVME_SUCCESS;
2060
2061 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2062
2063 if (ret) {
2064 block_acct_failed(stats, acct);
2065 nvme_aio_err(req, ret);
2066 goto out;
2067 }
2068
2069 buf = g_malloc(ctx->mdata.iov.size);
2070
2071 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2072 NVME_TX_DIRECTION_TO_DEVICE, req);
2073 if (status) {
2074 req->status = status;
2075 goto out;
2076 }
2077
2078 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2079 uint64_t slba = le64_to_cpu(rw->slba);
2080 uint8_t *bufp;
2081 uint8_t *mbufp = ctx->mdata.bounce;
2082 uint8_t *end = mbufp + ctx->mdata.iov.size;
2083 int16_t pil = 0;
2084
2085 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2086 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2087 slba, apptag, appmask, &reftag);
2088 if (status) {
2089 req->status = status;
2090 goto out;
2091 }
2092
        /*
         * When formatted with protection information, do not compare the DIF
         * tuple.
         */
2097 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2098 pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
2099 }
2100
2101 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2102 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2103 req->status = NVME_CMP_FAILURE;
2104 goto out;
2105 }
2106 }
2107
2108 goto out;
2109 }
2110
2111 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2112 req->status = NVME_CMP_FAILURE;
2113 goto out;
2114 }
2115
2116 block_acct_done(stats, acct);
2117
2118out:
2119 qemu_iovec_destroy(&ctx->data.iov);
2120 g_free(ctx->data.bounce);
2121
2122 qemu_iovec_destroy(&ctx->mdata.iov);
2123 g_free(ctx->mdata.bounce);
2124
2125 g_free(ctx);
2126
2127 nvme_enqueue_req_completion(nvme_cq(req), req);
2128}
2129
2130static void nvme_compare_data_cb(void *opaque, int ret)
2131{
2132 NvmeRequest *req = opaque;
2133 NvmeCtrl *n = nvme_ctrl(req);
2134 NvmeNamespace *ns = req->ns;
2135 BlockBackend *blk = ns->blkconf.blk;
2136 BlockAcctCookie *acct = &req->acct;
2137 BlockAcctStats *stats = blk_get_stats(blk);
2138
2139 struct nvme_compare_ctx *ctx = req->opaque;
2140 g_autofree uint8_t *buf = NULL;
2141 uint16_t status;
2142
2143 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2144
2145 if (ret) {
2146 block_acct_failed(stats, acct);
2147 nvme_aio_err(req, ret);
2148 goto out;
2149 }
2150
2151 buf = g_malloc(ctx->data.iov.size);
2152
2153 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2154 NVME_TX_DIRECTION_TO_DEVICE, req);
2155 if (status) {
2156 req->status = status;
2157 goto out;
2158 }
2159
2160 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2161 req->status = NVME_CMP_FAILURE;
2162 goto out;
2163 }
2164
2165 if (ns->lbaf.ms) {
2166 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2167 uint64_t slba = le64_to_cpu(rw->slba);
2168 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2169 size_t mlen = nvme_m2b(ns, nlb);
2170 uint64_t offset = nvme_moff(ns, slba);
2171
2172 ctx->mdata.bounce = g_malloc(mlen);
2173
2174 qemu_iovec_init(&ctx->mdata.iov, 1);
2175 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2176
2177 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2178 nvme_compare_mdata_cb, req);
2179 return;
2180 }
2181
2182 block_acct_done(stats, acct);
2183
2184out:
2185 qemu_iovec_destroy(&ctx->data.iov);
2186 g_free(ctx->data.bounce);
2187 g_free(ctx);
2188
2189 nvme_enqueue_req_completion(nvme_cq(req), req);
2190}
2191
2192typedef struct NvmeDSMAIOCB {
2193 BlockAIOCB common;
2194 BlockAIOCB *aiocb;
2195 NvmeRequest *req;
2196 QEMUBH *bh;
2197 int ret;
2198
2199 NvmeDsmRange *range;
2200 unsigned int nr;
2201 unsigned int idx;
2202} NvmeDSMAIOCB;
2203
2204static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2205{
2206 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2207
    /* break nvme_dsm_cb loop */
2209 iocb->idx = iocb->nr;
2210 iocb->ret = -ECANCELED;
2211
2212 if (iocb->aiocb) {
2213 blk_aio_cancel_async(iocb->aiocb);
2214 iocb->aiocb = NULL;
2215 } else {
        /*
         * We only get here if nvme_dsm_cancel() has already been called or
         * the command ran to completion and nvme_dsm_bh is about to run.
         */
2220 assert(iocb->idx == iocb->nr);
2221 }
2222}
2223
2224static const AIOCBInfo nvme_dsm_aiocb_info = {
2225 .aiocb_size = sizeof(NvmeDSMAIOCB),
2226 .cancel_async = nvme_dsm_cancel,
2227};
2228
2229static void nvme_dsm_bh(void *opaque)
2230{
2231 NvmeDSMAIOCB *iocb = opaque;
2232
2233 iocb->common.cb(iocb->common.opaque, iocb->ret);
2234
2235 qemu_bh_delete(iocb->bh);
2236 iocb->bh = NULL;
2237 qemu_aio_unref(iocb);
2238}
2239
2240static void nvme_dsm_cb(void *opaque, int ret);
2241
2242static void nvme_dsm_md_cb(void *opaque, int ret)
2243{
2244 NvmeDSMAIOCB *iocb = opaque;
2245 NvmeRequest *req = iocb->req;
2246 NvmeNamespace *ns = req->ns;
2247 NvmeDsmRange *range;
2248 uint64_t slba;
2249 uint32_t nlb;
2250
2251 if (ret < 0) {
2252 iocb->ret = ret;
2253 goto done;
2254 }
2255
2256 if (!ns->lbaf.ms) {
2257 nvme_dsm_cb(iocb, 0);
2258 return;
2259 }
2260
2261 range = &iocb->range[iocb->idx - 1];
2262 slba = le64_to_cpu(range->slba);
2263 nlb = le32_to_cpu(range->nlb);
2264
    /*
     * Check that all the data blocks in the range have actually been
     * deallocated (i.e. read back as zeroes); if not, skip zeroing out the
     * associated metadata as well.
     */
2270 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2271 if (ret) {
2272 if (ret < 0) {
2273 iocb->ret = ret;
2274 goto done;
2275 }
2276
        nvme_dsm_cb(iocb, 0);
        return;
    }
2279
2280 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2281 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2282 nvme_dsm_cb, iocb);
2283 return;
2284
2285done:
2286 iocb->aiocb = NULL;
2287 qemu_bh_schedule(iocb->bh);
2288}
2289
2290static void nvme_dsm_cb(void *opaque, int ret)
2291{
2292 NvmeDSMAIOCB *iocb = opaque;
2293 NvmeRequest *req = iocb->req;
2294 NvmeCtrl *n = nvme_ctrl(req);
2295 NvmeNamespace *ns = req->ns;
2296 NvmeDsmRange *range;
2297 uint64_t slba;
2298 uint32_t nlb;
2299
2300 if (ret < 0) {
2301 iocb->ret = ret;
2302 goto done;
2303 }
2304
2305next:
2306 if (iocb->idx == iocb->nr) {
2307 goto done;
2308 }
2309
2310 range = &iocb->range[iocb->idx++];
2311 slba = le64_to_cpu(range->slba);
2312 nlb = le32_to_cpu(range->nlb);
2313
2314 trace_pci_nvme_dsm_deallocate(slba, nlb);
2315
2316 if (nlb > n->dmrsl) {
2317 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2318 goto next;
2319 }
2320
2321 if (nvme_check_bounds(ns, slba, nlb)) {
2322 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2323 ns->id_ns.nsze);
2324 goto next;
2325 }
2326
2327 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2328 nvme_l2b(ns, nlb),
2329 nvme_dsm_md_cb, iocb);
2330 return;
2331
2332done:
2333 iocb->aiocb = NULL;
2334 qemu_bh_schedule(iocb->bh);
2335}
2336
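/*
 * Dataset Management. If the Deallocate attribute is set, discard each range
 * asynchronously via an AIOCB chain (nvme_dsm_cb / nvme_dsm_md_cb), zeroing
 * the associated metadata as well.
 */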
2337static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2338{
2339 NvmeNamespace *ns = req->ns;
2340 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2341 uint32_t attr = le32_to_cpu(dsm->attributes);
2342 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2343 uint16_t status = NVME_SUCCESS;
2344
2345 trace_pci_nvme_dsm(nr, attr);
2346
2347 if (attr & NVME_DSMGMT_AD) {
2348 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2349 nvme_misc_cb, req);
2350
2351 iocb->req = req;
2352 iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2353 iocb->ret = 0;
2354 iocb->range = g_new(NvmeDsmRange, nr);
2355 iocb->nr = nr;
2356 iocb->idx = 0;
2357
2358 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2359 req);
2360 if (status) {
2361 return status;
2362 }
2363
2364 req->aiocb = &iocb->common;
2365 nvme_dsm_cb(iocb, 0);
2366
2367 return NVME_NO_COMPLETE;
2368 }
2369
2370 return status;
2371}
2372
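/*
 * Verify: read the data (and metadata) into bounce buffers and run the
 * end-to-end protection checks without transferring anything to the host.
 */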
2373static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2374{
2375 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2376 NvmeNamespace *ns = req->ns;
2377 BlockBackend *blk = ns->blkconf.blk;
2378 uint64_t slba = le64_to_cpu(rw->slba);
2379 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2380 size_t len = nvme_l2b(ns, nlb);
2381 int64_t offset = nvme_l2b(ns, slba);
2382 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2383 uint32_t reftag = le32_to_cpu(rw->reftag);
2384 NvmeBounceContext *ctx = NULL;
2385 uint16_t status;
2386
2387 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2388
2389 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2390 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2391 if (status) {
2392 return status;
2393 }
2394
2395 if (prinfo & NVME_PRINFO_PRACT) {
2396 return NVME_INVALID_PROT_INFO | NVME_DNR;
2397 }
2398 }
2399
2400 if (len > n->page_size << n->params.vsl) {
2401 return NVME_INVALID_FIELD | NVME_DNR;
2402 }
2403
2404 status = nvme_check_bounds(ns, slba, nlb);
2405 if (status) {
2406 return status;
2407 }
2408
2409 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2410 status = nvme_check_dulbe(ns, slba, nlb);
2411 if (status) {
2412 return status;
2413 }
2414 }
2415
2416 ctx = g_new0(NvmeBounceContext, 1);
2417 ctx->req = req;
2418
2419 ctx->data.bounce = g_malloc(len);
2420
2421 qemu_iovec_init(&ctx->data.iov, 1);
2422 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2423
2424 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2425 BLOCK_ACCT_READ);
2426
2427 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2428 nvme_verify_mdata_in_cb, ctx);
2429 return NVME_NO_COMPLETE;
2430}
2431
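/*
 * State for the Copy command. The copy is driven as a chain of completion
 * callbacks, one source range at a time: nvme_copy_cb() validates the range
 * and reads its data into the bounce buffer, nvme_copy_in_cb() reads the
 * metadata (if any), nvme_copy_in_completed_cb() performs protection
 * information and zone checks and writes the data to the destination,
 * nvme_copy_out_cb() writes the metadata and nvme_copy_out_completed_cb()
 * advances to the next range. nvme_copy_bh() completes the request.
 */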
2432typedef struct NvmeCopyAIOCB {
2433 BlockAIOCB common;
2434 BlockAIOCB *aiocb;
2435 NvmeRequest *req;
2436 QEMUBH *bh;
2437 int ret;
2438
2439 NvmeCopySourceRange *ranges;
2440 int nr;
2441 int idx;
2442
2443 uint8_t *bounce;
2444 QEMUIOVector iov;
2445 struct {
2446 BlockAcctCookie read;
2447 BlockAcctCookie write;
2448 } acct;
2449
2450 uint32_t reftag;
2451 uint64_t slba;
2452
2453 NvmeZone *zone;
2454} NvmeCopyAIOCB;
2455
2456static void nvme_copy_cancel(BlockAIOCB *aiocb)
2457{
2458 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2459
2460 iocb->ret = -ECANCELED;
2461
2462 if (iocb->aiocb) {
2463 blk_aio_cancel_async(iocb->aiocb);
2464 iocb->aiocb = NULL;
2465 }
2466}
2467
2468static const AIOCBInfo nvme_copy_aiocb_info = {
2469 .aiocb_size = sizeof(NvmeCopyAIOCB),
2470 .cancel_async = nvme_copy_cancel,
2471};
2472
2473static void nvme_copy_bh(void *opaque)
2474{
2475 NvmeCopyAIOCB *iocb = opaque;
2476 NvmeRequest *req = iocb->req;
2477 NvmeNamespace *ns = req->ns;
2478 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2479
2480 if (iocb->idx != iocb->nr) {
2481 req->cqe.result = cpu_to_le32(iocb->idx);
2482 }
2483
2484 qemu_iovec_destroy(&iocb->iov);
2485 g_free(iocb->bounce);
2486
2487 qemu_bh_delete(iocb->bh);
2488 iocb->bh = NULL;
2489
2490 if (iocb->ret < 0) {
2491 block_acct_failed(stats, &iocb->acct.read);
2492 block_acct_failed(stats, &iocb->acct.write);
2493 } else {
2494 block_acct_done(stats, &iocb->acct.read);
2495 block_acct_done(stats, &iocb->acct.write);
2496 }
2497
2498 iocb->common.cb(iocb->common.opaque, iocb->ret);
2499 qemu_aio_unref(iocb);
2500}
2501
2502static void nvme_copy_cb(void *opaque, int ret);
2503
2504static void nvme_copy_out_completed_cb(void *opaque, int ret)
2505{
2506 NvmeCopyAIOCB *iocb = opaque;
2507 NvmeRequest *req = iocb->req;
2508 NvmeNamespace *ns = req->ns;
2509 NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
2510 uint32_t nlb = le32_to_cpu(range->nlb) + 1;
2511
2512 if (ret < 0) {
2513 iocb->ret = ret;
2514 goto out;
2515 } else if (iocb->ret < 0) {
2516 goto out;
2517 }
2518
2519 if (ns->params.zoned) {
2520 nvme_advance_zone_wp(ns, iocb->zone, nlb);
2521 }
2522
2523 iocb->idx++;
2524 iocb->slba += nlb;
2525out:
2526 nvme_copy_cb(iocb, iocb->ret);
2527}
2528
2529static void nvme_copy_out_cb(void *opaque, int ret)
2530{
2531 NvmeCopyAIOCB *iocb = opaque;
2532 NvmeRequest *req = iocb->req;
2533 NvmeNamespace *ns = req->ns;
2534 NvmeCopySourceRange *range;
2535 uint32_t nlb;
2536 size_t mlen;
2537 uint8_t *mbounce;
2538
2539 if (ret < 0) {
2540 iocb->ret = ret;
2541 goto out;
2542 } else if (iocb->ret < 0) {
2543 goto out;
2544 }
2545
2546 if (!ns->lbaf.ms) {
2547 nvme_copy_out_completed_cb(iocb, 0);
2548 return;
2549 }
2550
2551 range = &iocb->ranges[iocb->idx];
2552 nlb = le32_to_cpu(range->nlb) + 1;
2553
2554 mlen = nvme_m2b(ns, nlb);
2555 mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2556
2557 qemu_iovec_reset(&iocb->iov);
2558 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2559
2560 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2561 &iocb->iov, 0, nvme_copy_out_completed_cb,
2562 iocb);
2563
2564 return;
2565
2566out:
2567 nvme_copy_cb(iocb, ret);
2568}
2569
2570static void nvme_copy_in_completed_cb(void *opaque, int ret)
2571{
2572 NvmeCopyAIOCB *iocb = opaque;
2573 NvmeRequest *req = iocb->req;
2574 NvmeNamespace *ns = req->ns;
2575 NvmeCopySourceRange *range;
2576 uint32_t nlb;
2577 size_t len;
2578 uint16_t status;
2579
2580 if (ret < 0) {
2581 iocb->ret = ret;
2582 goto out;
2583 } else if (iocb->ret < 0) {
2584 goto out;
2585 }
2586
2587 range = &iocb->ranges[iocb->idx];
2588 nlb = le32_to_cpu(range->nlb) + 1;
2589 len = nvme_l2b(ns, nlb);
2590
2591 trace_pci_nvme_copy_out(iocb->slba, nlb);
2592
2593 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2594 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2595
2596 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2597 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2598
2599 uint16_t apptag = le16_to_cpu(range->apptag);
2600 uint16_t appmask = le16_to_cpu(range->appmask);
2601 uint32_t reftag = le32_to_cpu(range->reftag);
2602
2603 uint64_t slba = le64_to_cpu(range->slba);
2604 size_t mlen = nvme_m2b(ns, nlb);
2605 uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2606
2607 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2608 slba, apptag, appmask, &reftag);
2609 if (status) {
2610 goto invalid;
2611 }
2612
2613 apptag = le16_to_cpu(copy->apptag);
2614 appmask = le16_to_cpu(copy->appmask);
2615
2616 if (prinfow & NVME_PRINFO_PRACT) {
2617 status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2618 if (status) {
2619 goto invalid;
2620 }
2621
2622 nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2623 apptag, &iocb->reftag);
2624 } else {
2625 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2626 prinfow, iocb->slba, apptag, appmask,
2627 &iocb->reftag);
2628 if (status) {
2629 goto invalid;
2630 }
2631 }
2632 }
2633
2634 status = nvme_check_bounds(ns, iocb->slba, nlb);
2635 if (status) {
2636 goto invalid;
2637 }
2638
2639 if (ns->params.zoned) {
2640 status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2641 if (status) {
2642 goto invalid;
2643 }
2644
2645 iocb->zone->w_ptr += nlb;
2646 }
2647
2648 qemu_iovec_reset(&iocb->iov);
2649 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2650
2651 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
2652 &iocb->iov, 0, nvme_copy_out_cb, iocb);
2653
2654 return;
2655
2656invalid:
2657 req->status = status;
2658 iocb->aiocb = NULL;
2659 if (iocb->bh) {
2660 qemu_bh_schedule(iocb->bh);
2661 }
2662
2663 return;
2664
2665out:
2666 nvme_copy_cb(iocb, ret);
2667}
2668
2669static void nvme_copy_in_cb(void *opaque, int ret)
2670{
2671 NvmeCopyAIOCB *iocb = opaque;
2672 NvmeRequest *req = iocb->req;
2673 NvmeNamespace *ns = req->ns;
2674 NvmeCopySourceRange *range;
2675 uint64_t slba;
2676 uint32_t nlb;
2677
2678 if (ret < 0) {
2679 iocb->ret = ret;
2680 goto out;
2681 } else if (iocb->ret < 0) {
2682 goto out;
2683 }
2684
2685 if (!ns->lbaf.ms) {
2686 nvme_copy_in_completed_cb(iocb, 0);
2687 return;
2688 }
2689
2690 range = &iocb->ranges[iocb->idx];
2691 slba = le64_to_cpu(range->slba);
2692 nlb = le32_to_cpu(range->nlb) + 1;
2693
2694 qemu_iovec_reset(&iocb->iov);
2695 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
2696 nvme_m2b(ns, nlb));
2697
2698 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
2699 &iocb->iov, 0, nvme_copy_in_completed_cb,
2700 iocb);
2701 return;
2702
2703out:
2704 nvme_copy_cb(iocb, iocb->ret);
2705}
2706
2707static void nvme_copy_cb(void *opaque, int ret)
2708{
2709 NvmeCopyAIOCB *iocb = opaque;
2710 NvmeRequest *req = iocb->req;
2711 NvmeNamespace *ns = req->ns;
2712 NvmeCopySourceRange *range;
2713 uint64_t slba;
2714 uint32_t nlb;
2715 size_t len;
2716 uint16_t status;
2717
2718 if (ret < 0) {
2719 iocb->ret = ret;
2720 goto done;
2721 } else if (iocb->ret < 0) {
2722 goto done;
2723 }
2724
2725 if (iocb->idx == iocb->nr) {
2726 goto done;
2727 }
2728
2729 range = &iocb->ranges[iocb->idx];
2730 slba = le64_to_cpu(range->slba);
2731 nlb = le32_to_cpu(range->nlb) + 1;
2732 len = nvme_l2b(ns, nlb);
2733
2734 trace_pci_nvme_copy_source_range(slba, nlb);
2735
2736 if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2737 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2738 goto invalid;
2739 }
2740
2741 status = nvme_check_bounds(ns, slba, nlb);
2742 if (status) {
2743 goto invalid;
2744 }
2745
2746 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2747 status = nvme_check_dulbe(ns, slba, nlb);
2748 if (status) {
2749 goto invalid;
2750 }
2751 }
2752
2753 if (ns->params.zoned) {
2754 status = nvme_check_zone_read(ns, slba, nlb);
2755 if (status) {
2756 goto invalid;
2757 }
2758 }
2759
2760 qemu_iovec_reset(&iocb->iov);
2761 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2762
2763 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
2764 &iocb->iov, 0, nvme_copy_in_cb, iocb);
2765 return;
2766
2767invalid:
2768 req->status = status;
2769done:
2770 iocb->aiocb = NULL;
2771 if (iocb->bh) {
2772 qemu_bh_schedule(iocb->bh);
2773 }
2774}
2775
2777static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2778{
2779 NvmeNamespace *ns = req->ns;
2780 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2781 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
2782 nvme_misc_cb, req);
2783 uint16_t nr = copy->nr + 1;
2784 uint8_t format = copy->control[0] & 0xf;
2785 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2786 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2787
2788 uint16_t status;
2789
2790 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2791
2792 iocb->ranges = NULL;
2793 iocb->zone = NULL;
2794
2795 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2796 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
2797 status = NVME_INVALID_FIELD | NVME_DNR;
2798 goto invalid;
2799 }
2800
2801 if (!(n->id_ctrl.ocfs & (1 << format))) {
2802 trace_pci_nvme_err_copy_invalid_format(format);
2803 status = NVME_INVALID_FIELD | NVME_DNR;
2804 goto invalid;
2805 }
2806
2807 if (nr > ns->id_ns.msrc + 1) {
2808 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2809 goto invalid;
2810 }
2811
2812 iocb->ranges = g_new(NvmeCopySourceRange, nr);
2813
2814 status = nvme_h2c(n, (uint8_t *)iocb->ranges,
2815 sizeof(NvmeCopySourceRange) * nr, req);
2816 if (status) {
2817 goto invalid;
2818 }
2819
2820 iocb->slba = le64_to_cpu(copy->sdlba);
2821
2822 if (ns->params.zoned) {
2823 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
2824 if (!iocb->zone) {
2825 status = NVME_LBA_RANGE | NVME_DNR;
2826 goto invalid;
2827 }
2828
2829 status = nvme_zrm_auto(n, ns, iocb->zone);
2830 if (status) {
2831 goto invalid;
2832 }
2833 }
2834
2835 iocb->req = req;
2836 iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
2837 iocb->ret = 0;
2838 iocb->nr = nr;
2839 iocb->idx = 0;
2840 iocb->reftag = le32_to_cpu(copy->reftag);
2841 iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
2842 ns->lbasz + ns->lbaf.ms);
2843
2844 qemu_iovec_init(&iocb->iov, 1);
2845
2846 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
2847 BLOCK_ACCT_READ);
2848 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
2849 BLOCK_ACCT_WRITE);
2850
2851 req->aiocb = &iocb->common;
2852 nvme_copy_cb(iocb, 0);
2853
2854 return NVME_NO_COMPLETE;
2855
2856invalid:
2857 g_free(iocb->ranges);
2858 qemu_aio_unref(iocb);
2859 return status;
2860}
2861
2862static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2863{
2864 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2865 NvmeNamespace *ns = req->ns;
2866 BlockBackend *blk = ns->blkconf.blk;
2867 uint64_t slba = le64_to_cpu(rw->slba);
2868 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2869 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2870 size_t data_len = nvme_l2b(ns, nlb);
2871 size_t len = data_len;
2872 int64_t offset = nvme_l2b(ns, slba);
2873 struct nvme_compare_ctx *ctx = NULL;
2874 uint16_t status;
2875
2876 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2877
2878 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
2879 return NVME_INVALID_PROT_INFO | NVME_DNR;
2880 }
2881
2882 if (nvme_ns_ext(ns)) {
2883 len += nvme_m2b(ns, nlb);
2884 }
2885
2886 status = nvme_check_mdts(n, len);
2887 if (status) {
2888 return status;
2889 }
2890
2891 status = nvme_check_bounds(ns, slba, nlb);
2892 if (status) {
2893 return status;
2894 }
2895
2896 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2897 status = nvme_check_dulbe(ns, slba, nlb);
2898 if (status) {
2899 return status;
2900 }
2901 }
2902
2903 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2904 if (status) {
2905 return status;
2906 }
2907
2908 ctx = g_new(struct nvme_compare_ctx, 1);
2909 ctx->data.bounce = g_malloc(data_len);
2910
2911 req->opaque = ctx;
2912
2913 qemu_iovec_init(&ctx->data.iov, 1);
2914 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2915
2916 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2917 BLOCK_ACCT_READ);
2918 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
2919 nvme_compare_data_cb, req);
2920
2921 return NVME_NO_COMPLETE;
2922}
2923
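/*
 * State for the Flush command. For the broadcast NSID, nvme_flush_bh() walks
 * all attached namespaces and nvme_flush_ns_cb() flushes them one by one;
 * otherwise only the single addressed namespace is flushed.
 */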
2924typedef struct NvmeFlushAIOCB {
2925 BlockAIOCB common;
2926 BlockAIOCB *aiocb;
2927 NvmeRequest *req;
2928 QEMUBH *bh;
2929 int ret;
2930
2931 NvmeNamespace *ns;
2932 uint32_t nsid;
2933 bool broadcast;
2934} NvmeFlushAIOCB;
2935
2936static void nvme_flush_cancel(BlockAIOCB *acb)
2937{
2938 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
2939
2940 iocb->ret = -ECANCELED;
2941
2942 if (iocb->aiocb) {
2943 blk_aio_cancel_async(iocb->aiocb);
2944 }
2945}
2946
2947static const AIOCBInfo nvme_flush_aiocb_info = {
2948 .aiocb_size = sizeof(NvmeFlushAIOCB),
2949 .cancel_async = nvme_flush_cancel,
2950 .get_aio_context = nvme_get_aio_context,
2951};
2952
2953static void nvme_flush_ns_cb(void *opaque, int ret)
2954{
2955 NvmeFlushAIOCB *iocb = opaque;
2956 NvmeNamespace *ns = iocb->ns;
2957
2958 if (ret < 0) {
2959 iocb->ret = ret;
2960 goto out;
2961 } else if (iocb->ret < 0) {
2962 goto out;
2963 }
2964
2965 if (ns) {
2966 trace_pci_nvme_flush_ns(iocb->nsid);
2967
2968 iocb->ns = NULL;
2969 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
2970 return;
2971 }
2972
2973out:
2974 iocb->aiocb = NULL;
2975 qemu_bh_schedule(iocb->bh);
2976}
2977
2978static void nvme_flush_bh(void *opaque)
2979{
2980 NvmeFlushAIOCB *iocb = opaque;
2981 NvmeRequest *req = iocb->req;
2982 NvmeCtrl *n = nvme_ctrl(req);
2983 int i;
2984
2985 if (iocb->ret < 0) {
2986 goto done;
2987 }
2988
2989 if (iocb->broadcast) {
2990 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
2991 iocb->ns = nvme_ns(n, i);
2992 if (iocb->ns) {
2993 iocb->nsid = i;
2994 break;
2995 }
2996 }
2997 }
2998
2999 if (!iocb->ns) {
3000 goto done;
3001 }
3002
3003 nvme_flush_ns_cb(iocb, 0);
3004 return;
3005
3006done:
3007 qemu_bh_delete(iocb->bh);
3008 iocb->bh = NULL;
3009
3010 iocb->common.cb(iocb->common.opaque, iocb->ret);
3011
3012 qemu_aio_unref(iocb);
3013
3014 return;
3015}
3016
3017static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3018{
3019 NvmeFlushAIOCB *iocb;
3020 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3021 uint16_t status;
3022
3023 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3024
3025 iocb->req = req;
3026 iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
3027 iocb->ret = 0;
3028 iocb->ns = NULL;
3029 iocb->nsid = 0;
3030 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3031
3032 if (!iocb->broadcast) {
3033 if (!nvme_nsid_valid(n, nsid)) {
3034 status = NVME_INVALID_NSID | NVME_DNR;
3035 goto out;
3036 }
3037
3038 iocb->ns = nvme_ns(n, nsid);
3039 if (!iocb->ns) {
3040 status = NVME_INVALID_FIELD | NVME_DNR;
3041 goto out;
3042 }
3043
3044 iocb->nsid = nsid;
3045 }
3046
3047 req->aiocb = &iocb->common;
3048 qemu_bh_schedule(iocb->bh);
3049
3050 return NVME_NO_COMPLETE;
3051
3052out:
3053 qemu_bh_delete(iocb->bh);
3054 iocb->bh = NULL;
3055 qemu_aio_unref(iocb);
3056
3057 return status;
3058}
3059
3060static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3061{
3062 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3063 NvmeNamespace *ns = req->ns;
3064 uint64_t slba = le64_to_cpu(rw->slba);
3065 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3066 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3067 uint64_t data_size = nvme_l2b(ns, nlb);
3068 uint64_t mapped_size = data_size;
3069 uint64_t data_offset;
3070 BlockBackend *blk = ns->blkconf.blk;
3071 uint16_t status;
3072
3073 if (nvme_ns_ext(ns)) {
3074 mapped_size += nvme_m2b(ns, nlb);
3075
3076 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3077 bool pract = prinfo & NVME_PRINFO_PRACT;
3078
3079 if (pract && ns->lbaf.ms == 8) {
3080 mapped_size = data_size;
3081 }
3082 }
3083 }
3084
3085 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3086
3087 status = nvme_check_mdts(n, mapped_size);
3088 if (status) {
3089 goto invalid;
3090 }
3091
3092 status = nvme_check_bounds(ns, slba, nlb);
3093 if (status) {
3094 goto invalid;
3095 }
3096
3097 if (ns->params.zoned) {
3098 status = nvme_check_zone_read(ns, slba, nlb);
3099 if (status) {
3100 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3101 goto invalid;
3102 }
3103 }
3104
3105 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3106 status = nvme_check_dulbe(ns, slba, nlb);
3107 if (status) {
3108 goto invalid;
3109 }
3110 }
3111
3112 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3113 return nvme_dif_rw(n, req);
3114 }
3115
3116 status = nvme_map_data(n, nlb, req);
3117 if (status) {
3118 goto invalid;
3119 }
3120
3121 data_offset = nvme_l2b(ns, slba);
3122
3123 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3124 BLOCK_ACCT_READ);
3125 nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
3126 return NVME_NO_COMPLETE;
3127
3128invalid:
3129 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3130 return status | NVME_DNR;
3131}
3132
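/*
 * Common write path for Write, Write Zeroes (wrz) and Zone Append (append).
 * For zoned namespaces this validates the write against the zone and advances
 * the write pointer; for appends it additionally rewrites the SLBA (and,
 * depending on the protection type, the reference tag) to the current write
 * pointer.
 */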
3133static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3134 bool wrz)
3135{
3136 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3137 NvmeNamespace *ns = req->ns;
3138 uint64_t slba = le64_to_cpu(rw->slba);
3139 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3140 uint16_t ctrl = le16_to_cpu(rw->control);
3141 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3142 uint64_t data_size = nvme_l2b(ns, nlb);
3143 uint64_t mapped_size = data_size;
3144 uint64_t data_offset;
3145 NvmeZone *zone;
3146 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3147 BlockBackend *blk = ns->blkconf.blk;
3148 uint16_t status;
3149
3150 if (nvme_ns_ext(ns)) {
3151 mapped_size += nvme_m2b(ns, nlb);
3152
3153 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3154 bool pract = prinfo & NVME_PRINFO_PRACT;
3155
3156 if (pract && ns->lbaf.ms == 8) {
3157 mapped_size -= nvme_m2b(ns, nlb);
3158 }
3159 }
3160 }
3161
3162 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3163 nvme_nsid(ns), nlb, mapped_size, slba);
3164
3165 if (!wrz) {
3166 status = nvme_check_mdts(n, mapped_size);
3167 if (status) {
3168 goto invalid;
3169 }
3170 }
3171
3172 status = nvme_check_bounds(ns, slba, nlb);
3173 if (status) {
3174 goto invalid;
3175 }
3176
3177 if (ns->params.zoned) {
3178 zone = nvme_get_zone_by_slba(ns, slba);
3179 assert(zone);
3180
3181 if (append) {
3182 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3183
3184 if (unlikely(slba != zone->d.zslba)) {
3185 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3186 status = NVME_INVALID_FIELD;
3187 goto invalid;
3188 }
3189
3190 if (n->params.zasl &&
3191 data_size > (uint64_t)n->page_size << n->params.zasl) {
3192 trace_pci_nvme_err_zasl(data_size);
3193 return NVME_INVALID_FIELD | NVME_DNR;
3194 }
3195
3196 slba = zone->w_ptr;
3197 rw->slba = cpu_to_le64(slba);
3198 res->slba = cpu_to_le64(slba);
3199
3200 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3201 case NVME_ID_NS_DPS_TYPE_1:
3202 if (!piremap) {
3203 return NVME_INVALID_PROT_INFO | NVME_DNR;
3204 }
3205
3206                /* fallthrough */
3207
3208 case NVME_ID_NS_DPS_TYPE_2:
3209 if (piremap) {
3210 uint32_t reftag = le32_to_cpu(rw->reftag);
3211 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3212 }
3213
3214 break;
3215
3216 case NVME_ID_NS_DPS_TYPE_3:
3217 if (piremap) {
3218 return NVME_INVALID_PROT_INFO | NVME_DNR;
3219 }
3220
3221 break;
3222 }
3223 }
3224
3225 status = nvme_check_zone_write(ns, zone, slba, nlb);
3226 if (status) {
3227 goto invalid;
3228 }
3229
3230 status = nvme_zrm_auto(n, ns, zone);
3231 if (status) {
3232 goto invalid;
3233 }
3234
3235 zone->w_ptr += nlb;
3236 }
3237
3238 data_offset = nvme_l2b(ns, slba);
3239
3240 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3241 return nvme_dif_rw(n, req);
3242 }
3243
3244 if (!wrz) {
3245 status = nvme_map_data(n, nlb, req);
3246 if (status) {
3247 goto invalid;
3248 }
3249
3250 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3251 BLOCK_ACCT_WRITE);
3252 nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3253 } else {
3254 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3255 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3256 req);
3257 }
3258
3259 return NVME_NO_COMPLETE;
3260
3261invalid:
3262 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3263 return status | NVME_DNR;
3264}
3265
3266static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3267{
3268 return nvme_do_write(n, req, false, false);
3269}
3270
3271static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3272{
3273 return nvme_do_write(n, req, false, true);
3274}
3275
3276static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3277{
3278 return nvme_do_write(n, req, true, false);
3279}
3280
3281static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3282 uint64_t *slba, uint32_t *zone_idx)
3283{
3284 uint32_t dw10 = le32_to_cpu(c->cdw10);
3285 uint32_t dw11 = le32_to_cpu(c->cdw11);
3286
3287 if (!ns->params.zoned) {
3288 trace_pci_nvme_err_invalid_opc(c->opcode);
3289 return NVME_INVALID_OPCODE | NVME_DNR;
3290 }
3291
3292 *slba = ((uint64_t)dw11) << 32 | dw10;
3293 if (unlikely(*slba >= ns->id_ns.nsze)) {
3294 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3295 *slba = 0;
3296 return NVME_LBA_RANGE | NVME_DNR;
3297 }
3298
3299 *zone_idx = nvme_zone_idx(ns, *slba);
3300 assert(*zone_idx < ns->num_zones);
3301
3302 return NVME_SUCCESS;
3303}
3304
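/*
 * Zone Management Send actions are applied either to a single zone or, when
 * the "all" bit is set, to every zone whose state matches the processing mask
 * below (see nvme_do_zone_op() and nvme_bulk_proc_zone()).
 */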
3305typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3306 NvmeRequest *);
3307
3308enum NvmeZoneProcessingMask {
3309 NVME_PROC_CURRENT_ZONE = 0,
3310 NVME_PROC_OPENED_ZONES = 1 << 0,
3311 NVME_PROC_CLOSED_ZONES = 1 << 1,
3312 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3313 NVME_PROC_FULL_ZONES = 1 << 3,
3314};
3315
3316static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3317 NvmeZoneState state, NvmeRequest *req)
3318{
3319 return nvme_zrm_open(nvme_ctrl(req), ns, zone);
3320}
3321
3322static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3323 NvmeZoneState state, NvmeRequest *req)
3324{
3325 return nvme_zrm_close(ns, zone);
3326}
3327
3328static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3329 NvmeZoneState state, NvmeRequest *req)
3330{
3331 return nvme_zrm_finish(ns, zone);
3332}
3333
3334static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3335 NvmeZoneState state, NvmeRequest *req)
3336{
3337 switch (state) {
3338 case NVME_ZONE_STATE_READ_ONLY:
3339 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3340        /* fall through */
3341 case NVME_ZONE_STATE_OFFLINE:
3342 return NVME_SUCCESS;
3343 default:
3344 return NVME_ZONE_INVAL_TRANSITION;
3345 }
3346}
3347
3348static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3349{
3350 uint16_t status;
3351 uint8_t state = nvme_get_zone_state(zone);
3352
3353 if (state == NVME_ZONE_STATE_EMPTY) {
3354 status = nvme_aor_check(ns, 1, 0);
3355 if (status) {
3356 return status;
3357 }
3358 nvme_aor_inc_active(ns);
3359 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3360 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3361 return NVME_SUCCESS;
3362 }
3363
3364 return NVME_ZONE_INVAL_TRANSITION;
3365}
3366
3367static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3368 enum NvmeZoneProcessingMask proc_mask,
3369 op_handler_t op_hndlr, NvmeRequest *req)
3370{
3371 uint16_t status = NVME_SUCCESS;
3372 NvmeZoneState zs = nvme_get_zone_state(zone);
3373 bool proc_zone;
3374
3375 switch (zs) {
3376 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3377 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3378 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3379 break;
3380 case NVME_ZONE_STATE_CLOSED:
3381 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3382 break;
3383 case NVME_ZONE_STATE_READ_ONLY:
3384 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3385 break;
3386 case NVME_ZONE_STATE_FULL:
3387 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3388 break;
3389 default:
3390 proc_zone = false;
3391 }
3392
3393 if (proc_zone) {
3394 status = op_hndlr(ns, zone, zs, req);
3395 }
3396
3397 return status;
3398}
3399
3400static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3401 enum NvmeZoneProcessingMask proc_mask,
3402 op_handler_t op_hndlr, NvmeRequest *req)
3403{
3404 NvmeZone *next;
3405 uint16_t status = NVME_SUCCESS;
3406 int i;
3407
3408 if (!proc_mask) {
3409 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3410 } else {
3411 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3412 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3413 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3414 req);
3415 if (status && status != NVME_NO_COMPLETE) {
3416 goto out;
3417 }
3418 }
3419 }
3420 if (proc_mask & NVME_PROC_OPENED_ZONES) {
3421 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3422 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3423 req);
3424 if (status && status != NVME_NO_COMPLETE) {
3425 goto out;
3426 }
3427 }
3428
3429 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3430 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3431 req);
3432 if (status && status != NVME_NO_COMPLETE) {
3433 goto out;
3434 }
3435 }
3436 }
3437 if (proc_mask & NVME_PROC_FULL_ZONES) {
3438 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3439 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3440 req);
3441 if (status && status != NVME_NO_COMPLETE) {
3442 goto out;
3443 }
3444 }
3445 }
3446
3447 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3448 for (i = 0; i < ns->num_zones; i++, zone++) {
3449 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3450 req);
3451 if (status && status != NVME_NO_COMPLETE) {
3452 goto out;
3453 }
3454 }
3455 }
3456 }
3457
3458out:
3459 return status;
3460}
3461
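/*
 * State for Zone Management Send / Reset Zone. nvme_zone_reset_cb() picks the
 * next zone to reset (the addressed zone, or every resettable zone when the
 * "all" bit is set) and write-zeroes its data; nvme_zone_reset_epilogue_cb()
 * then write-zeroes the zone metadata, and the zone state is reset with
 * nvme_zrm_reset() on the next iteration.
 */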
3462typedef struct NvmeZoneResetAIOCB {
3463 BlockAIOCB common;
3464 BlockAIOCB *aiocb;
3465 NvmeRequest *req;
3466 QEMUBH *bh;
3467 int ret;
3468
3469 bool all;
3470 int idx;
3471 NvmeZone *zone;
3472} NvmeZoneResetAIOCB;
3473
3474static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3475{
3476 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3477 NvmeRequest *req = iocb->req;
3478 NvmeNamespace *ns = req->ns;
3479
3480 iocb->idx = ns->num_zones;
3481
3482 iocb->ret = -ECANCELED;
3483
3484 if (iocb->aiocb) {
3485 blk_aio_cancel_async(iocb->aiocb);
3486 iocb->aiocb = NULL;
3487 }
3488}
3489
3490static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3491 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3492 .cancel_async = nvme_zone_reset_cancel,
3493};
3494
3495static void nvme_zone_reset_bh(void *opaque)
3496{
3497 NvmeZoneResetAIOCB *iocb = opaque;
3498
3499 iocb->common.cb(iocb->common.opaque, iocb->ret);
3500
3501 qemu_bh_delete(iocb->bh);
3502 iocb->bh = NULL;
3503 qemu_aio_unref(iocb);
3504}
3505
3506static void nvme_zone_reset_cb(void *opaque, int ret);
3507
3508static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3509{
3510 NvmeZoneResetAIOCB *iocb = opaque;
3511 NvmeRequest *req = iocb->req;
3512 NvmeNamespace *ns = req->ns;
3513 int64_t moff;
3514 int count;
3515
3516 if (ret < 0) {
3517 nvme_zone_reset_cb(iocb, ret);
3518 return;
3519 }
3520
3521 if (!ns->lbaf.ms) {
3522 nvme_zone_reset_cb(iocb, 0);
3523 return;
3524 }
3525
3526 moff = nvme_moff(ns, iocb->zone->d.zslba);
3527 count = nvme_m2b(ns, ns->zone_size);
3528
3529 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3530 BDRV_REQ_MAY_UNMAP,
3531 nvme_zone_reset_cb, iocb);
3532 return;
3533}
3534
3535static void nvme_zone_reset_cb(void *opaque, int ret)
3536{
3537 NvmeZoneResetAIOCB *iocb = opaque;
3538 NvmeRequest *req = iocb->req;
3539 NvmeNamespace *ns = req->ns;
3540
3541 if (ret < 0) {
3542 iocb->ret = ret;
3543 goto done;
3544 }
3545
3546 if (iocb->zone) {
3547 nvme_zrm_reset(ns, iocb->zone);
3548
3549 if (!iocb->all) {
3550 goto done;
3551 }
3552 }
3553
3554 while (iocb->idx < ns->num_zones) {
3555 NvmeZone *zone = &ns->zone_array[iocb->idx++];
3556
3557 switch (nvme_get_zone_state(zone)) {
3558 case NVME_ZONE_STATE_EMPTY:
3559 if (!iocb->all) {
3560 goto done;
3561 }
3562
3563 continue;
3564
3565 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3566 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3567 case NVME_ZONE_STATE_CLOSED:
3568 case NVME_ZONE_STATE_FULL:
3569 iocb->zone = zone;
3570 break;
3571
3572 default:
3573 continue;
3574 }
3575
3576 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3577
3578 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3579 nvme_l2b(ns, zone->d.zslba),
3580 nvme_l2b(ns, ns->zone_size),
3581 BDRV_REQ_MAY_UNMAP,
3582 nvme_zone_reset_epilogue_cb,
3583 iocb);
3584 return;
3585 }
3586
3587done:
3588 iocb->aiocb = NULL;
3589 if (iocb->bh) {
3590 qemu_bh_schedule(iocb->bh);
3591 }
3592}
3593
3594static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3595{
3596 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3597 NvmeNamespace *ns = req->ns;
3598 NvmeZone *zone;
3599 NvmeZoneResetAIOCB *iocb;
3600 uint8_t *zd_ext;
3601 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3602 uint64_t slba = 0;
3603 uint32_t zone_idx = 0;
3604 uint16_t status;
3605 uint8_t action;
3606 bool all;
3607 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3608
3609 action = dw13 & 0xff;
3610 all = !!(dw13 & 0x100);
3611
3612 req->status = NVME_SUCCESS;
3613
3614 if (!all) {
3615 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3616 if (status) {
3617 return status;
3618 }
3619 }
3620
3621 zone = &ns->zone_array[zone_idx];
3622 if (slba != zone->d.zslba) {
3623 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3624 return NVME_INVALID_FIELD | NVME_DNR;
3625 }
3626
3627 switch (action) {
3628
3629 case NVME_ZONE_ACTION_OPEN:
3630 if (all) {
3631 proc_mask = NVME_PROC_CLOSED_ZONES;
3632 }
3633 trace_pci_nvme_open_zone(slba, zone_idx, all);
3634 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3635 break;
3636
3637 case NVME_ZONE_ACTION_CLOSE:
3638 if (all) {
3639 proc_mask = NVME_PROC_OPENED_ZONES;
3640 }
3641 trace_pci_nvme_close_zone(slba, zone_idx, all);
3642 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3643 break;
3644
3645 case NVME_ZONE_ACTION_FINISH:
3646 if (all) {
3647 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3648 }
3649 trace_pci_nvme_finish_zone(slba, zone_idx, all);
3650 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3651 break;
3652
3653 case NVME_ZONE_ACTION_RESET:
3654 trace_pci_nvme_reset_zone(slba, zone_idx, all);
3655
3656 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
3657 nvme_misc_cb, req);
3658
3659 iocb->req = req;
3660 iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
3661 iocb->ret = 0;
3662 iocb->all = all;
3663 iocb->idx = zone_idx;
3664 iocb->zone = NULL;
3665
3666 req->aiocb = &iocb->common;
3667 nvme_zone_reset_cb(iocb, 0);
3668
3669 return NVME_NO_COMPLETE;
3670
3671 case NVME_ZONE_ACTION_OFFLINE:
3672 if (all) {
3673 proc_mask = NVME_PROC_READ_ONLY_ZONES;
3674 }
3675 trace_pci_nvme_offline_zone(slba, zone_idx, all);
3676 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3677 break;
3678
3679 case NVME_ZONE_ACTION_SET_ZD_EXT:
3680 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3681 if (all || !ns->params.zd_extension_size) {
3682 return NVME_INVALID_FIELD | NVME_DNR;
3683 }
3684 zd_ext = nvme_get_zd_extension(ns, zone_idx);
3685 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3686 if (status) {
3687 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3688 return status;
3689 }
3690
3691 status = nvme_set_zd_ext(ns, zone);
3692 if (status == NVME_SUCCESS) {
3693 trace_pci_nvme_zd_extension_set(zone_idx);
3694 return status;
3695 }
3696 break;
3697
3698 default:
3699 trace_pci_nvme_err_invalid_mgmt_action(action);
3700 status = NVME_INVALID_FIELD;
3701 }
3702
3703 if (status == NVME_ZONE_INVAL_TRANSITION) {
3704 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3705 zone->d.za);
3706 }
3707 if (status) {
3708 status |= NVME_DNR;
3709 }
3710
3711 return status;
3712}
3713
3714static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3715{
3716 NvmeZoneState zs = nvme_get_zone_state(zl);
3717
3718 switch (zafs) {
3719 case NVME_ZONE_REPORT_ALL:
3720 return true;
3721 case NVME_ZONE_REPORT_EMPTY:
3722 return zs == NVME_ZONE_STATE_EMPTY;
3723 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3724 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3725 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3726 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3727 case NVME_ZONE_REPORT_CLOSED:
3728 return zs == NVME_ZONE_STATE_CLOSED;
3729 case NVME_ZONE_REPORT_FULL:
3730 return zs == NVME_ZONE_STATE_FULL;
3731 case NVME_ZONE_REPORT_READ_ONLY:
3732 return zs == NVME_ZONE_STATE_READ_ONLY;
3733 case NVME_ZONE_REPORT_OFFLINE:
3734 return zs == NVME_ZONE_STATE_OFFLINE;
3735 default:
3736 return false;
3737 }
3738}
3739
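/*
 * Zone Management Receive (Report Zones): returns a report header followed by
 * zone descriptors (optionally with zone descriptor extensions) for the zones
 * starting at the given SLBA that match the requested state filter.
 */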
3740static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3741{
3742 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3743 NvmeNamespace *ns = req->ns;
3744
3745 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
3746 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3747 uint32_t zone_idx, zra, zrasf, partial;
3748 uint64_t max_zones, nr_zones = 0;
3749 uint16_t status;
3750 uint64_t slba;
3751 NvmeZoneDescr *z;
3752 NvmeZone *zone;
3753 NvmeZoneReportHeader *header;
3754 void *buf, *buf_p;
3755 size_t zone_entry_sz;
3756 int i;
3757
3758 req->status = NVME_SUCCESS;
3759
3760 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3761 if (status) {
3762 return status;
3763 }
3764
3765 zra = dw13 & 0xff;
3766 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3767 return NVME_INVALID_FIELD | NVME_DNR;
3768 }
3769 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3770 return NVME_INVALID_FIELD | NVME_DNR;
3771 }
3772
3773 zrasf = (dw13 >> 8) & 0xff;
3774 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3775 return NVME_INVALID_FIELD | NVME_DNR;
3776 }
3777
3778 if (data_size < sizeof(NvmeZoneReportHeader)) {
3779 return NVME_INVALID_FIELD | NVME_DNR;
3780 }
3781
3782 status = nvme_check_mdts(n, data_size);
3783 if (status) {
3784 return status;
3785 }
3786
3787 partial = (dw13 >> 16) & 0x01;
3788
3789 zone_entry_sz = sizeof(NvmeZoneDescr);
3790 if (zra == NVME_ZONE_REPORT_EXTENDED) {
3791 zone_entry_sz += ns->params.zd_extension_size;
3792 }
3793
3794 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3795 buf = g_malloc0(data_size);
3796
3797 zone = &ns->zone_array[zone_idx];
3798 for (i = zone_idx; i < ns->num_zones; i++) {
3799 if (partial && nr_zones >= max_zones) {
3800 break;
3801 }
3802 if (nvme_zone_matches_filter(zrasf, zone++)) {
3803 nr_zones++;
3804 }
3805 }
3806 header = (NvmeZoneReportHeader *)buf;
3807 header->nr_zones = cpu_to_le64(nr_zones);
3808
3809 buf_p = buf + sizeof(NvmeZoneReportHeader);
3810 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3811 zone = &ns->zone_array[zone_idx];
3812 if (nvme_zone_matches_filter(zrasf, zone)) {
3813 z = (NvmeZoneDescr *)buf_p;
3814 buf_p += sizeof(NvmeZoneDescr);
3815
3816 z->zt = zone->d.zt;
3817 z->zs = zone->d.zs;
3818 z->zcap = cpu_to_le64(zone->d.zcap);
3819 z->zslba = cpu_to_le64(zone->d.zslba);
3820 z->za = zone->d.za;
3821
3822 if (nvme_wp_is_valid(zone)) {
3823 z->wp = cpu_to_le64(zone->d.wp);
3824 } else {
3825 z->wp = cpu_to_le64(~0ULL);
3826 }
3827
3828 if (zra == NVME_ZONE_REPORT_EXTENDED) {
3829 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3830 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3831 ns->params.zd_extension_size);
3832 }
3833 buf_p += ns->params.zd_extension_size;
3834 }
3835
3836 max_zones--;
3837 }
3838 }
3839
3840 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3841
3842 g_free(buf);
3843
3844 return status;
3845}
3846
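/*
 * Top-level I/O command dispatch: validates the NSID, checks that the opcode
 * is supported by the command set of the addressed namespace (ns->iocs) and
 * routes the request to the per-opcode handler.
 */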
3847static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
3848{
3849 NvmeNamespace *ns;
3850 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3851
3852 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
3853 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
3854
3855 if (!nvme_nsid_valid(n, nsid)) {
3856 return NVME_INVALID_NSID | NVME_DNR;
3857 }
3858
3859    /*
3860     * In the base NVM command set, Flush may be submitted with the broadcast
3861     * NSID (FFFFFFFFh) to apply to all namespaces. With multiple I/O command
3862     * sets enabled (TP 4056, Namespace Types), a broadcast NSID cannot be
3863     * associated with a single command set, so the opcode alone does not
3864     * identify a unique command. To be on the safe side, the broadcast
3865     * functionality of Flush is only supported for the NVM command set, and
3866     * Flush is therefore handled here, before the namespace is resolved.
3867     */
3878 if (req->cmd.opcode == NVME_CMD_FLUSH) {
3879 return nvme_flush(n, req);
3880 }
3881
3882 ns = nvme_ns(n, nsid);
3883 if (unlikely(!ns)) {
3884 return NVME_INVALID_FIELD | NVME_DNR;
3885 }
3886
3887 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
3888 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
3889 return NVME_INVALID_OPCODE | NVME_DNR;
3890 }
3891
3892 if (ns->status) {
3893 return ns->status;
3894 }
3895
3896 req->ns = ns;
3897
3898 switch (req->cmd.opcode) {
3899 case NVME_CMD_WRITE_ZEROES:
3900 return nvme_write_zeroes(n, req);
3901 case NVME_CMD_ZONE_APPEND:
3902 return nvme_zone_append(n, req);
3903 case NVME_CMD_WRITE:
3904 return nvme_write(n, req);
3905 case NVME_CMD_READ:
3906 return nvme_read(n, req);
3907 case NVME_CMD_COMPARE:
3908 return nvme_compare(n, req);
3909 case NVME_CMD_DSM:
3910 return nvme_dsm(n, req);
3911 case NVME_CMD_VERIFY:
3912 return nvme_verify(n, req);
3913 case NVME_CMD_COPY:
3914 return nvme_copy(n, req);
3915 case NVME_CMD_ZONE_MGMT_SEND:
3916 return nvme_zone_mgmt_send(n, req);
3917 case NVME_CMD_ZONE_MGMT_RECV:
3918 return nvme_zone_mgmt_recv(n, req);
3919 default:
3920 assert(false);
3921 }
3922
3923 return NVME_INVALID_OPCODE | NVME_DNR;
3924}
3925
3926static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
3927{
3928 n->sq[sq->sqid] = NULL;
3929 timer_free(sq->timer);
3930 g_free(sq->io_req);
3931 if (sq->sqid) {
3932 g_free(sq);
3933 }
3934}
3935
3936static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
3937{
3938 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3939 NvmeRequest *r, *next;
3940 NvmeSQueue *sq;
3941 NvmeCQueue *cq;
3942 uint16_t qid = le16_to_cpu(c->qid);
3943
3944 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
3945 trace_pci_nvme_err_invalid_del_sq(qid);
3946 return NVME_INVALID_QID | NVME_DNR;
3947 }
3948
3949 trace_pci_nvme_del_sq(qid);
3950
3951 sq = n->sq[qid];
3952 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
3953 r = QTAILQ_FIRST(&sq->out_req_list);
3954 assert(r->aiocb);
3955 blk_aio_cancel(r->aiocb);
3956 }
3957
3958 assert(QTAILQ_EMPTY(&sq->out_req_list));
3959
3960 if (!nvme_check_cqid(n, sq->cqid)) {
3961 cq = n->cq[sq->cqid];
3962 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
3963
3964 nvme_post_cqes(cq);
3965 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
3966 if (r->sq == sq) {
3967 QTAILQ_REMOVE(&cq->req_list, r, entry);
3968 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
3969 }
3970 }
3971 }
3972
3973 nvme_free_sq(sq, n);
3974 return NVME_SUCCESS;
3975}
3976
3977static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
3978 uint16_t sqid, uint16_t cqid, uint16_t size)
3979{
3980 int i;
3981 NvmeCQueue *cq;
3982
3983 sq->ctrl = n;
3984 sq->dma_addr = dma_addr;
3985 sq->sqid = sqid;
3986 sq->size = size;
3987 sq->cqid = cqid;
3988 sq->head = sq->tail = 0;
3989 sq->io_req = g_new0(NvmeRequest, sq->size);
3990
3991 QTAILQ_INIT(&sq->req_list);
3992 QTAILQ_INIT(&sq->out_req_list);
3993 for (i = 0; i < sq->size; i++) {
3994 sq->io_req[i].sq = sq;
3995 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
3996 }
3997 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
3998
3999 assert(n->cq[cqid]);
4000 cq = n->cq[cqid];
4001 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4002 n->sq[sqid] = sq;
4003}
4004
4005static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4006{
4007 NvmeSQueue *sq;
4008 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4009
4010 uint16_t cqid = le16_to_cpu(c->cqid);
4011 uint16_t sqid = le16_to_cpu(c->sqid);
4012 uint16_t qsize = le16_to_cpu(c->qsize);
4013 uint16_t qflags = le16_to_cpu(c->sq_flags);
4014 uint64_t prp1 = le64_to_cpu(c->prp1);
4015
4016 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4017
4018 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4019 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4020 return NVME_INVALID_CQID | NVME_DNR;
4021 }
4022 if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
4023 n->sq[sqid] != NULL)) {
4024 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4025 return NVME_INVALID_QID | NVME_DNR;
4026 }
4027 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4028 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4029 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4030 }
4031 if (unlikely(prp1 & (n->page_size - 1))) {
4032 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4033 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4034 }
4035 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4036 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4037 return NVME_INVALID_FIELD | NVME_DNR;
4038 }
4039 sq = g_malloc0(sizeof(*sq));
4040 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4041 return NVME_SUCCESS;
4042}
4043
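/*
 * Accumulated block-backend statistics used to fill the SMART / Health
 * Information log page (data units are reported in units of 1000 512-byte
 * sectors, rounded up).
 */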
4044struct nvme_stats {
4045 uint64_t units_read;
4046 uint64_t units_written;
4047 uint64_t read_commands;
4048 uint64_t write_commands;
4049};
4050
4051static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4052{
4053 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4054
4055 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
4056 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
4057 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4058 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4059}
4060
4061static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4062 uint64_t off, NvmeRequest *req)
4063{
4064 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4065 struct nvme_stats stats = { 0 };
4066 NvmeSmartLog smart = { 0 };
4067 uint32_t trans_len;
4068 NvmeNamespace *ns;
4069 time_t current_ms;
4070
4071 if (off >= sizeof(smart)) {
4072 return NVME_INVALID_FIELD | NVME_DNR;
4073 }
4074
4075 if (nsid != 0xffffffff) {
4076 ns = nvme_ns(n, nsid);
4077 if (!ns) {
4078 return NVME_INVALID_NSID | NVME_DNR;
4079 }
4080 nvme_set_blk_stats(ns, &stats);
4081 } else {
4082 int i;
4083
4084 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4085 ns = nvme_ns(n, i);
4086 if (!ns) {
4087 continue;
4088 }
4089 nvme_set_blk_stats(ns, &stats);
4090 }
4091 }
4092
4093 trans_len = MIN(sizeof(smart) - off, buf_len);
4094 smart.critical_warning = n->smart_critical_warning;
4095
4096 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
4097 1000));
4098 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
4099 1000));
4100 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4101 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4102
4103 smart.temperature = cpu_to_le16(n->temperature);
4104
4105 if ((n->temperature >= n->features.temp_thresh_hi) ||
4106 (n->temperature <= n->features.temp_thresh_low)) {
4107 smart.critical_warning |= NVME_SMART_TEMPERATURE;
4108 }
4109
4110 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4111 smart.power_on_hours[0] =
4112 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4113
4114 if (!rae) {
4115 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4116 }
4117
4118 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4119}
4120
4121static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4122 NvmeRequest *req)
4123{
4124 uint32_t trans_len;
4125 NvmeFwSlotInfoLog fw_log = {
4126 .afi = 0x1,
4127 };
4128
4129 if (off >= sizeof(fw_log)) {
4130 return NVME_INVALID_FIELD | NVME_DNR;
4131 }
4132
4133 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4134 trans_len = MIN(sizeof(fw_log) - off, buf_len);
4135
4136 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4137}
4138
4139static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4140 uint64_t off, NvmeRequest *req)
4141{
4142 uint32_t trans_len;
4143 NvmeErrorLog errlog;
4144
4145 if (off >= sizeof(errlog)) {
4146 return NVME_INVALID_FIELD | NVME_DNR;
4147 }
4148
4149 if (!rae) {
4150 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4151 }
4152
4153 memset(&errlog, 0x0, sizeof(errlog));
4154 trans_len = MIN(sizeof(errlog) - off, buf_len);
4155
4156 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4157}
4158
4159static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4160 uint64_t off, NvmeRequest *req)
4161{
4162 uint32_t nslist[1024];
4163 uint32_t trans_len;
4164 int i = 0;
4165 uint32_t nsid;
4166
4167 memset(nslist, 0x0, sizeof(nslist));
4168 trans_len = MIN(sizeof(nslist) - off, buf_len);
4169
4170 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4171 NVME_CHANGED_NSID_SIZE) {
4172        /*
4173         * If more than 1024 namespaces have changed, the first entry in the
4174         * log page is set to FFFFFFFFh and the others to 0, per the spec.
4175         */
4176 if (i == ARRAY_SIZE(nslist)) {
4177 memset(nslist, 0x0, sizeof(nslist));
4178 nslist[0] = 0xffffffff;
4179 break;
4180 }
4181
4182 nslist[i++] = nsid;
4183 clear_bit(nsid, n->changed_nsids);
4184 }
4185
4186    /*
4187     * If the list was truncated to FFFFFFFFh above, clear all remaining
4188     * changed-NSID bits, as they have effectively been reported.
4189     */
4190 if (nslist[0] == 0xffffffff) {
4191 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4192 }
4193
4194 if (!rae) {
4195 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4196 }
4197
4198 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4199}
4200
4201static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4202 uint64_t off, NvmeRequest *req)
4203{
4204 NvmeEffectsLog log = {};
4205 const uint32_t *src_iocs = NULL;
4206 uint32_t trans_len;
4207
4208 if (off >= sizeof(log)) {
4209 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4210 return NVME_INVALID_FIELD | NVME_DNR;
4211 }
4212
4213 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4214 case NVME_CC_CSS_NVM:
4215 src_iocs = nvme_cse_iocs_nvm;
4216        /* fall through */
4217 case NVME_CC_CSS_ADMIN_ONLY:
4218 break;
4219 case NVME_CC_CSS_CSI:
4220 switch (csi) {
4221 case NVME_CSI_NVM:
4222 src_iocs = nvme_cse_iocs_nvm;
4223 break;
4224 case NVME_CSI_ZONED:
4225 src_iocs = nvme_cse_iocs_zoned;
4226 break;
4227 }
4228 }
4229
4230 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4231
4232 if (src_iocs) {
4233 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4234 }
4235
4236 trans_len = MIN(sizeof(log) - off, buf_len);
4237
4238 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4239}
4240
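/*
 * Get Log Page: the transfer length is given in dwords by NUMDL/NUMDU
 * (zeroes based) and the offset in bytes by LPOL/LPOU; the offset must be
 * dword aligned.
 */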
4241static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4242{
4243 NvmeCmd *cmd = &req->cmd;
4244
4245 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4246 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4247 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4248 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4249 uint8_t lid = dw10 & 0xff;
4250 uint8_t lsp = (dw10 >> 8) & 0xf;
4251 uint8_t rae = (dw10 >> 15) & 0x1;
4252 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
4253 uint32_t numdl, numdu;
4254 uint64_t off, lpol, lpou;
4255 size_t len;
4256 uint16_t status;
4257
4258 numdl = (dw10 >> 16);
4259 numdu = (dw11 & 0xffff);
4260 lpol = dw12;
4261 lpou = dw13;
4262
4263 len = (((numdu << 16) | numdl) + 1) << 2;
4264 off = (lpou << 32ULL) | lpol;
4265
4266 if (off & 0x3) {
4267 return NVME_INVALID_FIELD | NVME_DNR;
4268 }
4269
4270 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4271
4272 status = nvme_check_mdts(n, len);
4273 if (status) {
4274 return status;
4275 }
4276
4277 switch (lid) {
4278 case NVME_LOG_ERROR_INFO:
4279 return nvme_error_info(n, rae, len, off, req);
4280 case NVME_LOG_SMART_INFO:
4281 return nvme_smart_info(n, rae, len, off, req);
4282 case NVME_LOG_FW_SLOT_INFO:
4283 return nvme_fw_log_info(n, len, off, req);
4284 case NVME_LOG_CHANGED_NSLIST:
4285 return nvme_changed_nslist(n, rae, len, off, req);
4286 case NVME_LOG_CMD_EFFECTS:
4287 return nvme_cmd_effects(n, csi, len, off, req);
4288 default:
4289 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4290 return NVME_INVALID_FIELD | NVME_DNR;
4291 }
4292}
4293
4294static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4295{
4296 n->cq[cq->cqid] = NULL;
4297 timer_free(cq->timer);
4298 if (msix_enabled(&n->parent_obj)) {
4299 msix_vector_unuse(&n->parent_obj, cq->vector);
4300 }
4301 if (cq->cqid) {
4302 g_free(cq);
4303 }
4304}
4305
4306static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4307{
4308 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4309 NvmeCQueue *cq;
4310 uint16_t qid = le16_to_cpu(c->qid);
4311
4312 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4313 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4314 return NVME_INVALID_CQID | NVME_DNR;
4315 }
4316
4317 cq = n->cq[qid];
4318 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4319 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4320 return NVME_INVALID_QUEUE_DEL;
4321 }
4322
4323 if (cq->irq_enabled && cq->tail != cq->head) {
4324 n->cq_pending--;
4325 }
4326
4327 nvme_irq_deassert(n, cq);
4328 trace_pci_nvme_del_cq(qid);
4329 nvme_free_cq(cq, n);
4330 return NVME_SUCCESS;
4331}
4332
4333static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4334 uint16_t cqid, uint16_t vector, uint16_t size,
4335 uint16_t irq_enabled)
4336{
4337 int ret;
4338
4339 if (msix_enabled(&n->parent_obj)) {
4340 ret = msix_vector_use(&n->parent_obj, vector);
4341 assert(ret == 0);
4342 }
4343 cq->ctrl = n;
4344 cq->cqid = cqid;
4345 cq->size = size;
4346 cq->dma_addr = dma_addr;
4347 cq->phase = 1;
4348 cq->irq_enabled = irq_enabled;
4349 cq->vector = vector;
4350 cq->head = cq->tail = 0;
4351 QTAILQ_INIT(&cq->req_list);
4352 QTAILQ_INIT(&cq->sq_list);
4353 n->cq[cqid] = cq;
4354 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4355}
4356
4357static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4358{
4359 NvmeCQueue *cq;
4360 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4361 uint16_t cqid = le16_to_cpu(c->cqid);
4362 uint16_t vector = le16_to_cpu(c->irq_vector);
4363 uint16_t qsize = le16_to_cpu(c->qsize);
4364 uint16_t qflags = le16_to_cpu(c->cq_flags);
4365 uint64_t prp1 = le64_to_cpu(c->prp1);
4366
4367 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4368 NVME_CQ_FLAGS_IEN(qflags) != 0);
4369
4370 if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4371 n->cq[cqid] != NULL)) {
4372 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4373 return NVME_INVALID_QID | NVME_DNR;
4374 }
4375 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4376 trace_pci_nvme_err_invalid_create_cq_size(qsize);
4377 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4378 }
4379 if (unlikely(prp1 & (n->page_size - 1))) {
4380 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4381 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4382 }
4383 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4384 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4385 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4386 }
4387 if (unlikely(vector >= n->params.msix_qsize)) {
4388 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4389 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4390 }
4391 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4392 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4393 return NVME_INVALID_FIELD | NVME_DNR;
4394 }
4395
4396 cq = g_malloc0(sizeof(*cq));
4397 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4398 NVME_CQ_FLAGS_IEN(qflags));
4399
4400
4401    /*
4402     * It is only required to set qs_created when creating a completion
4403     * queue; creating a submission queue without a matching completion
4404     * queue will fail.
4405     */
4405 n->qs_created = true;
4406 return NVME_SUCCESS;
4407}
4408
4409static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4410{
4411 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4412
4413 return nvme_c2h(n, id, sizeof(id), req);
4414}
4415
4416static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4417{
4418 trace_pci_nvme_identify_ctrl();
4419
4420 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4421}
4422
4423static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4424{
4425 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4426 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4427 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4428
4429 trace_pci_nvme_identify_ctrl_csi(c->csi);
4430
4431 switch (c->csi) {
4432 case NVME_CSI_NVM:
4433 id_nvm->vsl = n->params.vsl;
4434 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4435 break;
4436
4437 case NVME_CSI_ZONED:
4438 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4439 break;
4440
4441 default:
4442 return NVME_INVALID_FIELD | NVME_DNR;
4443 }
4444
4445 return nvme_c2h(n, id, sizeof(id), req);
4446}
4447
4448static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4449{
4450 NvmeNamespace *ns;
4451 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4452 uint32_t nsid = le32_to_cpu(c->nsid);
4453
4454 trace_pci_nvme_identify_ns(nsid);
4455
4456 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4457 return NVME_INVALID_NSID | NVME_DNR;
4458 }
4459
4460 ns = nvme_ns(n, nsid);
4461 if (unlikely(!ns)) {
4462 if (!active) {
4463 ns = nvme_subsys_ns(n->subsys, nsid);
4464 if (!ns) {
4465 return nvme_rpt_empty_id_struct(n, req);
4466 }
4467 } else {
4468 return nvme_rpt_empty_id_struct(n, req);
4469 }
4470 }
4471
4472 if (active || ns->csi == NVME_CSI_NVM) {
4473 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4474 }
4475
4476 return NVME_INVALID_CMD_SET | NVME_DNR;
4477}
4478
4479static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
4480 bool attached)
4481{
4482 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4483 uint32_t nsid = le32_to_cpu(c->nsid);
4484 uint16_t min_id = le16_to_cpu(c->ctrlid);
4485 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4486 uint16_t *ids = &list[1];
4487 NvmeNamespace *ns;
4488 NvmeCtrl *ctrl;
4489 int cntlid, nr_ids = 0;
4490
4491 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
4492
4493 if (!n->subsys) {
4494 return NVME_INVALID_FIELD | NVME_DNR;
4495 }
4496
4497 if (attached) {
4498 if (nsid == NVME_NSID_BROADCAST) {
4499 return NVME_INVALID_FIELD | NVME_DNR;
4500 }
4501
4502 ns = nvme_subsys_ns(n->subsys, nsid);
4503 if (!ns) {
4504 return NVME_INVALID_FIELD | NVME_DNR;
4505 }
4506 }
4507
4508 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4509 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4510 if (!ctrl) {
4511 continue;
4512 }
4513
4514 if (attached && !nvme_ns(ctrl, nsid)) {
4515 continue;
4516 }
4517
4518 ids[nr_ids++] = cntlid;
4519 }
4520
4521 list[0] = nr_ids;
4522
4523 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4524}
4525
4526static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4527 bool active)
4528{
4529 NvmeNamespace *ns;
4530 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4531 uint32_t nsid = le32_to_cpu(c->nsid);
4532
4533 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4534
4535 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4536 return NVME_INVALID_NSID | NVME_DNR;
4537 }
4538
4539 ns = nvme_ns(n, nsid);
4540 if (unlikely(!ns)) {
4541 if (!active) {
4542 ns = nvme_subsys_ns(n->subsys, nsid);
4543 if (!ns) {
4544 return nvme_rpt_empty_id_struct(n, req);
4545 }
4546 } else {
4547 return nvme_rpt_empty_id_struct(n, req);
4548 }
4549 }
4550
4551 if (c->csi == NVME_CSI_NVM) {
4552 return nvme_rpt_empty_id_struct(n, req);
4553 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4554 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4555 req);
4556 }
4557
4558 return NVME_INVALID_FIELD | NVME_DNR;
4559}
4560
4561static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4562 bool active)
4563{
4564 NvmeNamespace *ns;
4565 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4566 uint32_t min_nsid = le32_to_cpu(c->nsid);
4567 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4568 static const int data_len = sizeof(list);
4569 uint32_t *list_ptr = (uint32_t *)list;
4570 int i, j = 0;
4571
4572 trace_pci_nvme_identify_nslist(min_nsid);
4573
4574    /*
4575     * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values,
4576     * since the Active Namespace ID List should return namespaces with ids
4577     * strictly higher than the NSID specified in the command; the check
4578     * below therefore uses '>=' to reject both.
4579     */
4580 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4581 return NVME_INVALID_NSID | NVME_DNR;
4582 }
4583
4584 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4585 ns = nvme_ns(n, i);
4586 if (!ns) {
4587 if (!active) {
4588 ns = nvme_subsys_ns(n->subsys, i);
4589 if (!ns) {
4590 continue;
4591 }
4592 } else {
4593 continue;
4594 }
4595 }
4596 if (ns->params.nsid <= min_nsid) {
4597 continue;
4598 }
4599 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4600 if (j == data_len / sizeof(uint32_t)) {
4601 break;
4602 }
4603 }
4604
4605 return nvme_c2h(n, list, data_len, req);
4606}
4607
4608static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4609 bool active)
4610{
4611 NvmeNamespace *ns;
4612 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4613 uint32_t min_nsid = le32_to_cpu(c->nsid);
4614 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4615 static const int data_len = sizeof(list);
4616 uint32_t *list_ptr = (uint32_t *)list;
4617 int i, j = 0;
4618
4619 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4620
    /*
     * As in nvme_identify_nslist(), 0xffffffff and 0xfffffffe are not valid
     * starting NSIDs.
     */
4624 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4625 return NVME_INVALID_NSID | NVME_DNR;
4626 }
4627
4628 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4629 return NVME_INVALID_FIELD | NVME_DNR;
4630 }
4631
4632 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4633 ns = nvme_ns(n, i);
4634 if (!ns) {
4635 if (!active) {
4636 ns = nvme_subsys_ns(n->subsys, i);
4637 if (!ns) {
4638 continue;
4639 }
4640 } else {
4641 continue;
4642 }
4643 }
4644 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4645 continue;
4646 }
4647 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4648 if (j == data_len / sizeof(uint32_t)) {
4649 break;
4650 }
4651 }
4652
4653 return nvme_c2h(n, list, data_len, req);
4654}
4655
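/*
 * Identify Namespace Identification Descriptor list: a 4K buffer of
 * (type, length, value) descriptors for the given namespace; the zero fill
 * beyond the last descriptor terminates the list.
 */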
4656static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4657{
4658 NvmeNamespace *ns;
4659 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4660 uint32_t nsid = le32_to_cpu(c->nsid);
4661 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4662 uint8_t *pos = list;
4663 struct {
4664 NvmeIdNsDescr hdr;
4665 uint8_t v[NVME_NIDL_UUID];
4666 } QEMU_PACKED uuid = {};
4667 struct {
4668 NvmeIdNsDescr hdr;
4669 uint64_t v;
4670 } QEMU_PACKED eui64 = {};
4671 struct {
4672 NvmeIdNsDescr hdr;
4673 uint8_t v;
4674 } QEMU_PACKED csi = {};
4675
4676 trace_pci_nvme_identify_ns_descr_list(nsid);
4677
4678 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4679 return NVME_INVALID_NSID | NVME_DNR;
4680 }
4681
4682 ns = nvme_ns(n, nsid);
4683 if (unlikely(!ns)) {
4684 return NVME_INVALID_FIELD | NVME_DNR;
4685 }
4686
    /*
     * A Namespace UUID descriptor is always reported, an EUI-64 descriptor
     * only when the namespace has a non-zero eui64 parameter, and a Command
     * Set Identifier descriptor last; the remainder of the buffer stays
     * zero-filled, terminating the list.
     */
4692 uuid.hdr.nidt = NVME_NIDT_UUID;
4693 uuid.hdr.nidl = NVME_NIDL_UUID;
4694 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4695 memcpy(pos, &uuid, sizeof(uuid));
4696 pos += sizeof(uuid);
4697
4698 if (ns->params.eui64) {
4699 eui64.hdr.nidt = NVME_NIDT_EUI64;
4700 eui64.hdr.nidl = NVME_NIDL_EUI64;
4701 eui64.v = cpu_to_be64(ns->params.eui64);
4702 memcpy(pos, &eui64, sizeof(eui64));
4703 pos += sizeof(eui64);
4704 }
4705
4706 csi.hdr.nidt = NVME_NIDT_CSI;
4707 csi.hdr.nidl = NVME_NIDL_CSI;
4708 csi.v = ns->csi;
4709 memcpy(pos, &csi, sizeof(csi));
4710 pos += sizeof(csi);
4711
4712 return nvme_c2h(n, list, sizeof(list), req);
4713}
4714
4715static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4716{
4717 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4718 static const int data_len = sizeof(list);
4719
4720 trace_pci_nvme_identify_cmd_set();
4721
4722 NVME_SET_CSI(*list, NVME_CSI_NVM);
4723 NVME_SET_CSI(*list, NVME_CSI_ZONED);
4724
4725 return nvme_c2h(n, list, data_len, req);
4726}
4727
4728static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4729{
4730 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4731
4732 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4733 c->csi);
4734
4735 switch (c->cns) {
4736 case NVME_ID_CNS_NS:
4737 return nvme_identify_ns(n, req, true);
4738 case NVME_ID_CNS_NS_PRESENT:
4739 return nvme_identify_ns(n, req, false);
4740 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4741 return nvme_identify_ctrl_list(n, req, true);
4742 case NVME_ID_CNS_CTRL_LIST:
4743 return nvme_identify_ctrl_list(n, req, false);
4744 case NVME_ID_CNS_CS_NS:
4745 return nvme_identify_ns_csi(n, req, true);
4746 case NVME_ID_CNS_CS_NS_PRESENT:
4747 return nvme_identify_ns_csi(n, req, false);
4748 case NVME_ID_CNS_CTRL:
4749 return nvme_identify_ctrl(n, req);
4750 case NVME_ID_CNS_CS_CTRL:
4751 return nvme_identify_ctrl_csi(n, req);
4752 case NVME_ID_CNS_NS_ACTIVE_LIST:
4753 return nvme_identify_nslist(n, req, true);
4754 case NVME_ID_CNS_NS_PRESENT_LIST:
4755 return nvme_identify_nslist(n, req, false);
4756 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4757 return nvme_identify_nslist_csi(n, req, true);
4758 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4759 return nvme_identify_nslist_csi(n, req, false);
4760 case NVME_ID_CNS_NS_DESCR_LIST:
4761 return nvme_identify_ns_descr_list(n, req);
4762 case NVME_ID_CNS_IO_COMMAND_SET:
4763 return nvme_identify_cmd_set(n, req);
4764 default:
4765 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4766 return NVME_INVALID_FIELD | NVME_DNR;
4767 }
4768}
4769
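/*
 * Abort is implemented as a best-effort no-op: no command is ever actually
 * aborted, and Dword 0 of the completion is set to 1 to report that back to
 * the host. Only the submission queue id is validated.
 */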
4770static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4771{
4772 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4773
4774 req->cqe.result = 1;
4775 if (nvme_check_sqid(n, sqid)) {
4776 return NVME_INVALID_FIELD | NVME_DNR;
4777 }
4778
4779 return NVME_SUCCESS;
4780}
4781
4782static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4783{
4784 trace_pci_nvme_setfeat_timestamp(ts);
4785
4786 n->host_timestamp = le64_to_cpu(ts);
4787 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4788}
4789
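/*
 * Reconstruct the current timestamp as the last host-provided value plus the
 * virtual-clock milliseconds elapsed since it was set, packed into the
 * 48-bit Timestamp field together with the Timestamp Origin bits.
 */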
4790static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4791{
4792 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4793 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4794
4795 union nvme_timestamp {
4796 struct {
4797 uint64_t timestamp:48;
4798 uint64_t sync:1;
4799 uint64_t origin:3;
4800 uint64_t rsvd1:12;
4801 };
4802 uint64_t all;
4803 };
4804
4805 union nvme_timestamp ts;
4806 ts.all = 0;
4807 ts.timestamp = n->host_timestamp + elapsed_time;
4808
    /* Timestamp Origin: 01h if the host has set a timestamp, else 00h */
4810 ts.origin = n->host_timestamp ? 0x01 : 0x00;
4811
4812 trace_pci_nvme_getfeat_timestamp(ts.all);
4813
4814 return cpu_to_le64(ts.all);
4815}
4816
4817static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4818{
4819 uint64_t timestamp = nvme_get_timestamp(n);
4820
    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4822}
4823
4824static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4825{
4826 NvmeCmd *cmd = &req->cmd;
4827 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4828 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4829 uint32_t nsid = le32_to_cpu(cmd->nsid);
4830 uint32_t result;
4831 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4832 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4833 uint16_t iv;
4834 NvmeNamespace *ns;
4835 int i;
4836
4837 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4838 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4839 };
4840
4841 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4842
4843 if (!nvme_feature_support[fid]) {
4844 return NVME_INVALID_FIELD | NVME_DNR;
4845 }
4846
4847 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4848 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
            /*
             * Namespace-specific features require a valid, non-broadcast
             * NSID; anything else is rejected up front.
             */
4856 return NVME_INVALID_NSID | NVME_DNR;
4857 }
4858
4859 if (!nvme_ns(n, nsid)) {
4860 return NVME_INVALID_FIELD | NVME_DNR;
4861 }
4862 }
4863
4864 switch (sel) {
4865 case NVME_GETFEAT_SELECT_CURRENT:
4866 break;
4867 case NVME_GETFEAT_SELECT_SAVED:
        /* no features are saveable by the controller; fall through */
4869 case NVME_GETFEAT_SELECT_DEFAULT:
4870 goto defaults;
4871 case NVME_GETFEAT_SELECT_CAP:
4872 result = nvme_feature_cap[fid];
4873 goto out;
4874 }
4875
4876 switch (fid) {
4877 case NVME_TEMPERATURE_THRESHOLD:
4878 result = 0;
4879
        /*
         * The controller only implements the Composite Temperature sensor,
         * so return 0 for the thresholds of any other sensor.
         */
4884 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4885 goto out;
4886 }
4887
4888 switch (NVME_TEMP_THSEL(dw11)) {
4889 case NVME_TEMP_THSEL_OVER:
4890 result = n->features.temp_thresh_hi;
4891 goto out;
4892 case NVME_TEMP_THSEL_UNDER:
4893 result = n->features.temp_thresh_low;
4894 goto out;
4895 }
4896
4897 return NVME_INVALID_FIELD | NVME_DNR;
4898 case NVME_ERROR_RECOVERY:
4899 if (!nvme_nsid_valid(n, nsid)) {
4900 return NVME_INVALID_NSID | NVME_DNR;
4901 }
4902
4903 ns = nvme_ns(n, nsid);
4904 if (unlikely(!ns)) {
4905 return NVME_INVALID_FIELD | NVME_DNR;
4906 }
4907
4908 result = ns->features.err_rec;
4909 goto out;
4910 case NVME_VOLATILE_WRITE_CACHE:
4911 result = 0;
4912 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4913 ns = nvme_ns(n, i);
4914 if (!ns) {
4915 continue;
4916 }
4917
4918 result = blk_enable_write_cache(ns->blkconf.blk);
4919 if (result) {
4920 break;
4921 }
4922 }
4923 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4924 goto out;
4925 case NVME_ASYNCHRONOUS_EVENT_CONF:
4926 result = n->features.async_config;
4927 goto out;
4928 case NVME_TIMESTAMP:
4929 return nvme_get_feature_timestamp(n, req);
4930 default:
4931 break;
4932 }
4933
4934defaults:
4935 switch (fid) {
4936 case NVME_TEMPERATURE_THRESHOLD:
4937 result = 0;
4938
4939 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4940 break;
4941 }
4942
4943 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4944 result = NVME_TEMPERATURE_WARNING;
4945 }
4946
4947 break;
4948 case NVME_NUMBER_OF_QUEUES:
4949 result = (n->params.max_ioqpairs - 1) |
4950 ((n->params.max_ioqpairs - 1) << 16);
4951 trace_pci_nvme_getfeat_numq(result);
4952 break;
4953 case NVME_INTERRUPT_VECTOR_CONF:
4954 iv = dw11 & 0xffff;
4955 if (iv >= n->params.max_ioqpairs + 1) {
4956 return NVME_INVALID_FIELD | NVME_DNR;
4957 }
4958
4959 result = iv;
4960 if (iv == n->admin_cq.vector) {
4961 result |= NVME_INTVC_NOCOALESCING;
4962 }
4963 break;
4964 default:
4965 result = nvme_feature_default[fid];
4966 break;
4967 }
4968
4969out:
4970 req->cqe.result = cpu_to_le32(result);
4971 return NVME_SUCCESS;
4972}
4973
4974static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4975{
4976 uint16_t ret;
4977 uint64_t timestamp;
4978
    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4980 if (ret) {
4981 return ret;
4982 }
4983
4984 nvme_set_timestamp(n, timestamp);
4985
4986 return NVME_SUCCESS;
4987}
4988
4989static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4990{
4991 NvmeNamespace *ns = NULL;
4992
4993 NvmeCmd *cmd = &req->cmd;
4994 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4995 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4996 uint32_t nsid = le32_to_cpu(cmd->nsid);
4997 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4998 uint8_t save = NVME_SETFEAT_SAVE(dw10);
4999 int i;
5000
5001 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5002
5003 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5004 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5005 }
5006
5007 if (!nvme_feature_support[fid]) {
5008 return NVME_INVALID_FIELD | NVME_DNR;
5009 }
5010
5011 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5012 if (nsid != NVME_NSID_BROADCAST) {
5013 if (!nvme_nsid_valid(n, nsid)) {
5014 return NVME_INVALID_NSID | NVME_DNR;
5015 }
5016
5017 ns = nvme_ns(n, nsid);
5018 if (unlikely(!ns)) {
5019 return NVME_INVALID_FIELD | NVME_DNR;
5020 }
5021 }
5022 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5023 if (!nvme_nsid_valid(n, nsid)) {
5024 return NVME_INVALID_NSID | NVME_DNR;
5025 }
5026
5027 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5028 }
5029
5030 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5031 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5032 }
5033
5034 switch (fid) {
5035 case NVME_TEMPERATURE_THRESHOLD:
5036 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5037 break;
5038 }
5039
5040 switch (NVME_TEMP_THSEL(dw11)) {
5041 case NVME_TEMP_THSEL_OVER:
5042 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5043 break;
5044 case NVME_TEMP_THSEL_UNDER:
5045 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5046 break;
5047 default:
5048 return NVME_INVALID_FIELD | NVME_DNR;
5049 }
5050
5051 if ((n->temperature >= n->features.temp_thresh_hi) ||
5052 (n->temperature <= n->features.temp_thresh_low)) {
5053 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
5054 }
5055
5056 break;
5057 case NVME_ERROR_RECOVERY:
5058 if (nsid == NVME_NSID_BROADCAST) {
5059 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5060 ns = nvme_ns(n, i);
5061
5062 if (!ns) {
5063 continue;
5064 }
5065
5066 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5067 ns->features.err_rec = dw11;
5068 }
5069 }
5070
5071 break;
5072 }
5073
5074 assert(ns);
5075 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5076 ns->features.err_rec = dw11;
5077 }
5078 break;
5079 case NVME_VOLATILE_WRITE_CACHE:
5080 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5081 ns = nvme_ns(n, i);
5082 if (!ns) {
5083 continue;
5084 }
5085
5086 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5087 blk_flush(ns->blkconf.blk);
5088 }
5089
5090 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5091 }
5092
5093 break;
5094
5095 case NVME_NUMBER_OF_QUEUES:
5096 if (n->qs_created) {
5097 return NVME_CMD_SEQ_ERROR | NVME_DNR;
5098 }
5099
        /*
         * 0xffff is not an allowed value for either the number of I/O
         * submission queues or the number of I/O completion queues requested
         * by the host (both fields are zeroes-based).
         */
5104 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5105 return NVME_INVALID_FIELD | NVME_DNR;
5106 }
5107
5108 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5109 ((dw11 >> 16) & 0xffff) + 1,
5110 n->params.max_ioqpairs,
5111 n->params.max_ioqpairs);
5112 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
5113 ((n->params.max_ioqpairs - 1) << 16));
5114 break;
5115 case NVME_ASYNCHRONOUS_EVENT_CONF:
5116 n->features.async_config = dw11;
5117 break;
5118 case NVME_TIMESTAMP:
5119 return nvme_set_feature_timestamp(n, req);
5120 case NVME_COMMAND_SET_PROFILE:
5121 if (dw11 & 0x1ff) {
5122 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5123 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5124 }
5125 break;
5126 default:
5127 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5128 }
5129 return NVME_SUCCESS;
5130}
5131
5132static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5133{
5134 trace_pci_nvme_aer(nvme_cid(req));
5135
5136 if (n->outstanding_aers > n->params.aerl) {
5137 trace_pci_nvme_aer_aerl_exceeded();
5138 return NVME_AER_LIMIT_EXCEEDED;
5139 }
5140
5141 n->aer_reqs[n->outstanding_aers] = req;
5142 n->outstanding_aers++;
5143
5144 if (!QTAILQ_EMPTY(&n->aer_queue)) {
5145 nvme_process_aers(n);
5146 }
5147
5148 return NVME_NO_COMPLETE;
5149}
5150
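/*
 * Fold the per-namespace limit on a single Dataset Management range (the
 * maximum request size expressed in that namespace's logical blocks) into
 * the controller-wide dmrsl value.
 */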
5151static void nvme_update_dmrsl(NvmeCtrl *n)
5152{
5153 int nsid;
5154
5155 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5156 NvmeNamespace *ns = nvme_ns(n, nsid);
5157 if (!ns) {
5158 continue;
5159 }
5160
5161 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5162 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5163 }
5164}
5165
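/*
 * Pick the I/O command set effects table for a namespace based on the
 * command set selected in CC.CSS and the namespace's own command set
 * identifier; namespaces that do not match are left with an all-zero table
 * and therefore reject all I/O commands.
 */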
5166static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5167{
5168 uint32_t cc = ldl_le_p(&n->bar.cc);
5169
5170 ns->iocs = nvme_cse_iocs_none;
5171 switch (ns->csi) {
5172 case NVME_CSI_NVM:
5173 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5174 ns->iocs = nvme_cse_iocs_nvm;
5175 }
5176 break;
5177 case NVME_CSI_ZONED:
5178 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5179 ns->iocs = nvme_cse_iocs_zoned;
5180 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5181 ns->iocs = nvme_cse_iocs_nvm;
5182 }
5183 break;
5184 }
5185}
5186
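/*
 * Namespace Attachment: the host supplies a controller list (a count
 * followed by controller identifiers) and the namespace named in the
 * command is attached to or detached from each listed controller, with a
 * Namespace Attribute Changed event queued per affected controller.
 */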
5187static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5188{
5189 NvmeNamespace *ns;
5190 NvmeCtrl *ctrl;
5191 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5192 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5193 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5194 bool attach = !(dw10 & 0xf);
5195 uint16_t *nr_ids = &list[0];
5196 uint16_t *ids = &list[1];
5197 uint16_t ret;
5198 int i;
5199
5200 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5201
5202 if (!nvme_nsid_valid(n, nsid)) {
5203 return NVME_INVALID_NSID | NVME_DNR;
5204 }
5205
5206 ns = nvme_subsys_ns(n->subsys, nsid);
5207 if (!ns) {
5208 return NVME_INVALID_FIELD | NVME_DNR;
5209 }
5210
5211 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
5212 if (ret) {
5213 return ret;
5214 }
5215
5216 if (!*nr_ids) {
5217 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5218 }
5219
5220 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5221 for (i = 0; i < *nr_ids; i++) {
5222 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5223 if (!ctrl) {
5224 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5225 }
5226
5227 if (attach) {
5228 if (nvme_ns(ctrl, nsid)) {
5229 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5230 }
5231
5232 if (ns->attached && !ns->params.shared) {
5233 return NVME_NS_PRIVATE | NVME_DNR;
5234 }
5235
5236 nvme_attach_ns(ctrl, ns);
5237 nvme_select_iocs_ns(ctrl, ns);
5238 } else {
5239 if (!nvme_ns(ctrl, nsid)) {
5240 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5241 }
5242
5243 ctrl->namespaces[nsid] = NULL;
5244 ns->attached--;
5245
5246 nvme_update_dmrsl(ctrl);
5247 }
5248
        /*
         * Record the namespace in the controller's changed namespace list
         * and raise a Namespace Attribute Changed event, unless one is
         * already pending for this namespace.
         */
5253 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5254 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5255 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5256 NVME_LOG_CHANGED_NSLIST);
5257 }
5258 }
5259
5260 return NVME_SUCCESS;
5261}
5262
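/*
 * Format NVM is emulated asynchronously: nvme_format() allocates an
 * NvmeFormatAIOCB and schedules a bottom half, nvme_format_bh() selects the
 * next namespace to format (all of them for the broadcast NSID), and
 * nvme_format_ns_cb() zeroes the namespace in chunks before the requested
 * LBA format and protection settings are applied.
 */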
5263typedef struct NvmeFormatAIOCB {
5264 BlockAIOCB common;
5265 BlockAIOCB *aiocb;
5266 QEMUBH *bh;
5267 NvmeRequest *req;
5268 int ret;
5269
5270 NvmeNamespace *ns;
5271 uint32_t nsid;
5272 bool broadcast;
5273 int64_t offset;
5274} NvmeFormatAIOCB;
5275
5276static void nvme_format_bh(void *opaque);
5277
5278static void nvme_format_cancel(BlockAIOCB *aiocb)
5279{
5280 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5281
5282 if (iocb->aiocb) {
5283 blk_aio_cancel_async(iocb->aiocb);
5284 }
5285}
5286
5287static const AIOCBInfo nvme_format_aiocb_info = {
5288 .aiocb_size = sizeof(NvmeFormatAIOCB),
5289 .cancel_async = nvme_format_cancel,
5290 .get_aio_context = nvme_get_aio_context,
5291};
5292
5293static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
5294{
5295 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5296 uint8_t lbaf = dw10 & 0xf;
5297 uint8_t pi = (dw10 >> 5) & 0x7;
5298 uint8_t mset = (dw10 >> 4) & 0x1;
5299 uint8_t pil = (dw10 >> 8) & 0x1;
5300
5301 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5302
5303 ns->id_ns.dps = (pil << 3) | pi;
5304 ns->id_ns.flbas = lbaf | (mset << 4);
5305
5306 nvme_ns_init_format(ns);
5307}
5308
5309static void nvme_format_ns_cb(void *opaque, int ret)
5310{
5311 NvmeFormatAIOCB *iocb = opaque;
5312 NvmeRequest *req = iocb->req;
5313 NvmeNamespace *ns = iocb->ns;
5314 int bytes;
5315
5316 if (ret < 0) {
5317 iocb->ret = ret;
5318 goto done;
5319 }
5320
5321 assert(ns);
5322
5323 if (iocb->offset < ns->size) {
5324 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5325
5326 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5327 bytes, BDRV_REQ_MAY_UNMAP,
5328 nvme_format_ns_cb, iocb);
5329
5330 iocb->offset += bytes;
5331 return;
5332 }
5333
5334 nvme_format_set(ns, &req->cmd);
5335 ns->status = 0x0;
5336 iocb->ns = NULL;
5337 iocb->offset = 0;
5338
5339done:
5340 iocb->aiocb = NULL;
5341 qemu_bh_schedule(iocb->bh);
5342}
5343
5344static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5345{
5346 if (ns->params.zoned) {
5347 return NVME_INVALID_FORMAT | NVME_DNR;
5348 }
5349
5350 if (lbaf > ns->id_ns.nlbaf) {
5351 return NVME_INVALID_FORMAT | NVME_DNR;
5352 }
5353
5354 if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
5355 return NVME_INVALID_FORMAT | NVME_DNR;
5356 }
5357
5358 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5359 return NVME_INVALID_FIELD | NVME_DNR;
5360 }
5361
5362 return NVME_SUCCESS;
5363}
5364
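/*
 * Bottom half of the Format state machine: pick the next namespace (for a
 * broadcast format), validate the requested format against it and start the
 * zeroing pass; once no namespace remains, complete the request.
 */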
5365static void nvme_format_bh(void *opaque)
5366{
5367 NvmeFormatAIOCB *iocb = opaque;
5368 NvmeRequest *req = iocb->req;
5369 NvmeCtrl *n = nvme_ctrl(req);
5370 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5371 uint8_t lbaf = dw10 & 0xf;
5372 uint8_t pi = (dw10 >> 5) & 0x7;
5373 uint16_t status;
5374 int i;
5375
5376 if (iocb->ret < 0) {
5377 goto done;
5378 }
5379
5380 if (iocb->broadcast) {
5381 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5382 iocb->ns = nvme_ns(n, i);
5383 if (iocb->ns) {
5384 iocb->nsid = i;
5385 break;
5386 }
5387 }
5388 }
5389
5390 if (!iocb->ns) {
5391 goto done;
5392 }
5393
5394 status = nvme_format_check(iocb->ns, lbaf, pi);
5395 if (status) {
5396 req->status = status;
5397 goto done;
5398 }
5399
5400 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5401 nvme_format_ns_cb(iocb, 0);
5402 return;
5403
5404done:
5405 qemu_bh_delete(iocb->bh);
5406 iocb->bh = NULL;
5407
5408 iocb->common.cb(iocb->common.opaque, iocb->ret);
5409
5410 qemu_aio_unref(iocb);
5411}
5412
5413static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5414{
5415 NvmeFormatAIOCB *iocb;
5416 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5417 uint16_t status;
5418
5419 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5420
5421 iocb->req = req;
5422 iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5423 iocb->ret = 0;
5424 iocb->ns = NULL;
5425 iocb->nsid = 0;
5426 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5427 iocb->offset = 0;
5428
5429 if (!iocb->broadcast) {
5430 if (!nvme_nsid_valid(n, nsid)) {
5431 status = NVME_INVALID_NSID | NVME_DNR;
5432 goto out;
5433 }
5434
5435 iocb->ns = nvme_ns(n, nsid);
5436 if (!iocb->ns) {
5437 status = NVME_INVALID_FIELD | NVME_DNR;
5438 goto out;
5439 }
5440 }
5441
5442 req->aiocb = &iocb->common;
5443 qemu_bh_schedule(iocb->bh);
5444
5445 return NVME_NO_COMPLETE;
5446
5447out:
5448 qemu_bh_delete(iocb->bh);
5449 iocb->bh = NULL;
5450 qemu_aio_unref(iocb);
5451 return status;
5452}
5453
5454static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5455{
5456 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5457 nvme_adm_opc_str(req->cmd.opcode));
5458
5459 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5460 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5461 return NVME_INVALID_OPCODE | NVME_DNR;
5462 }
5463
    /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5465 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5466 return NVME_INVALID_FIELD | NVME_DNR;
5467 }
5468
5469 switch (req->cmd.opcode) {
5470 case NVME_ADM_CMD_DELETE_SQ:
5471 return nvme_del_sq(n, req);
5472 case NVME_ADM_CMD_CREATE_SQ:
5473 return nvme_create_sq(n, req);
5474 case NVME_ADM_CMD_GET_LOG_PAGE:
5475 return nvme_get_log(n, req);
5476 case NVME_ADM_CMD_DELETE_CQ:
5477 return nvme_del_cq(n, req);
5478 case NVME_ADM_CMD_CREATE_CQ:
5479 return nvme_create_cq(n, req);
5480 case NVME_ADM_CMD_IDENTIFY:
5481 return nvme_identify(n, req);
5482 case NVME_ADM_CMD_ABORT:
5483 return nvme_abort(n, req);
5484 case NVME_ADM_CMD_SET_FEATURES:
5485 return nvme_set_feature(n, req);
5486 case NVME_ADM_CMD_GET_FEATURES:
5487 return nvme_get_feature(n, req);
5488 case NVME_ADM_CMD_ASYNC_EV_REQ:
5489 return nvme_aer(n, req);
5490 case NVME_ADM_CMD_NS_ATTACHMENT:
5491 return nvme_ns_attachment(n, req);
5492 case NVME_ADM_CMD_FORMAT_NVM:
5493 return nvme_format(n, req);
5494 default:
5495 assert(false);
5496 }
5497
5498 return NVME_INVALID_OPCODE | NVME_DNR;
5499}
5500
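/*
 * Fetch submission queue entries from guest memory at sq->dma_addr, hand
 * them to the admin or I/O dispatcher depending on the queue id, and post a
 * completion immediately unless the command completes asynchronously. A
 * failed SQE read sets CSTS.CFS and stops processing the queue.
 */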
5501static void nvme_process_sq(void *opaque)
5502{
5503 NvmeSQueue *sq = opaque;
5504 NvmeCtrl *n = sq->ctrl;
5505 NvmeCQueue *cq = n->cq[sq->cqid];
5506
5507 uint16_t status;
5508 hwaddr addr;
5509 NvmeCmd cmd;
5510 NvmeRequest *req;
5511
5512 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5513 addr = sq->dma_addr + sq->head * n->sqe_size;
5514 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5515 trace_pci_nvme_err_addr_read(addr);
5516 trace_pci_nvme_err_cfs();
5517 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
5518 break;
5519 }
5520 nvme_inc_sq_head(sq);
5521
5522 req = QTAILQ_FIRST(&sq->req_list);
5523 QTAILQ_REMOVE(&sq->req_list, req, entry);
5524 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5525 nvme_req_clear(req);
5526 req->cqe.cid = cmd.cid;
5527 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5528
5529 status = sq->sqid ? nvme_io_cmd(n, req) :
5530 nvme_admin_cmd(n, req);
5531 if (status != NVME_NO_COMPLETE) {
5532 req->status = status;
5533 nvme_enqueue_req_completion(cq, req);
5534 }
5535 }
5536}
5537
5538static void nvme_ctrl_reset(NvmeCtrl *n)
5539{
5540 NvmeNamespace *ns;
5541 int i;
5542
5543 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5544 ns = nvme_ns(n, i);
5545 if (!ns) {
5546 continue;
5547 }
5548
5549 nvme_ns_drain(ns);
5550 }
5551
5552 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5553 if (n->sq[i] != NULL) {
5554 nvme_free_sq(n->sq[i], n);
5555 }
5556 }
5557 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5558 if (n->cq[i] != NULL) {
5559 nvme_free_cq(n->cq[i], n);
5560 }
5561 }
5562
5563 while (!QTAILQ_EMPTY(&n->aer_queue)) {
5564 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5565 QTAILQ_REMOVE(&n->aer_queue, event, entry);
5566 g_free(event);
5567 }
5568
5569 n->aer_queued = 0;
5570 n->outstanding_aers = 0;
5571 n->qs_created = false;
5572}
5573
5574static void nvme_ctrl_shutdown(NvmeCtrl *n)
5575{
5576 NvmeNamespace *ns;
5577 int i;
5578
5579 if (n->pmr.dev) {
5580 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5581 }
5582
5583 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5584 ns = nvme_ns(n, i);
5585 if (!ns) {
5586 continue;
5587 }
5588
5589 nvme_ns_shutdown(ns);
5590 }
5591}
5592
5593static void nvme_select_iocs(NvmeCtrl *n)
5594{
5595 NvmeNamespace *ns;
5596 int i;
5597
5598 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5599 ns = nvme_ns(n, i);
5600 if (!ns) {
5601 continue;
5602 }
5603
5604 nvme_select_iocs_ns(n, ns);
5605 }
5606}
5607
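/*
 * Controller enable: validate the admin queue addresses and the CC fields
 * against CAP and the Identify Controller limits, derive the page size and
 * queue entry sizes from CC, and bring up the admin queue pair.
 */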
5608static int nvme_start_ctrl(NvmeCtrl *n)
5609{
5610 uint64_t cap = ldq_le_p(&n->bar.cap);
5611 uint32_t cc = ldl_le_p(&n->bar.cc);
5612 uint32_t aqa = ldl_le_p(&n->bar.aqa);
5613 uint64_t asq = ldq_le_p(&n->bar.asq);
5614 uint64_t acq = ldq_le_p(&n->bar.acq);
5615 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
5616 uint32_t page_size = 1 << page_bits;
5617
5618 if (unlikely(n->cq[0])) {
5619 trace_pci_nvme_err_startfail_cq();
5620 return -1;
5621 }
5622 if (unlikely(n->sq[0])) {
5623 trace_pci_nvme_err_startfail_sq();
5624 return -1;
5625 }
5626 if (unlikely(!asq)) {
5627 trace_pci_nvme_err_startfail_nbarasq();
5628 return -1;
5629 }
5630 if (unlikely(!acq)) {
5631 trace_pci_nvme_err_startfail_nbaracq();
5632 return -1;
5633 }
5634 if (unlikely(asq & (page_size - 1))) {
5635 trace_pci_nvme_err_startfail_asq_misaligned(asq);
5636 return -1;
5637 }
5638 if (unlikely(acq & (page_size - 1))) {
5639 trace_pci_nvme_err_startfail_acq_misaligned(acq);
5640 return -1;
5641 }
5642 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
5643 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
5644 return -1;
5645 }
5646 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
5647 trace_pci_nvme_err_startfail_page_too_small(
5648 NVME_CC_MPS(cc),
5649 NVME_CAP_MPSMIN(cap));
5650 return -1;
5651 }
5652 if (unlikely(NVME_CC_MPS(cc) >
5653 NVME_CAP_MPSMAX(cap))) {
5654 trace_pci_nvme_err_startfail_page_too_large(
5655 NVME_CC_MPS(cc),
5656 NVME_CAP_MPSMAX(cap));
5657 return -1;
5658 }
5659 if (unlikely(NVME_CC_IOCQES(cc) <
5660 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5661 trace_pci_nvme_err_startfail_cqent_too_small(
5662 NVME_CC_IOCQES(cc),
5663 NVME_CTRL_CQES_MIN(cap));
5664 return -1;
5665 }
5666 if (unlikely(NVME_CC_IOCQES(cc) >
5667 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5668 trace_pci_nvme_err_startfail_cqent_too_large(
5669 NVME_CC_IOCQES(cc),
5670 NVME_CTRL_CQES_MAX(cap));
5671 return -1;
5672 }
5673 if (unlikely(NVME_CC_IOSQES(cc) <
5674 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5675 trace_pci_nvme_err_startfail_sqent_too_small(
5676 NVME_CC_IOSQES(cc),
5677 NVME_CTRL_SQES_MIN(cap));
5678 return -1;
5679 }
5680 if (unlikely(NVME_CC_IOSQES(cc) >
5681 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5682 trace_pci_nvme_err_startfail_sqent_too_large(
5683 NVME_CC_IOSQES(cc),
5684 NVME_CTRL_SQES_MAX(cap));
5685 return -1;
5686 }
5687 if (unlikely(!NVME_AQA_ASQS(aqa))) {
5688 trace_pci_nvme_err_startfail_asqent_sz_zero();
5689 return -1;
5690 }
5691 if (unlikely(!NVME_AQA_ACQS(aqa))) {
5692 trace_pci_nvme_err_startfail_acqent_sz_zero();
5693 return -1;
5694 }
5695
5696 n->page_bits = page_bits;
5697 n->page_size = page_size;
5698 n->max_prp_ents = n->page_size / sizeof(uint64_t);
5699 n->cqe_size = 1 << NVME_CC_IOCQES(cc);
5700 n->sqe_size = 1 << NVME_CC_IOSQES(cc);
5701 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
5702 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
5703
5704 nvme_set_timestamp(n, 0ULL);
5705
5706 QTAILQ_INIT(&n->aer_queue);
5707
5708 nvme_select_iocs(n);
5709
5710 return 0;
5711}
5712
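/*
 * Expose the Controller Memory Buffer through CMBLOC/CMBSZ: the buffer has
 * its own BAR and supports submission queues, read/write data and PRP/SGL
 * lists, with its size reported in MiB units.
 */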
5713static void nvme_cmb_enable_regs(NvmeCtrl *n)
5714{
5715 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
5716 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
5717
5718 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
5719 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
5720 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
5721 stl_le_p(&n->bar.cmbloc, cmbloc);
5722
5723 NVME_CMBSZ_SET_SQS(cmbsz, 1);
5724 NVME_CMBSZ_SET_CQS(cmbsz, 0);
5725 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
5726 NVME_CMBSZ_SET_RDS(cmbsz, 1);
5727 NVME_CMBSZ_SET_WDS(cmbsz, 1);
5728 NVME_CMBSZ_SET_SZU(cmbsz, 2);
5729 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
5730 stl_le_p(&n->bar.cmbsz, cmbsz);
5731}
5732
5733static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5734 unsigned size)
5735{
5736 uint64_t cap = ldq_le_p(&n->bar.cap);
5737 uint32_t cc = ldl_le_p(&n->bar.cc);
5738 uint32_t intms = ldl_le_p(&n->bar.intms);
5739 uint32_t csts = ldl_le_p(&n->bar.csts);
5740 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
5741
5742 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5743 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5744 "MMIO write not 32-bit aligned,"
5745 " offset=0x%"PRIx64"", offset);
        /* should be ignored, fall through for now */
5747 }
5748
5749 if (unlikely(size < sizeof(uint32_t))) {
5750 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5751 "MMIO write smaller than 32-bits,"
5752 " offset=0x%"PRIx64", size=%u",
5753 offset, size);
        /* should be ignored, fall through for now */
5755 }
5756
5757 switch (offset) {
5758 case NVME_REG_INTMS:
5759 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5760 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5761 "undefined access to interrupt mask set"
5762 " when MSI-X is enabled");
            /* should be ignored, fall through for now */
5764 }
5765 intms |= data;
5766 stl_le_p(&n->bar.intms, intms);
5767 n->bar.intmc = n->bar.intms;
5768 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
5769 nvme_irq_check(n);
5770 break;
5771 case NVME_REG_INTMC:
5772 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5773 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5774 "undefined access to interrupt mask clr"
5775 " when MSI-X is enabled");
            /* should be ignored, fall through for now */
5777 }
5778 intms &= ~data;
5779 stl_le_p(&n->bar.intms, intms);
5780 n->bar.intmc = n->bar.intms;
5781 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
5782 nvme_irq_check(n);
5783 break;
5784 case NVME_REG_CC:
5785 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5786
        /* some hosts (e.g. Windows) write the CC data before the enable bit */
5788 if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
5789 !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
5790 {
5791 cc = data;
5792 }
5793
5794 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
5795 cc = data;
5796
            /* flush CC since nvme_start_ctrl() needs the value */
5798 stl_le_p(&n->bar.cc, cc);
5799 if (unlikely(nvme_start_ctrl(n))) {
5800 trace_pci_nvme_err_startfail();
5801 csts = NVME_CSTS_FAILED;
5802 } else {
5803 trace_pci_nvme_mmio_start_success();
5804 csts = NVME_CSTS_READY;
5805 }
5806 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
5807 trace_pci_nvme_mmio_stopped();
5808 nvme_ctrl_reset(n);
5809 cc = 0;
5810 csts &= ~NVME_CSTS_READY;
5811 }
5812
5813 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
5814 trace_pci_nvme_mmio_shutdown_set();
5815 nvme_ctrl_shutdown(n);
5816 cc = data;
5817 csts |= NVME_CSTS_SHST_COMPLETE;
5818 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
5819 trace_pci_nvme_mmio_shutdown_cleared();
5820 csts &= ~NVME_CSTS_SHST_COMPLETE;
5821 cc = data;
5822 }
5823
5824 stl_le_p(&n->bar.cc, cc);
5825 stl_le_p(&n->bar.csts, csts);
5826
5827 break;
5828 case NVME_REG_CSTS:
5829 if (data & (1 << 4)) {
5830 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5831 "attempted to W1C CSTS.NSSRO"
5832 " but CAP.NSSRS is zero (not supported)");
5833 } else if (data != 0) {
5834 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5835 "attempted to set a read only bit"
5836 " of controller status");
5837 }
5838 break;
5839 case NVME_REG_NSSR:
5840 if (data == 0x4e564d65) {
5841 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5842 } else {
            /* writes of any other value have no effect */
5844 return;
5845 }
5846 break;
5847 case NVME_REG_AQA:
5848 stl_le_p(&n->bar.aqa, data);
5849 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5850 break;
5851 case NVME_REG_ASQ:
5852 stn_le_p(&n->bar.asq, size, data);
5853 trace_pci_nvme_mmio_asqaddr(data);
5854 break;
5855 case NVME_REG_ASQ + 4:
5856 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
5857 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
5858 break;
5859 case NVME_REG_ACQ:
5860 trace_pci_nvme_mmio_acqaddr(data);
5861 stn_le_p(&n->bar.acq, size, data);
5862 break;
5863 case NVME_REG_ACQ + 4:
5864 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
5865 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
5866 break;
5867 case NVME_REG_CMBLOC:
5868 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5869 "invalid write to reserved CMBLOC"
5870 " when CMBSZ is zero, ignored");
5871 return;
5872 case NVME_REG_CMBSZ:
5873 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5874 "invalid write to read only CMBSZ, ignored");
5875 return;
5876 case NVME_REG_CMBMSC:
5877 if (!NVME_CAP_CMBS(cap)) {
5878 return;
5879 }
5880
5881 stn_le_p(&n->bar.cmbmsc, size, data);
5882 n->cmb.cmse = false;
5883
5884 if (NVME_CMBMSC_CRE(data)) {
5885 nvme_cmb_enable_regs(n);
5886
5887 if (NVME_CMBMSC_CMSE(data)) {
5888 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
5889 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
5890 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5891 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
5892 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
5893 stl_le_p(&n->bar.cmbsts, cmbsts);
5894 return;
5895 }
5896
5897 n->cmb.cba = cba;
5898 n->cmb.cmse = true;
5899 }
5900 } else {
5901 n->bar.cmbsz = 0;
5902 n->bar.cmbloc = 0;
5903 }
5904
5905 return;
5906 case NVME_REG_CMBMSC + 4:
5907 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
5908 return;
5909
5910 case NVME_REG_PMRCAP:
5911 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5912 "invalid write to PMRCAP register, ignored");
5913 return;
5914 case NVME_REG_PMRCTL:
5915 if (!NVME_CAP_PMRS(cap)) {
5916 return;
5917 }
5918
5919 stl_le_p(&n->bar.pmrctl, data);
5920 if (NVME_PMRCTL_EN(data)) {
5921 memory_region_set_enabled(&n->pmr.dev->mr, true);
5922 pmrsts = 0;
5923 } else {
5924 memory_region_set_enabled(&n->pmr.dev->mr, false);
5925 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
5926 n->pmr.cmse = false;
5927 }
5928 stl_le_p(&n->bar.pmrsts, pmrsts);
5929 return;
5930 case NVME_REG_PMRSTS:
5931 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5932 "invalid write to PMRSTS register, ignored");
5933 return;
5934 case NVME_REG_PMREBS:
5935 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5936 "invalid write to PMREBS register, ignored");
5937 return;
5938 case NVME_REG_PMRSWTP:
5939 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5940 "invalid write to PMRSWTP register, ignored");
5941 return;
5942 case NVME_REG_PMRMSCL:
5943 if (!NVME_CAP_PMRS(cap)) {
5944 return;
5945 }
5946
5947 stl_le_p(&n->bar.pmrmscl, data);
5948 n->pmr.cmse = false;
5949
5950 if (NVME_PMRMSCL_CMSE(data)) {
5951 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
5952 hwaddr cba = pmrmscu << 32 |
5953 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
5954 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5955 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
5956 stl_le_p(&n->bar.pmrsts, pmrsts);
5957 return;
5958 }
5959
5960 n->pmr.cmse = true;
5961 n->pmr.cba = cba;
5962 }
5963
5964 return;
5965 case NVME_REG_PMRMSCU:
5966 if (!NVME_CAP_PMRS(cap)) {
5967 return;
5968 }
5969
5970 stl_le_p(&n->bar.pmrmscu, data);
5971 return;
5972 default:
5973 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5974 "invalid MMIO write,"
5975 " offset=0x%"PRIx64", data=%"PRIx64"",
5976 offset, data);
5977 break;
5978 }
5979}
5980
5981static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5982{
5983 NvmeCtrl *n = (NvmeCtrl *)opaque;
5984 uint8_t *ptr = (uint8_t *)&n->bar;
5985
5986 trace_pci_nvme_mmio_read(addr, size);
5987
5988 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5989 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5990 "MMIO read not 32-bit aligned,"
5991 " offset=0x%"PRIx64"", addr);
5992
5993 } else if (unlikely(size < sizeof(uint32_t))) {
5994 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
5995 "MMIO read smaller than 32-bits,"
5996 " offset=0x%"PRIx64"", addr);
5997
5998 }
5999
6000 if (addr > sizeof(n->bar) - size) {
6001 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6002 "MMIO read beyond last register,"
6003 " offset=0x%"PRIx64", returning 0", addr);
6004
6005 return 0;
6006 }
6007
    /*
     * When PMRWBM bit 1 is set, a read of PMRSTS must ensure that prior
     * writes served from memory mapped to the PMR have been committed to
     * the persistence domain, hence the msync below.
     */
6013 if (addr == NVME_REG_PMRSTS &&
6014 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6015 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6016 }
6017
6018 return ldn_le_p(ptr + addr, size);
6019}
6020
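/*
 * Doorbell writes start at offset 0x1000 with a fixed stride of four bytes:
 * even doorbells are submission queue tails, odd doorbells are completion
 * queue heads. Writes to invalid doorbells, or with invalid values, are
 * ignored and, if an AER is outstanding, reported via an asynchronous event.
 */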
6021static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6022{
6023 uint32_t qid;
6024
6025 if (unlikely(addr & ((1 << 2) - 1))) {
6026 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6027 "doorbell write not 32-bit aligned,"
6028 " offset=0x%"PRIx64", ignoring", addr);
6029 return;
6030 }
6031
6032 if (((addr - 0x1000) >> 2) & 1) {
        /* Completion queue doorbell write */
6034
6035 uint16_t new_head = val & 0xffff;
6036 int start_sqs;
6037 NvmeCQueue *cq;
6038
6039 qid = (addr - (0x1000 + (1 << 2))) >> 3;
6040 if (unlikely(nvme_check_cqid(n, qid))) {
6041 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6042 "completion queue doorbell write"
6043 " for nonexistent queue,"
6044 " sqid=%"PRIu32", ignoring", qid);
6045
            /*
             * The spec calls for an asynchronous event to be posted to the
             * Admin Completion Queue when host software writes to an invalid
             * doorbell register while an Asynchronous Event Request command
             * is outstanding. The exact status code to use is less clear, so
             * report Invalid Doorbell Register here, mirroring how the
             * invalid-value case is handled below.
             */
6059 if (n->outstanding_aers) {
6060 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6061 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6062 NVME_LOG_ERROR_INFO);
6063 }
6064
6065 return;
6066 }
6067
6068 cq = n->cq[qid];
6069 if (unlikely(new_head >= cq->size)) {
6070 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6071 "completion queue doorbell write value"
6072 " beyond queue size, sqid=%"PRIu32","
6073 " new_head=%"PRIu16", ignoring",
6074 qid, new_head);
6075
6076 if (n->outstanding_aers) {
6077 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6078 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6079 NVME_LOG_ERROR_INFO);
6080 }
6081
6082 return;
6083 }
6084
6085 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6086
6087 start_sqs = nvme_cq_full(cq) ? 1 : 0;
6088 cq->head = new_head;
6089 if (start_sqs) {
6090 NvmeSQueue *sq;
6091 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6092 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6093 }
6094 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6095 }
6096
6097 if (cq->tail == cq->head) {
6098 if (cq->irq_enabled) {
6099 n->cq_pending--;
6100 }
6101
6102 nvme_irq_deassert(n, cq);
6103 }
6104 } else {
        /* Submission queue doorbell write */
6106
6107 uint16_t new_tail = val & 0xffff;
6108 NvmeSQueue *sq;
6109
6110 qid = (addr - 0x1000) >> 3;
6111 if (unlikely(nvme_check_sqid(n, qid))) {
6112 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6113 "submission queue doorbell write"
6114 " for nonexistent queue,"
6115 " sqid=%"PRIu32", ignoring", qid);
6116
6117 if (n->outstanding_aers) {
6118 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6119 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6120 NVME_LOG_ERROR_INFO);
6121 }
6122
6123 return;
6124 }
6125
6126 sq = n->sq[qid];
6127 if (unlikely(new_tail >= sq->size)) {
6128 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6129 "submission queue doorbell write value"
6130 " beyond queue size, sqid=%"PRIu32","
6131 " new_tail=%"PRIu16", ignoring",
6132 qid, new_tail);
6133
6134 if (n->outstanding_aers) {
6135 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6136 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6137 NVME_LOG_ERROR_INFO);
6138 }
6139
6140 return;
6141 }
6142
6143 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6144
6145 sq->tail = new_tail;
6146 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6147 }
6148}
6149
6150static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6151 unsigned size)
6152{
6153 NvmeCtrl *n = (NvmeCtrl *)opaque;
6154
6155 trace_pci_nvme_mmio_write(addr, data, size);
6156
6157 if (addr < sizeof(n->bar)) {
6158 nvme_write_bar(n, addr, data, size);
6159 } else {
6160 nvme_process_db(n, addr, data);
6161 }
6162}
6163
6164static const MemoryRegionOps nvme_mmio_ops = {
6165 .read = nvme_mmio_read,
6166 .write = nvme_mmio_write,
6167 .endianness = DEVICE_LITTLE_ENDIAN,
6168 .impl = {
6169 .min_access_size = 2,
6170 .max_access_size = 8,
6171 },
6172};
6173
6174static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
6175 unsigned size)
6176{
6177 NvmeCtrl *n = (NvmeCtrl *)opaque;
6178 stn_le_p(&n->cmb.buf[addr], size, data);
6179}
6180
6181static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
6182{
6183 NvmeCtrl *n = (NvmeCtrl *)opaque;
6184 return ldn_le_p(&n->cmb.buf[addr], size);
6185}
6186
6187static const MemoryRegionOps nvme_cmb_ops = {
6188 .read = nvme_cmb_read,
6189 .write = nvme_cmb_write,
6190 .endianness = DEVICE_LITTLE_ENDIAN,
6191 .impl = {
6192 .min_access_size = 1,
6193 .max_access_size = 8,
6194 },
6195};
6196
6197static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
6198{
6199 NvmeParams *params = &n->params;
6200
6201 if (params->num_queues) {
6202 warn_report("num_queues is deprecated; please use max_ioqpairs "
6203 "instead");
6204
6205 params->max_ioqpairs = params->num_queues - 1;
6206 }
6207
6208 if (n->namespace.blkconf.blk && n->subsys) {
6209 error_setg(errp, "subsystem support is unavailable with legacy "
6210 "namespace ('drive' property)");
6211 return;
6212 }
6213
6214 if (params->max_ioqpairs < 1 ||
6215 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
6216 error_setg(errp, "max_ioqpairs must be between 1 and %d",
6217 NVME_MAX_IOQPAIRS);
6218 return;
6219 }
6220
6221 if (params->msix_qsize < 1 ||
6222 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
6223 error_setg(errp, "msix_qsize must be between 1 and %d",
6224 PCI_MSIX_FLAGS_QSIZE + 1);
6225 return;
6226 }
6227
6228 if (!params->serial) {
6229 error_setg(errp, "serial property not set");
6230 return;
6231 }
6232
6233 if (n->pmr.dev) {
6234 if (host_memory_backend_is_mapped(n->pmr.dev)) {
6235 error_setg(errp, "can't use already busy memdev: %s",
6236 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
6237 return;
6238 }
6239
6240 if (!is_power_of_2(n->pmr.dev->size)) {
6241 error_setg(errp, "pmr backend size needs to be power of 2 in size");
6242 return;
6243 }
6244
6245 host_memory_backend_set_mapped(n->pmr.dev, true);
6246 }
6247
6248 if (n->params.zasl > n->params.mdts) {
6249 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
6250 "than or equal to mdts (Maximum Data Transfer Size)");
6251 return;
6252 }
6253
6254 if (!n->params.vsl) {
6255 error_setg(errp, "vsl must be non-zero");
6256 return;
6257 }
6258}
6259
6260static void nvme_init_state(NvmeCtrl *n)
6261{
    /* add one to max_ioqpairs to account for the admin queue pair */
6263 n->reg_size = pow2ceil(sizeof(NvmeBar) +
6264 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
6265 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
6266 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
6267 n->temperature = NVME_TEMPERATURE;
6268 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
6269 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6270 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
6271}
6272
6273static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
6274{
6275 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
6276 uint64_t cap = ldq_le_p(&n->bar.cap);
6277
6278 n->cmb.buf = g_malloc0(cmb_size);
6279 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
6280 "nvme-cmb", cmb_size);
6281 pci_register_bar(pci_dev, NVME_CMB_BIR,
6282 PCI_BASE_ADDRESS_SPACE_MEMORY |
6283 PCI_BASE_ADDRESS_MEM_TYPE_64 |
6284 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
6285
6286 NVME_CAP_SET_CMBS(cap, 1);
6287 stq_le_p(&n->bar.cap, cap);
6288
6289 if (n->params.legacy_cmb) {
6290 nvme_cmb_enable_regs(n);
6291 n->cmb.cmse = true;
6292 }
6293}
6294
6295static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
6296{
6297 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
6298
6299 NVME_PMRCAP_SET_RDS(pmrcap, 1);
6300 NVME_PMRCAP_SET_WDS(pmrcap, 1);
6301 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
6302
6303 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
6304 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
6305 stl_le_p(&n->bar.pmrcap, pmrcap);
6306
6307 pci_register_bar(pci_dev, NVME_PMR_BIR,
6308 PCI_BASE_ADDRESS_SPACE_MEMORY |
6309 PCI_BASE_ADDRESS_MEM_TYPE_64 |
6310 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
6311
6312 memory_region_set_enabled(&n->pmr.dev->mr, false);
6313}
6314
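/*
 * Lay out BAR0 as the register file followed by the MSI-X table and PBA,
 * each 4 KiB aligned, with the whole BAR rounded up to a power of two; the
 * CMB and PMR, when configured, get their own 64-bit prefetchable BARs.
 */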
6315static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
6316{
6317 uint8_t *pci_conf = pci_dev->config;
6318 uint64_t bar_size, msix_table_size, msix_pba_size;
6319 unsigned msix_table_offset, msix_pba_offset;
6320 int ret;
6321
6322 Error *err = NULL;
6323
6324 pci_conf[PCI_INTERRUPT_PIN] = 1;
6325 pci_config_set_prog_interface(pci_conf, 0x2);
6326
6327 if (n->params.use_intel_id) {
6328 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6329 pci_config_set_device_id(pci_conf, 0x5845);
6330 } else {
6331 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6332 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6333 }
6334
6335 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6336 pcie_endpoint_cap_init(pci_dev, 0x80);
6337
6338 bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6339 msix_table_offset = bar_size;
6340 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6341
6342 bar_size += msix_table_size;
6343 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6344 msix_pba_offset = bar_size;
6345 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6346
6347 bar_size += msix_pba_size;
6348 bar_size = pow2ceil(bar_size);
6349
6350 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6351 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6352 n->reg_size);
6353 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6354
6355 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6356 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6357 ret = msix_init(pci_dev, n->params.msix_qsize,
6358 &n->bar0, 0, msix_table_offset,
6359 &n->bar0, 0, msix_pba_offset, 0, &err);
6360 if (ret < 0) {
6361 if (ret == -ENOTSUP) {
6362 warn_report_err(err);
6363 } else {
6364 error_propagate(errp, err);
6365 return ret;
6366 }
6367 }
6368
6369 if (n->params.cmb_size_mb) {
6370 nvme_init_cmb(n, pci_dev);
6371 }
6372
6373 if (n->pmr.dev) {
6374 nvme_init_pmr(n, pci_dev);
6375 }
6376
6377 return 0;
6378}
6379
6380static void nvme_init_subnqn(NvmeCtrl *n)
6381{
6382 NvmeSubsystem *subsys = n->subsys;
6383 NvmeIdCtrl *id = &n->id_ctrl;
6384
6385 if (!subsys) {
6386 snprintf((char *)id->subnqn, sizeof(id->subnqn),
6387 "nqn.2019-08.org.qemu:%s", n->params.serial);
6388 } else {
6389 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6390 }
6391}
6392
6393static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6394{
6395 NvmeIdCtrl *id = &n->id_ctrl;
6396 uint8_t *pci_conf = pci_dev->config;
6397 uint64_t cap = ldq_le_p(&n->bar.cap);
6398
6399 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6400 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6401 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6402 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6403 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6404
6405 id->cntlid = cpu_to_le16(n->cntlid);
6406
6407 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6408
6409 id->rab = 6;
6410
6411 if (n->params.use_intel_id) {
6412 id->ieee[0] = 0xb3;
6413 id->ieee[1] = 0x02;
6414 id->ieee[2] = 0x00;
6415 } else {
6416 id->ieee[0] = 0x00;
6417 id->ieee[1] = 0x54;
6418 id->ieee[2] = 0x52;
6419 }
6420
6421 id->mdts = n->params.mdts;
6422 id->ver = cpu_to_le32(NVME_SPEC_VER);
6423 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6424 id->cntrltype = 0x1;
6425
    /*
     * Because the controller completes the Abort command immediately, there
     * can never be more than one Abort command executing concurrently, so
     * the Abort Command Limit is effectively unused. The specification
     * recommends a value of 3 (four concurrently outstanding Abort
     * commands), so use that.
     */
6437 id->acl = 3;
6438 id->aerl = n->params.aerl;
6439 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6440 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6441
    /* recommended default value, roughly 70 degrees Celsius */
6443 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6444 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6445
6446 id->sqes = (0x6 << 4) | 0x6;
6447 id->cqes = (0x4 << 4) | 0x4;
6448 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
6449 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6450 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6451 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6452
    /*
     * Advertise a present volatile write cache together with support for
     * the broadcast NSID in Flush commands.
     */
6460 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6461
6462 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6463 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6464 NVME_CTRL_SGLS_BITBUCKET);
6465
6466 nvme_init_subnqn(n);
6467
6468 id->psd[0].mp = cpu_to_le16(0x9c4);
6469 id->psd[0].enlat = cpu_to_le32(0x10);
6470 id->psd[0].exlat = cpu_to_le32(0x4);
6471
6472 if (n->subsys) {
6473 id->cmic |= NVME_CMIC_MULTI_CTRL;
6474 }
6475
6476 NVME_CAP_SET_MQES(cap, 0x7ff);
6477 NVME_CAP_SET_CQR(cap, 1);
6478 NVME_CAP_SET_TO(cap, 0xf);
6479 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
6480 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
6481 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
6482 NVME_CAP_SET_MPSMAX(cap, 4);
6483 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
6484 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
6485 stq_le_p(&n->bar.cap, cap);
6486
6487 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
6488 n->bar.intmc = n->bar.intms = 0;
6489}
6490
6491static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6492{
6493 int cntlid;
6494
6495 if (!n->subsys) {
6496 return 0;
6497 }
6498
6499 cntlid = nvme_subsys_register_ctrl(n, errp);
6500 if (cntlid < 0) {
6501 return -1;
6502 }
6503
6504 n->cntlid = cntlid;
6505
6506 return 0;
6507}
6508
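/*
 * Attach a namespace to this controller: record it in the namespaces array,
 * bump its attachment count and fold its request size limit into the
 * controller's dmrsl.
 */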
6509void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6510{
6511 uint32_t nsid = ns->params.nsid;
6512 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6513
6514 n->namespaces[nsid] = ns;
6515 ns->attached++;
6516
6517 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6518 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6519}
6520
6521static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6522{
6523 NvmeCtrl *n = NVME(pci_dev);
6524 NvmeNamespace *ns;
6525 Error *local_err = NULL;
6526
6527 nvme_check_constraints(n, &local_err);
6528 if (local_err) {
6529 error_propagate(errp, local_err);
6530 return;
6531 }
6532
6533 qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6534 &pci_dev->qdev, n->parent_obj.qdev.id);
6535
6536 nvme_init_state(n);
6537 if (nvme_init_pci(n, pci_dev, errp)) {
6538 return;
6539 }
6540
6541 if (nvme_init_subsys(n, errp)) {
6542 error_propagate(errp, local_err);
6543 return;
6544 }
6545 nvme_init_ctrl(n, pci_dev);
6546
    /* set up a namespace if the controller drive property was given */
6548 if (n->namespace.blkconf.blk) {
6549 ns = &n->namespace;
6550 ns->params.nsid = 1;
6551
6552 if (nvme_ns_setup(ns, errp)) {
6553 return;
6554 }
6555
6556 nvme_attach_ns(n, ns);
6557 }
6558}
6559
6560static void nvme_exit(PCIDevice *pci_dev)
6561{
6562 NvmeCtrl *n = NVME(pci_dev);
6563 NvmeNamespace *ns;
6564 int i;
6565
6566 nvme_ctrl_reset(n);
6567
6568 if (n->subsys) {
6569 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6570 ns = nvme_ns(n, i);
6571 if (ns) {
6572 ns->attached--;
6573 }
6574 }
6575
6576 nvme_subsys_unregister_ctrl(n->subsys, n);
6577 }
6578
6579 g_free(n->cq);
6580 g_free(n->sq);
6581 g_free(n->aer_reqs);
6582
6583 if (n->params.cmb_size_mb) {
6584 g_free(n->cmb.buf);
6585 }
6586
6587 if (n->pmr.dev) {
6588 host_memory_backend_set_mapped(n->pmr.dev, false);
6589 }
6590 msix_uninit(pci_dev, &n->bar0, &n->bar0);
6591 memory_region_del_subregion(&n->bar0, &n->iomem);
6592}
6593
6594static Property nvme_props[] = {
6595 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6596 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6597 HostMemoryBackend *),
6598 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6599 NvmeSubsystem *),
6600 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6601 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6602 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6603 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6604 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6605 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6606 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6607 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6608 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6609 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6610 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6611 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6612 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
6613 params.auto_transition_zones, true),
6614 DEFINE_PROP_END_OF_LIST(),
6615};
6616
6617static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6618 void *opaque, Error **errp)
6619{
6620 NvmeCtrl *n = NVME(obj);
6621 uint8_t value = n->smart_critical_warning;
6622
6623 visit_type_uint8(v, name, &value, errp);
6624}
6625
6626static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6627 void *opaque, Error **errp)
6628{
6629 NvmeCtrl *n = NVME(obj);
6630 uint8_t value, old_value, cap = 0, index, event;
6631
6632 if (!visit_type_uint8(v, name, &value, errp)) {
6633 return;
6634 }
6635
6636 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6637 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6638 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
6639 cap |= NVME_SMART_PMR_UNRELIABLE;
6640 }
6641
6642 if ((value & cap) != value) {
6643 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6644 value & ~cap);
6645 return;
6646 }
6647
6648 old_value = n->smart_critical_warning;
6649 n->smart_critical_warning = value;
6650
    /* only trigger SMART events for warning bits that are newly set */
6652 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6653 event = 1 << index;
6654 if (value & ~old_value & event)
6655 nvme_smart_event(n, event);
6656 }
6657}
6658
6659static const VMStateDescription nvme_vmstate = {
6660 .name = "nvme",
6661 .unmigratable = 1,
6662};
6663
6664static void nvme_class_init(ObjectClass *oc, void *data)
6665{
6666 DeviceClass *dc = DEVICE_CLASS(oc);
6667 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6668
6669 pc->realize = nvme_realize;
6670 pc->exit = nvme_exit;
6671 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6672 pc->revision = 2;
6673
6674 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6675 dc->desc = "Non-Volatile Memory Express";
6676 device_class_set_props(dc, nvme_props);
6677 dc->vmsd = &nvme_vmstate;
6678}
6679
6680static void nvme_instance_init(Object *obj)
6681{
6682 NvmeCtrl *n = NVME(obj);
6683
6684 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6685 "bootindex", "/namespace@1,0",
6686 DEVICE(obj));
6687
6688 object_property_add(obj, "smart_critical_warning", "uint8",
6689 nvme_get_smart_warning,
6690 nvme_set_smart_warning, NULL, NULL);
6691}
6692
6693static const TypeInfo nvme_info = {
6694 .name = TYPE_NVME,
6695 .parent = TYPE_PCI_DEVICE,
6696 .instance_size = sizeof(NvmeCtrl),
6697 .instance_init = nvme_instance_init,
6698 .class_init = nvme_class_init,
6699 .interfaces = (InterfaceInfo[]) {
6700 { INTERFACE_PCIE_DEVICE },
6701 { }
6702 },
6703};
6704
6705static const TypeInfo nvme_bus_info = {
6706 .name = TYPE_NVME_BUS,
6707 .parent = TYPE_BUS,
6708 .instance_size = sizeof(NvmeBus),
6709};
6710
6711static void nvme_register_types(void)
6712{
6713 type_register_static(&nvme_info);
6714 type_register_static(&nvme_bus_info);
6715}
6716
6717type_init(nvme_register_types)
6718