#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
#include "migration/vmstate.h"

#include "nvme.h"
#include "trace.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)

static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION]              = true,
    [NVME_POWER_MANAGEMENT]         = true,
    [NVME_TEMPERATURE_THRESHOLD]    = true,
    [NVME_ERROR_RECOVERY]           = true,
    [NVME_VOLATILE_WRITE_CACHE]     = true,
    [NVME_NUMBER_OF_QUEUES]         = true,
    [NVME_INTERRUPT_COALESCING]     = true,
    [NVME_INTERRUPT_VECTOR_CONF]    = true,
    [NVME_WRITE_ATOMICITY]          = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
    [NVME_TIMESTAMP]                = true,
    [NVME_COMMAND_SET_PROFILE]      = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
    [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
};

static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
};

static void nvme_process_sq(void *opaque);

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);
}

static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
{
    if (QTAILQ_IN_USE(zone, entry)) {
        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_CLOSED:
            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_FULL:
            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
            /* fall through */
        default:
            ;
        }
    }

    nvme_set_zone_state(zone, state);

    switch (state) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_CLOSED:
        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_FULL:
        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
        /* fall through */
    case NVME_ZONE_STATE_READ_ONLY:
        break;
    default:
        zone->d.za = 0;
    }
}

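/*
 * Check if we can open a zone without exceeding open/active limits.
 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
 */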
static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
{
    if (ns->params.max_active_zones != 0 &&
        ns->nr_active_zones + act > ns->params.max_active_zones) {
        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    }
    if (ns->params.max_open_zones != 0 &&
        ns->nr_open_zones + opn > ns->params.max_open_zones) {
        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    if (!n->cmb.cmse) {
        return false;
    }

    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    hi = lo + int128_get64(n->cmb.mem.size);

    return addr >= lo && addr < hi;
}

static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    return &n->cmb.buf[addr - base];
}

static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi;

    if (!n->pmr.cmse) {
        return false;
    }

    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);

    return addr >= n->pmr.cba && addr < hi;
}

static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
{
    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}

static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
        return 0;
    }

    return pci_dma_read(&n->parent_obj, addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(&n->parent_obj, addr, buf, size);
}

static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid &&
        (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_irq_check(NvmeCtrl *n)
{
    uint32_t intms = ldl_le_p(&n->bar.intms);

    if (msix_enabled(&(n->parent_obj))) {
        return;
    }
    if (~intms & n->irq_status) {
        pci_irq_assert(&n->parent_obj);
    } else {
        pci_irq_deassert(&n->parent_obj);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            trace_pci_nvme_irq_msix(cq->vector);
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            trace_pci_nvme_irq_pin();
            assert(cq->vector < 32);
            n->irq_status |= 1 << cq->vector;
            nvme_irq_check(n);
        }
    } else {
        trace_pci_nvme_irq_masked();
    }
}

static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            return;
        } else {
            assert(cq->vector < 32);
            if (!n->cq_pending) {
                n->irq_status &= ~(1 << cq->vector);
            }
            nvme_irq_check(n);
        }
    }
}

static void nvme_req_clear(NvmeRequest *req)
{
    req->ns = NULL;
    req->opaque = NULL;
    req->aiocb = NULL;
    memset(&req->cqe, 0x0, sizeof(req->cqe));
    req->status = NVME_SUCCESS;
}

static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
    if (dma) {
        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
        sg->flags = NVME_SG_DMA;
    } else {
        qemu_iovec_init(&sg->iov, 0);
    }

    sg->flags |= NVME_SG_ALLOC;
}

static inline void nvme_sg_unmap(NvmeSg *sg)
{
    if (!(sg->flags & NVME_SG_ALLOC)) {
        return;
    }

    if (sg->flags & NVME_SG_DMA) {
        qemu_sglist_destroy(&sg->qsg);
    } else {
        qemu_iovec_destroy(&sg->iov);
    }

    memset(sg, 0x0, sizeof(*sg));
}

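/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
 */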
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
{
    NvmeSg *dst = data;
    uint32_t trans_len, count = ns->lbasz;
    uint64_t offset = 0;
    bool dma = sg->flags & NVME_SG_DMA;
    size_t sge_len;
    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    int sg_idx = 0;

    assert(sg->flags & NVME_SG_ALLOC);

    while (sg_len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        trans_len = MIN(sg_len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dst) {
            if (dma) {
                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
                                trans_len);
            } else {
                qemu_iovec_add(&dst->iov,
                               sg->iov.iov[sg_idx].iov_base + offset,
                               trans_len);
            }
        }

        sg_len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            dst = (dst == data) ? mdata : data;
            count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
        }
    }
}

static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr_cmb(addr, len);

    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
{
    bool cmb = false, pmr = false;

    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr(addr, len);

    if (nvme_addr_is_cmb(n, addr)) {
        cmb = true;
    } else if (nvme_addr_is_pmr(n, addr)) {
        pmr = true;
    }

    if (cmb || pmr) {
        if (sg->flags & NVME_SG_DMA) {
            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
        }

        if (sg->iov.niov + 1 > IOV_MAX) {
            goto max_mappings_exceeded;
        }

        if (cmb) {
            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
        } else {
            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
        }
    }

    if (!(sg->flags & NVME_SG_DMA)) {
        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    }

    if (sg->qsg.nsg + 1 > IOV_MAX) {
        goto max_mappings_exceeded;
    }

    qemu_sglist_add(&sg->qsg, addr, len);

    return NVME_SUCCESS;

max_mappings_exceeded:
    NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
                   "number of mappings exceeds 1024");
    return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
}

static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
{
    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
}

static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
                             uint64_t prp2, uint32_t len)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;
    uint16_t status;
    int ret;

    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));

    status = nvme_map_addr(n, sg, prp1, trans_len);
    if (status) {
        goto unmap;
    }

    len -= trans_len;
    if (len) {
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

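            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset. Hence, we need to calculate the number of entries in
             * the list based on that offset.
             */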
            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
            if (ret) {
                trace_pci_nvme_err_addr_read(prp2);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == nents - 1 && len > n->page_size) {
                    if (unlikely(prp_ent & (n->page_size - 1))) {
                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    nents = MIN(nents, n->max_prp_ents);
                    prp_trans = nents * sizeof(uint64_t);
                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
                                         prp_trans);
                    if (ret) {
                        trace_pci_nvme_err_addr_read(prp_ent);
                        status = NVME_DATA_TRAS_ERROR;
                        goto unmap;
                    }
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (unlikely(prp_ent & (n->page_size - 1))) {
                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                status = nvme_map_addr(n, sg, prp_ent, trans_len);
                if (status) {
                    goto unmap;
                }

                len -= trans_len;
                i++;
            }
        } else {
            if (unlikely(prp2 & (n->page_size - 1))) {
                trace_pci_nvme_err_invalid_prp2_align(prp2);
                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                goto unmap;
            }
            status = nvme_map_addr(n, sg, prp2, len);
            if (status) {
                goto unmap;
            }
        }
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

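/*
 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
 * number of bytes mapped in len.
 */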
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                  NvmeSglDescriptor *segment, uint64_t nsgld,
                                  size_t *len, NvmeCmd *cmd)
{
    dma_addr_t addr, trans_len;
    uint32_t dlen;
    uint16_t status;

    for (int i = 0; i < nsgld; i++) {
        uint8_t type = NVME_SGL_TYPE(segment[i].type);

        switch (type) {
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            if (cmd->opcode == NVME_CMD_WRITE) {
                continue;
            }
            /* fall through */
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
            break;
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
        default:
            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
        }

        dlen = le32_to_cpu(segment[i].len);

        if (!dlen) {
            continue;
        }

        if (*len == 0) {
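            /*
             * All data has been mapped, but the SGL contains additional
             * segments and/or descriptors. The controller might accept
             * ignoring the rest of the SGL.
             */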
            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
                break;
            }

            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        trans_len = MIN(*len, dlen);

        if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
            goto next;
        }

        addr = le64_to_cpu(segment[i].addr);

        if (UINT64_MAX - addr < dlen) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        status = nvme_map_addr(n, sg, addr, trans_len);
        if (status) {
            return status;
        }

next:
        *len -= trans_len;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
                             size_t len, NvmeCmd *cmd)
{
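    /*
     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
     * dynamically allocating a potentially huge SGL. The spec allows the SGL
     * to be larger (as in number of bytes required to describe the SGL
     * descriptors and segment chain) than the command transfer size, so it is
     * not bounded by MDTS.
     */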
    const int SEG_CHUNK_SIZE = 256;

    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
    uint64_t nsgld;
    uint32_t seg_len;
    uint16_t status;
    hwaddr addr;
    int ret;

    sgld = &sgl;
    addr = le64_to_cpu(sgl.addr);

    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));

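    /*
     * If the entire transfer can be described with a single data block it can
     * be mapped directly.
     */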
    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
        if (status) {
            goto unmap;
        }

        goto out;
    }

    for (;;) {
        switch (NVME_SGL_TYPE(sgld->type)) {
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            break;
        default:
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        seg_len = le32_to_cpu(sgld->len);

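        /* check the length of the (Last) Segment descriptor */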
        if ((!seg_len || seg_len & 0xf) &&
            (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        if (UINT64_MAX - addr < seg_len) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        nsgld = seg_len / sizeof(NvmeSglDescriptor);

        while (nsgld > SEG_CHUNK_SIZE) {
            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
                trace_pci_nvme_err_addr_read(addr);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }

            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
                                       &len, cmd);
            if (status) {
                goto unmap;
            }

            nsgld -= SEG_CHUNK_SIZE;
            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
        }

        ret = nvme_addr_read(n, addr, segment, nsgld *
                             sizeof(NvmeSglDescriptor));
        if (ret) {
            trace_pci_nvme_err_addr_read(addr);
            status = NVME_DATA_TRAS_ERROR;
            goto unmap;
        }

        last_sgld = &segment[nsgld - 1];

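        /*
         * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
         * then we are done.
         */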
        switch (NVME_SGL_TYPE(last_sgld->type)) {
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
            if (status) {
                goto unmap;
            }

            goto out;

        default:
            break;
        }

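        /*
         * If the last descriptor was not a Data Block or Bit Bucket, then the
         * current segment must not be a Last Segment.
         */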
        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        sgld = last_sgld;
        addr = le64_to_cpu(sgld->addr);

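        /*
         * Do not map the last descriptor; it will be a Segment or Last Segment
         * descriptor and is handled by the next iteration.
         */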
        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
        if (status) {
            goto unmap;
        }
    }

out:
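    /* if there is any residual left in len, the SGL was too short */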
    if (len) {
        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        goto unmap;
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd)
{
    uint64_t prp1, prp2;

    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
    case NVME_PSDT_PRP:
        prp1 = le64_to_cpu(cmd->dptr.prp1);
        prp2 = le64_to_cpu(cmd->dptr.prp2);

        return nvme_map_prp(n, sg, prp1, prp2, len);
    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
    case NVME_PSDT_SGL_MPTR_SGL:
        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
    default:
        return NVME_INVALID_FIELD;
    }
}

static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
{
    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    hwaddr mptr = le64_to_cpu(cmd->mptr);
    uint16_t status;

    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
        NvmeSglDescriptor sgl;

        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
            return NVME_DATA_TRAS_ERROR;
        }

        status = nvme_map_sgl(n, sg, sgl, len, cmd);
        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
        }

        return status;
    }

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
    status = nvme_map_addr(n, sg, mptr, len);
    if (status) {
        nvme_sg_unmap(sg);
    }

    return status;
}

static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
    size_t len = nvme_l2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
        NvmeSg sg;

        len += nvme_m2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, &req->sg, NULL);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    size_t len = nvme_m2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_l2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, NULL, &req->sg);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
{
    hwaddr addr;
    uint32_t trans_len, count = bytes;
    bool dma = sg->flags & NVME_SG_DMA;
    int64_t sge_len;
    int sg_idx = 0;
    int ret;

    assert(sg->flags & NVME_SG_ALLOC);

    while (len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        if (sge_len - offset < 0) {
            offset -= sge_len;
            sg_idx++;
            continue;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
            continue;
        }

        trans_len = MIN(len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dma) {
            addr = sg->qsg.sg[sg_idx].base + offset;
        } else {
            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
        }

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            ret = nvme_addr_read(n, addr, ptr, trans_len);
        } else {
            ret = nvme_addr_write(n, addr, ptr, trans_len);
        }

        if (ret) {
            return NVME_DATA_TRAS_ERROR;
        }

        ptr += trans_len;
        len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            count = bytes;
            offset += skip_bytes;
        }
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
                        NvmeTxDirection dir)
{
    assert(sg->flags & NVME_SG_ALLOC);

    if (sg->flags & NVME_SG_DMA) {
        uint64_t residual;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            residual = dma_buf_write(ptr, len, &sg->qsg);
        } else {
            residual = dma_buf_read(ptr, len, &sg->qsg);
        }

        if (unlikely(residual)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    } else {
        size_t bytes;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
        } else {
            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
        }

        if (unlikely(bytes != len)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
}

static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
}

uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);

    if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
                                   ns->lbaf.ms, 0, dir);
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
                                   ns->lbasz, ns->lbasz, dir);
    }

    nvme_sg_unmap(&req->sg);

    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                  cb, req);
    } else {
        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
                                  BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                   cb, req);
    } else {
        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;
    bool pending = cq->head != cq->tail;
    int ret;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
                            sizeof(req->cqe));
        if (ret) {
            trace_pci_nvme_err_addr_write(addr);
            trace_pci_nvme_err_cfs();
            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
            break;
        }
        QTAILQ_REMOVE(&cq->req_list, req, entry);
        nvme_inc_cq_tail(cq);
        nvme_sg_unmap(&req->sg);
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    if (cq->tail != cq->head) {
        if (cq->irq_enabled && !pending) {
            n->cq_pending++;
        }

        nvme_irq_assert(n, cq);
    }
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
                                          le32_to_cpu(req->cqe.result),
                                          le32_to_cpu(req->cqe.dw1),
                                          req->status);

    if (req->status) {
        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
                                      req->status, req->cmd.opcode);
    }

    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}

static void nvme_process_aers(void *opaque)
{
    NvmeCtrl *n = opaque;
    NvmeAsyncEvent *event, *next;

    trace_pci_nvme_process_aers(n->aer_queued);

    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
        NvmeRequest *req;
        NvmeAerResult *result;

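        /* can't post cqe if there is nothing to complete */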
        if (!n->outstanding_aers) {
            trace_pci_nvme_no_outstanding_aers();
            break;
        }

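        /* ignore if masked (cqe posted, but event not cleared) */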
        if (n->aer_mask & (1 << event->result.event_type)) {
            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
            continue;
        }

        QTAILQ_REMOVE(&n->aer_queue, event, entry);
        n->aer_queued--;

        n->aer_mask |= 1 << event->result.event_type;
        n->outstanding_aers--;

        req = n->aer_reqs[n->outstanding_aers];

        result = (NvmeAerResult *) &req->cqe.result;
        result->event_type = event->result.event_type;
        result->event_info = event->result.event_info;
        result->log_page = event->result.log_page;
        g_free(event);

        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
                                    result->log_page);

        nvme_enqueue_req_completion(&n->admin_cq, req);
    }
}

static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
                               uint8_t event_info, uint8_t log_page)
{
    NvmeAsyncEvent *event;

    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);

    if (n->aer_queued == n->params.aer_max_queued) {
        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
        return;
    }

    event = g_new(NvmeAsyncEvent, 1);
    event->result = (NvmeAerResult) {
        .event_type = event_type,
        .event_info = event_info,
        .log_page   = log_page,
    };

    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
    n->aer_queued++;

    nvme_process_aers(n);
}

static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
{
    uint8_t aer_info;

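    /* do not post the event if it is disabled in the async event config */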
    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
        return;
    }

    switch (event) {
    case NVME_SMART_SPARE:
        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
        break;
    case NVME_SMART_TEMPERATURE:
        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
        break;
    case NVME_SMART_RELIABILITY:
    case NVME_SMART_MEDIA_READ_ONLY:
    case NVME_SMART_FAILED_VOLATILE_MEDIA:
    case NVME_SMART_PMR_UNRELIABLE:
        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
        break;
    default:
        return;
    }

    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
}

static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
{
    n->aer_mask &= ~(1 << event_type);
    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }
}

static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
{
    uint8_t mdts = n->params.mdts;

    if (mdts && len > n->page_size << mdts) {
        trace_pci_nvme_err_mdts(len);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
                                         uint32_t nlb)
{
    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);

    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
        return NVME_LBA_RANGE | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb, int flags)
{
    BlockDriverState *bs = blk_bs(ns->blkconf.blk);

    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    int ret;

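    /*
     * `pnum` holds the number of bytes after offset that shares the same
     * allocation status as the byte at offset. If `pnum` is different from
     * bytes, we should check the allocation status of the next range and
     * continue this until all bytes have been checked.
     */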
    do {
        bytes -= pnum;

        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        trace_pci_nvme_block_status(offset, bytes, pnum, ret,
                                    !!(ret & BDRV_BLOCK_ZERO));

        if (!(ret & flags)) {
            return 1;
        }

        offset += pnum;
    } while (pnum != bytes);

    return 0;
}

static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb)
{
    int ret;
    Error *err = NULL;

    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
    if (ret) {
        if (ret < 0) {
            error_setg_errno(&err, -ret, "unable to get block status");
            error_report_err(err);

            return NVME_INTERNAL_DEV_ERROR;
        }

        return NVME_DULB;
    }

    return NVME_SUCCESS;
}

static void nvme_aio_err(NvmeRequest *req, int ret)
{
    uint16_t status = NVME_SUCCESS;
    Error *local_err = NULL;

    switch (req->cmd.opcode) {
    case NVME_CMD_READ:
        status = NVME_UNRECOVERED_READ;
        break;
    case NVME_CMD_FLUSH:
    case NVME_CMD_WRITE:
    case NVME_CMD_WRITE_ZEROES:
    case NVME_CMD_ZONE_APPEND:
        status = NVME_WRITE_FAULT;
        break;
    default:
        status = NVME_INTERNAL_DEV_ERROR;
        break;
    }

    trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);

    error_setg_errno(&local_err, -ret, "aio failed");
    error_report_err(local_err);

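    /*
     * Set the command status code to the first encountered error but allow a
     * subsequent Internal Device Error to trump it.
     */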
    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
        return;
    }

    req->status = status;
}

static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
{
    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
                                    slba / ns->zone_size;
}

static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
{
    uint32_t zone_idx = nvme_zone_idx(ns, slba);

    if (zone_idx >= ns->num_zones) {
        return NULL;
    }

    return &ns->zone_array[zone_idx];
}

static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
{
    uint64_t zslba = zone->d.zslba;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_FULL:
        trace_pci_nvme_err_zone_is_full(zslba);
        return NVME_ZONE_FULL;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zslba);
        return NVME_ZONE_OFFLINE;
    case NVME_ZONE_STATE_READ_ONLY:
        trace_pci_nvme_err_zone_is_read_only(zslba);
        return NVME_ZONE_READ_ONLY;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
                                      uint64_t slba, uint32_t nlb)
{
    uint64_t zcap = nvme_zone_wr_boundary(zone);
    uint16_t status;

    status = nvme_check_zone_state_for_write(zone);
    if (status) {
        return status;
    }

    if (unlikely(slba != zone->w_ptr)) {
        trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
        return NVME_ZONE_INVALID_WRITE;
    }

    if (unlikely((slba + nlb) > zcap)) {
        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
        return NVME_ZONE_BOUNDARY_ERROR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_FULL:
    case NVME_ZONE_STATE_CLOSED:
    case NVME_ZONE_STATE_READ_ONLY:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
        return NVME_ZONE_OFFLINE;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
                                     uint32_t nlb)
{
    NvmeZone *zone;
    uint64_t bndry, end;
    uint16_t status;

    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    bndry = nvme_zone_rd_boundary(ns, zone);
    end = slba + nlb;

    status = nvme_check_zone_state_for_read(zone);
    if (status) {
        ;
    } else if (unlikely(end > bndry)) {
        if (!ns->params.cross_zone_read) {
            status = NVME_ZONE_BOUNDARY_ERROR;
        } else {
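            /*
             * Read across zone boundary - look at the next zone.
             * Earlier bounds check ensured that if there are SLBAs to read,
             * they are within the namespace.
             */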
            do {
                zone++;
                status = nvme_check_zone_state_for_read(zone);
                if (status) {
                    break;
                }
            } while (end > nvme_zone_rd_boundary(ns, zone));
        }
    }

    return status;
}

static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_FULL:
        return NVME_SUCCESS;

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fall through */
    case NVME_ZONE_STATE_EMPTY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fall through */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fall through */
    case NVME_ZONE_STATE_FULL:
        zone->w_ptr = zone->d.zslba;
        zone->d.wp = zone->w_ptr;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        /* fall through */
    case NVME_ZONE_STATE_EMPTY:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
{
    NvmeZone *zone;

    if (ns->params.max_open_zones &&
        ns->nr_open_zones == ns->params.max_open_zones) {
        zone = QTAILQ_FIRST(&ns->imp_open_zones);
        if (zone) {
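            /*
             * Automatically close this implicitly open zone.
             */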
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            nvme_zrm_close(ns, zone);
        }
    }
}

enum {
    NVME_ZRM_AUTO = 1 << 0,
};

static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
                                    NvmeZone *zone, int flags)
{
    int act = 0;
    uint16_t status;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
        act = 1;

        /* fall through */

    case NVME_ZONE_STATE_CLOSED:
        if (n->params.auto_transition_zones) {
            nvme_zrm_auto_transition_zone(ns);
        }
        status = nvme_aor_check(ns, act, 1);
        if (status) {
            return status;
        }

        if (act) {
            nvme_aor_inc_active(ns);
        }

        nvme_aor_inc_open(ns);

        if (flags & NVME_ZRM_AUTO) {
            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
            return NVME_SUCCESS;
        }

        /* fall through */

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        if (flags & NVME_ZRM_AUTO) {
            return NVME_SUCCESS;
        }

        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);

        /* fall through */

    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
                                     NvmeZone *zone)
{
    return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
}

static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
                                     NvmeZone *zone)
{
    return nvme_zrm_open_flags(n, ns, zone, 0);
}

static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
                                 uint32_t nlb)
{
    zone->d.wp += nlb;

    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
        nvme_zrm_finish(ns, zone);
    }
}

static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeZone *zone;
    uint64_t slba;
    uint32_t nlb;

    slba = le64_to_cpu(rw->slba);
    nlb = le16_to_cpu(rw->nlb) + 1;
    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    nvme_advance_zone_wp(ns, zone, nlb);
}

static inline bool nvme_is_write(NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;

    return rw->opcode == NVME_CMD_WRITE ||
           rw->opcode == NVME_CMD_ZONE_APPEND ||
           rw->opcode == NVME_CMD_WRITE_ZEROES;
}

static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
{
    return qemu_get_aio_context();
}

static void nvme_misc_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;

    trace_pci_nvme_misc_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

void nvme_rw_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    if (ns->params.zoned && nvme_is_write(req)) {
        nvme_finalize_zoned_write(ns, req);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    if (ns->lbaf.ms) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
        uint64_t offset = nvme_moff(ns, slba);

        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
            size_t mlen = nvme_m2b(ns, nlb);

            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
                                               BDRV_REQ_MAY_UNMAP,
                                               nvme_rw_complete_cb, req);
            return;
        }

        if (nvme_ns_ext(ns) || req->cmd.mptr) {
            uint16_t status;

            nvme_sg_unmap(&req->sg);
            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
            if (status) {
                ret = -EFAULT;
                goto out;
            }

            if (req->cmd.opcode == NVME_CMD_READ) {
                return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
            }

            return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
        }
    }

out:
    nvme_rw_complete_cb(req, ret);
}

static void nvme_verify_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    uint16_t status;

    trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(stats, acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
                                       ctx->mdata.iov.size, slba);
        if (status) {
            req->status = status;
            goto out;
        }

        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                     ctx->mdata.bounce, ctx->mdata.iov.size,
                                     prinfo, slba, apptag, appmask, &reftag);
    }

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_mdata_in_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t mlen = nvme_m2b(ns, nlb);
    uint64_t offset = nvme_moff(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_reset(&ctx->mdata.iov);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                nvme_verify_cb, ctx);
    return;

out:
    nvme_verify_cb(ctx, ret);
}

struct nvme_compare_ctx {
    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } mdata;
};

static void nvme_compare_mdata_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    uint16_t status = NVME_SUCCESS;

    trace_pci_nvme_compare_mdata_cb(nvme_cid(req));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->mdata.iov.size);

    status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
                               NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        uint64_t slba = le64_to_cpu(rw->slba);
        uint8_t *bufp;
        uint8_t *mbufp = ctx->mdata.bounce;
        uint8_t *end = mbufp + ctx->mdata.iov.size;
        int16_t pil = 0;

        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
                                slba, apptag, appmask, &reftag);
        if (status) {
            req->status = status;
            goto out;
        }

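        /*
         * When formatted with protection information, do not compare the DIF
         * tuple.
         */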
        if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
            pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
        }

        for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms,
             mbufp += ns->lbaf.ms) {
            if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
                req->status = NVME_CMP_FAILURE;
                goto out;
            }
        }

        goto out;
    }

    if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_compare_data_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    uint16_t status;

    trace_pci_nvme_compare_data_cb(nvme_cid(req));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->data.iov.size);

    status = nvme_bounce_data(n, buf, ctx->data.iov.size,
                              NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

    if (ns->lbaf.ms) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
        size_t mlen = nvme_m2b(ns, nlb);
        uint64_t offset = nvme_moff(ns, slba);

        ctx->mdata.bounce = g_malloc(mlen);

        qemu_iovec_init(&ctx->mdata.iov, 1);
        qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

        req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                    nvme_compare_mdata_cb, req);
        return;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);
    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

typedef struct NvmeDSMAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    NvmeDsmRange *range;
    unsigned int nr;
    unsigned int idx;
} NvmeDSMAIOCB;

static void nvme_dsm_cancel(BlockAIOCB *aiocb)
{
    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);

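    /* break nvme_dsm_cb (if it is running) */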
    iocb->idx = iocb->nr;
    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
        iocb->aiocb = NULL;
    } else {
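        /*
         * We only reach this if nvme_dsm_cancel() has already been called or
         * the command ran to completion and nvme_dsm_bh is scheduled to run.
         */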
        assert(iocb->idx == iocb->nr);
    }
}

static const AIOCBInfo nvme_dsm_aiocb_info = {
    .aiocb_size   = sizeof(NvmeDSMAIOCB),
    .cancel_async = nvme_dsm_cancel,
};

static void nvme_dsm_bh(void *opaque)
{
    NvmeDSMAIOCB *iocb = opaque;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);
}

static void nvme_dsm_cb(void *opaque, int ret);

static void nvme_dsm_md_cb(void *opaque, int ret)
{
    NvmeDSMAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeDsmRange *range;
    uint64_t slba;
    uint32_t nlb;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

    if (!ns->lbaf.ms) {
        nvme_dsm_cb(iocb, 0);
        return;
    }

    range = &iocb->range[iocb->idx - 1];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb);

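    /*
     * Check that all blocks were discarded (zeroed); otherwise we do not
     * zero the metadata.
     */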
    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
    if (ret) {
        if (ret < 0) {
            iocb->ret = ret;
            goto done;
        }

        nvme_dsm_cb(iocb, 0);
        return;
    }

    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
                                        nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
                                        nvme_dsm_cb, iocb);
    return;

done:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}

static void nvme_dsm_cb(void *opaque, int ret)
{
    NvmeDSMAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeNamespace *ns = req->ns;
    NvmeDsmRange *range;
    uint64_t slba;
    uint32_t nlb;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

next:
    if (iocb->idx == iocb->nr) {
        goto done;
    }

    range = &iocb->range[iocb->idx++];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb);

    trace_pci_nvme_dsm_deallocate(slba, nlb);

    if (nlb > n->dmrsl) {
        trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
        goto next;
    }

    if (nvme_check_bounds(ns, slba, nlb)) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb,
                                             ns->id_ns.nsze);
        goto next;
    }

    iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
                                   nvme_l2b(ns, nlb),
                                   nvme_dsm_md_cb, iocb);
    return;

done:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}
2336
2337static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2338{
2339 NvmeNamespace *ns = req->ns;
2340 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2341 uint32_t attr = le32_to_cpu(dsm->attributes);
2342 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2343 uint16_t status = NVME_SUCCESS;
2344
2345 trace_pci_nvme_dsm(nr, attr);
2346
2347 if (attr & NVME_DSMGMT_AD) {
2348 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2349 nvme_misc_cb, req);
2350
2351 iocb->req = req;
2352 iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2353 iocb->ret = 0;
2354 iocb->range = g_new(NvmeDsmRange, nr);
2355 iocb->nr = nr;
2356 iocb->idx = 0;
2357
2358 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2359 req);
2360 if (status) {
2361 return status;
2362 }
2363
2364 req->aiocb = &iocb->common;
2365 nvme_dsm_cb(iocb, 0);
2366
2367 return NVME_NO_COMPLETE;
2368 }
2369
2370 return status;
2371}
2372
static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t len = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint32_t reftag = le32_to_cpu(rw->reftag);
    NvmeBounceContext *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_check_prinfo(ns, prinfo, slba, reftag);
        if (status) {
            return status;
        }

        if (prinfo & NVME_PRINFO_PRACT) {
            return NVME_INVALID_PROT_INFO | NVME_DNR;
        }
    }

    if (len > n->page_size << n->params.vsl) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    ctx = g_new0(NvmeBounceContext, 1);
    ctx->req = req;

    ctx->data.bounce = g_malloc(len);

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);

    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
                     BLOCK_ACCT_READ);

    req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
                                nvme_verify_mdata_in_cb, ctx);
    return NVME_NO_COMPLETE;
}

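/*
 * Simple Copy is implemented as a callback-driven state machine over the
 * source ranges: nvme_copy_cb() validates the next range and reads its data
 * into the bounce buffer, nvme_copy_in_cb() reads the metadata,
 * nvme_copy_in_completed_cb() re-checks protection information and writes
 * the data to the destination, and nvme_copy_out_cb() /
 * nvme_copy_out_completed_cb() write the metadata and advance to the next
 * range.
 */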
typedef struct NvmeCopyAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    NvmeCopySourceRange *ranges;
    int nr;
    int idx;

    uint8_t *bounce;
    QEMUIOVector iov;
    struct {
        BlockAcctCookie read;
        BlockAcctCookie write;
    } acct;

    uint32_t reftag;
    uint64_t slba;

    NvmeZone *zone;
} NvmeCopyAIOCB;

static void nvme_copy_cancel(BlockAIOCB *aiocb)
{
    NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
        iocb->aiocb = NULL;
    }
}

static const AIOCBInfo nvme_copy_aiocb_info = {
    .aiocb_size   = sizeof(NvmeCopyAIOCB),
    .cancel_async = nvme_copy_cancel,
};

static void nvme_copy_bh(void *opaque)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);

    if (iocb->idx != iocb->nr) {
        req->cqe.result = cpu_to_le32(iocb->idx);
    }

    qemu_iovec_destroy(&iocb->iov);
    g_free(iocb->bounce);

    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;

    if (iocb->ret < 0) {
        block_acct_failed(stats, &iocb->acct.read);
        block_acct_failed(stats, &iocb->acct.write);
    } else {
        block_acct_done(stats, &iocb->acct.read);
        block_acct_done(stats, &iocb->acct.write);
    }

    iocb->common.cb(iocb->common.opaque, iocb->ret);
    qemu_aio_unref(iocb);
}

static void nvme_copy_cb(void *opaque, int ret);

static void nvme_copy_out_completed_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
    uint32_t nlb = le32_to_cpu(range->nlb) + 1;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (ns->params.zoned) {
        nvme_advance_zone_wp(ns, iocb->zone, nlb);
    }

    iocb->idx++;
    iocb->slba += nlb;
out:
    nvme_copy_cb(iocb, iocb->ret);
}

static void nvme_copy_out_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range;
    uint32_t nlb;
    size_t mlen;
    uint8_t *mbounce;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (!ns->lbaf.ms) {
        nvme_copy_out_completed_cb(iocb, 0);
        return;
    }

    range = &iocb->ranges[iocb->idx];
    nlb = le32_to_cpu(range->nlb) + 1;

    mlen = nvme_m2b(ns, nlb);
    mbounce = iocb->bounce + nvme_l2b(ns, nlb);

    qemu_iovec_reset(&iocb->iov);
    qemu_iovec_add(&iocb->iov, mbounce, mlen);

    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
                                  &iocb->iov, 0, nvme_copy_out_completed_cb,
                                  iocb);

    return;

out:
    nvme_copy_cb(iocb, ret);
}

static void nvme_copy_in_completed_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range;
    uint32_t nlb;
    size_t len;
    uint16_t status;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    range = &iocb->ranges[iocb->idx];
    nlb = le32_to_cpu(range->nlb) + 1;
    len = nvme_l2b(ns, nlb);

    trace_pci_nvme_copy_out(iocb->slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;

        uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
        uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);

        uint16_t apptag = le16_to_cpu(range->apptag);
        uint16_t appmask = le16_to_cpu(range->appmask);
        uint32_t reftag = le32_to_cpu(range->reftag);

        uint64_t slba = le64_to_cpu(range->slba);
        size_t mlen = nvme_m2b(ns, nlb);
        uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);

        status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
                                slba, apptag, appmask, &reftag);
        if (status) {
            goto invalid;
        }

        apptag = le16_to_cpu(copy->apptag);
        appmask = le16_to_cpu(copy->appmask);

        if (prinfow & NVME_PRINFO_PRACT) {
            status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
            if (status) {
                goto invalid;
            }

            nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
                                        apptag, &iocb->reftag);
        } else {
            status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
                                    prinfow, iocb->slba, apptag, appmask,
                                    &iocb->reftag);
            if (status) {
                goto invalid;
            }
        }
    }

    status = nvme_check_bounds(ns, iocb->slba, nlb);
    if (status) {
        goto invalid;
    }

    if (ns->params.zoned) {
        status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
        if (status) {
            goto invalid;
        }

        iocb->zone->w_ptr += nlb;
    }

    qemu_iovec_reset(&iocb->iov);
    qemu_iovec_add(&iocb->iov, iocb->bounce, len);

    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
                                  &iocb->iov, 0, nvme_copy_out_cb, iocb);

    return;

invalid:
    req->status = status;
    iocb->aiocb = NULL;
    if (iocb->bh) {
        qemu_bh_schedule(iocb->bh);
    }

    return;

out:
    nvme_copy_cb(iocb, ret);
}

static void nvme_copy_in_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range;
    uint64_t slba;
    uint32_t nlb;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (!ns->lbaf.ms) {
        nvme_copy_in_completed_cb(iocb, 0);
        return;
    }

    range = &iocb->ranges[iocb->idx];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb) + 1;

    qemu_iovec_reset(&iocb->iov);
    qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
                   nvme_m2b(ns, nlb));

    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
                                 &iocb->iov, 0, nvme_copy_in_completed_cb,
                                 iocb);
    return;

out:
    nvme_copy_cb(iocb, iocb->ret);
}

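/*
 * Entry point of the per-range loop and the common completion path for the
 * callbacks above. Any negative ret (or a previously latched error in
 * iocb->ret) short-circuits to the bottom half that completes the command.
 */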
static void nvme_copy_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range;
    uint64_t slba;
    uint32_t nlb;
    size_t len;
    uint16_t status;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    } else if (iocb->ret < 0) {
        goto done;
    }

    if (iocb->idx == iocb->nr) {
        goto done;
    }

    range = &iocb->ranges[iocb->idx];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb) + 1;
    len = nvme_l2b(ns, nlb);

    trace_pci_nvme_copy_source_range(slba, nlb);

    if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
        goto invalid;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        goto invalid;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }

    if (ns->params.zoned) {
        status = nvme_check_zone_read(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }

    qemu_iovec_reset(&iocb->iov);
    qemu_iovec_add(&iocb->iov, iocb->bounce, len);

    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
                                 &iocb->iov, 0, nvme_copy_in_cb, iocb);
    return;

invalid:
    req->status = status;
done:
    iocb->aiocb = NULL;
    if (iocb->bh) {
        qemu_bh_schedule(iocb->bh);
    }
}

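/*
 * The bounce buffer is sized for the worst case allowed by the Identify
 * Namespace MSSRL field: one maximally sized source range of data plus its
 * per-block metadata. Data occupies the start of the buffer; the callbacks
 * above place metadata at offset nvme_l2b(ns, nlb) within it.
 */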
static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
    NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
                                      nvme_misc_cb, req);
    uint16_t nr = copy->nr + 1;
    uint8_t format = copy->control[0] & 0xf;
    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);

    uint16_t status;

    trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);

    iocb->ranges = NULL;
    iocb->zone = NULL;

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
        ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
        status = NVME_INVALID_FIELD | NVME_DNR;
        goto invalid;
    }

    if (!(n->id_ctrl.ocfs & (1 << format))) {
        trace_pci_nvme_err_copy_invalid_format(format);
        status = NVME_INVALID_FIELD | NVME_DNR;
        goto invalid;
    }

    if (nr > ns->id_ns.msrc + 1) {
        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
        goto invalid;
    }

    iocb->ranges = g_new(NvmeCopySourceRange, nr);

    status = nvme_h2c(n, (uint8_t *)iocb->ranges,
                      sizeof(NvmeCopySourceRange) * nr, req);
    if (status) {
        goto invalid;
    }

    iocb->slba = le64_to_cpu(copy->sdlba);

    if (ns->params.zoned) {
        iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
        if (!iocb->zone) {
            status = NVME_LBA_RANGE | NVME_DNR;
            goto invalid;
        }

        status = nvme_zrm_auto(n, ns, iocb->zone);
        if (status) {
            goto invalid;
        }
    }

    iocb->req = req;
    iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
    iocb->ret = 0;
    iocb->nr = nr;
    iocb->idx = 0;
    iocb->reftag = le32_to_cpu(copy->reftag);
    iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
                              ns->lbasz + ns->lbaf.ms);

    qemu_iovec_init(&iocb->iov, 1);

    block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
                     BLOCK_ACCT_READ);
    block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
                     BLOCK_ACCT_WRITE);

    req->aiocb = &iocb->common;
    nvme_copy_cb(iocb, 0);

    return NVME_NO_COMPLETE;

invalid:
    g_free(iocb->ranges);
    qemu_aio_unref(iocb);
    return status;
}

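/*
 * Compare reads the addressed blocks into a bounce buffer; the comparison
 * with the host-supplied data is deferred to nvme_compare_data_cb().
 */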
static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    size_t data_len = nvme_l2b(ns, nlb);
    size_t len = data_len;
    int64_t offset = nvme_l2b(ns, slba);
    struct nvme_compare_ctx *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
        return NVME_INVALID_PROT_INFO | NVME_DNR;
    }

    if (nvme_ns_ext(ns)) {
        len += nvme_m2b(ns, nlb);
    }

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    ctx = g_new(struct nvme_compare_ctx, 1);
    ctx->data.bounce = g_malloc(data_len);

    req->opaque = ctx;

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);

    block_acct_start(blk_get_stats(blk), &req->acct, data_len,
                     BLOCK_ACCT_READ);
    req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
                                nvme_compare_data_cb, req);

    return NVME_NO_COMPLETE;
}

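/*
 * Flush may target a single namespace or, with the broadcast NSID, all of
 * them. The bottom half below picks the next attached namespace and
 * nvme_flush_ns_cb() issues the actual flush, one namespace at a time.
 */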
typedef struct NvmeFlushAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    NvmeNamespace *ns;
    uint32_t nsid;
    bool broadcast;
} NvmeFlushAIOCB;

static void nvme_flush_cancel(BlockAIOCB *acb)
{
    NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
    }
}

static const AIOCBInfo nvme_flush_aiocb_info = {
    .aiocb_size = sizeof(NvmeFlushAIOCB),
    .cancel_async = nvme_flush_cancel,
    .get_aio_context = nvme_get_aio_context,
};

static void nvme_flush_ns_cb(void *opaque, int ret)
{
    NvmeFlushAIOCB *iocb = opaque;
    NvmeNamespace *ns = iocb->ns;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (ns) {
        trace_pci_nvme_flush_ns(iocb->nsid);

        iocb->ns = NULL;
        iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
        return;
    }

out:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}

static void nvme_flush_bh(void *opaque)
{
    NvmeFlushAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    int i;

    if (iocb->ret < 0) {
        goto done;
    }

    if (iocb->broadcast) {
        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
            iocb->ns = nvme_ns(n, i);
            if (iocb->ns) {
                iocb->nsid = i;
                break;
            }
        }
    }

    if (!iocb->ns) {
        goto done;
    }

    nvme_flush_ns_cb(iocb, 0);
    return;

done:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_aio_unref(iocb);

    return;
}

static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeFlushAIOCB *iocb;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint16_t status;

    iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);

    iocb->req = req;
    iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
    iocb->ret = 0;
    iocb->ns = NULL;
    iocb->nsid = 0;
    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);

    if (!iocb->broadcast) {
        if (!nvme_nsid_valid(n, nsid)) {
            status = NVME_INVALID_NSID | NVME_DNR;
            goto out;
        }

        iocb->ns = nvme_ns(n, nsid);
        if (!iocb->ns) {
            status = NVME_INVALID_FIELD | NVME_DNR;
            goto out;
        }

        iocb->nsid = nsid;
    }

    req->aiocb = &iocb->common;
    qemu_bh_schedule(iocb->bh);

    return NVME_NO_COMPLETE;

out:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);

    return status;
}

static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = prinfo & NVME_PRINFO_PRACT;

            if (pract && ns->lbaf.ms == 8) {
                mapped_size = data_size;
            }
        }
    }

    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);

    status = nvme_check_mdts(n, mapped_size);
    if (status) {
        goto invalid;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        goto invalid;
    }

    if (ns->params.zoned) {
        status = nvme_check_zone_read(ns, slba, nlb);
        if (status) {
            trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
            goto invalid;
        }
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    status = nvme_map_data(n, nlb, req);
    if (status) {
        goto invalid;
    }

    data_offset = nvme_l2b(ns, slba);

    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                     BLOCK_ACCT_READ);
    nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
    return NVME_NO_COMPLETE;

invalid:
    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
    return status | NVME_DNR;
}

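/*
 * Common implementation of Write, Write Zeroes (wrz == true) and Zone
 * Append (append == true). For appends, the slba is rewritten to the zone
 * write pointer and, depending on the protection type, the reference tag is
 * remapped as dictated by the PIREMAP flag.
 */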
static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
                              bool wrz)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint8_t prinfo = NVME_RW_PRINFO(ctrl);
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    NvmeZone *zone;
    NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = prinfo & NVME_PRINFO_PRACT;

            if (pract && ns->lbaf.ms == 8) {
                mapped_size -= nvme_m2b(ns, nlb);
            }
        }
    }

    trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
                         nvme_nsid(ns), nlb, mapped_size, slba);

    if (!wrz) {
        status = nvme_check_mdts(n, mapped_size);
        if (status) {
            goto invalid;
        }
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        goto invalid;
    }

    if (ns->params.zoned) {
        zone = nvme_get_zone_by_slba(ns, slba);
        assert(zone);

        if (append) {
            bool piremap = !!(ctrl & NVME_RW_PIREMAP);

            if (unlikely(slba != zone->d.zslba)) {
                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
                status = NVME_INVALID_FIELD;
                goto invalid;
            }

            if (n->params.zasl &&
                data_size > (uint64_t)n->page_size << n->params.zasl) {
                trace_pci_nvme_err_zasl(data_size);
                return NVME_INVALID_FIELD | NVME_DNR;
            }

            slba = zone->w_ptr;
            rw->slba = cpu_to_le64(slba);
            res->slba = cpu_to_le64(slba);

            switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            case NVME_ID_NS_DPS_TYPE_1:
                if (!piremap) {
                    return NVME_INVALID_PROT_INFO | NVME_DNR;
                }

                /* fall through */

            case NVME_ID_NS_DPS_TYPE_2:
                if (piremap) {
                    uint32_t reftag = le32_to_cpu(rw->reftag);
                    rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
                }

                break;

            case NVME_ID_NS_DPS_TYPE_3:
                if (piremap) {
                    return NVME_INVALID_PROT_INFO | NVME_DNR;
                }

                break;
            }
        }

        status = nvme_check_zone_write(ns, zone, slba, nlb);
        if (status) {
            goto invalid;
        }

        status = nvme_zrm_auto(n, ns, zone);
        if (status) {
            goto invalid;
        }

        zone->w_ptr += nlb;
    }

    data_offset = nvme_l2b(ns, slba);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    if (!wrz) {
        status = nvme_map_data(n, nlb, req);
        if (status) {
            goto invalid;
        }

        block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                         BLOCK_ACCT_WRITE);
        nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
    } else {
        req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
                                           BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
                                           req);
    }

    return NVME_NO_COMPLETE;

invalid:
    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
    return status | NVME_DNR;
}

static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, false, false);
}

static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, false, true);
}

static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, true, false);
}

static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
                                            uint64_t *slba, uint32_t *zone_idx)
{
    uint32_t dw10 = le32_to_cpu(c->cdw10);
    uint32_t dw11 = le32_to_cpu(c->cdw11);

    if (!ns->params.zoned) {
        trace_pci_nvme_err_invalid_opc(c->opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    *slba = ((uint64_t)dw11) << 32 | dw10;
    if (unlikely(*slba >= ns->id_ns.nsze)) {
        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
        *slba = 0;
        return NVME_LBA_RANGE | NVME_DNR;
    }

    *zone_idx = nvme_zone_idx(ns, *slba);
    assert(*zone_idx < ns->num_zones);

    return NVME_SUCCESS;
}

typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
                                 NvmeRequest *);

enum NvmeZoneProcessingMask {
    NVME_PROC_CURRENT_ZONE    = 0,
    NVME_PROC_OPENED_ZONES    = 1 << 0,
    NVME_PROC_CLOSED_ZONES    = 1 << 1,
    NVME_PROC_READ_ONLY_ZONES = 1 << 2,
    NVME_PROC_FULL_ZONES      = 1 << 3,
};

static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
                               NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_open(nvme_ctrl(req), ns, zone);
}

static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
                                NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_close(ns, zone);
}

static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
                                 NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_finish(ns, zone);
}

static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
                                  NvmeZoneState state, NvmeRequest *req)
{
    switch (state) {
    case NVME_ZONE_STATE_READ_ONLY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
        /* fall through */
    case NVME_ZONE_STATE_OFFLINE:
        return NVME_SUCCESS;
    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
{
    uint16_t status;
    uint8_t state = nvme_get_zone_state(zone);

    if (state == NVME_ZONE_STATE_EMPTY) {
        status = nvme_aor_check(ns, 1, 0);
        if (status) {
            return status;
        }
        nvme_aor_inc_active(ns);
        zone->d.za |= NVME_ZA_ZD_EXT_VALID;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        return NVME_SUCCESS;
    }

    return NVME_ZONE_INVAL_TRANSITION;
}

static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
                                    enum NvmeZoneProcessingMask proc_mask,
                                    op_handler_t op_hndlr, NvmeRequest *req)
{
    uint16_t status = NVME_SUCCESS;
    NvmeZoneState zs = nvme_get_zone_state(zone);
    bool proc_zone;

    switch (zs) {
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
        break;
    case NVME_ZONE_STATE_CLOSED:
        proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
        break;
    case NVME_ZONE_STATE_READ_ONLY:
        proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
        break;
    case NVME_ZONE_STATE_FULL:
        proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
        break;
    default:
        proc_zone = false;
    }

    if (proc_zone) {
        status = op_hndlr(ns, zone, zs, req);
    }

    return status;
}

static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
                                enum NvmeZoneProcessingMask proc_mask,
                                op_handler_t op_hndlr, NvmeRequest *req)
{
    NvmeZone *next;
    uint16_t status = NVME_SUCCESS;
    int i;

    if (!proc_mask) {
        status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
    } else {
        if (proc_mask & NVME_PROC_CLOSED_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
        if (proc_mask & NVME_PROC_OPENED_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }

            QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
        if (proc_mask & NVME_PROC_FULL_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }

        if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
            for (i = 0; i < ns->num_zones; i++, zone++) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
    }

out:
    return status;
}

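/*
 * Zone reset is asynchronous: nvme_zone_reset_cb() walks the zone array
 * (a single zone unless the "all" flag is set), resets each eligible zone
 * in the zone state machine and zeroes its data; when the namespace has
 * metadata, nvme_zone_reset_epilogue_cb() zeroes that as well before
 * continuing with the next zone.
 */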
typedef struct NvmeZoneResetAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    bool all;
    int idx;
    NvmeZone *zone;
} NvmeZoneResetAIOCB;

static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
{
    NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;

    iocb->idx = ns->num_zones;

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
        iocb->aiocb = NULL;
    }
}

static const AIOCBInfo nvme_zone_reset_aiocb_info = {
    .aiocb_size = sizeof(NvmeZoneResetAIOCB),
    .cancel_async = nvme_zone_reset_cancel,
};

static void nvme_zone_reset_bh(void *opaque)
{
    NvmeZoneResetAIOCB *iocb = opaque;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);
}

static void nvme_zone_reset_cb(void *opaque, int ret);

static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
{
    NvmeZoneResetAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    int64_t moff;
    int count;

    if (ret < 0) {
        nvme_zone_reset_cb(iocb, ret);
        return;
    }

    if (!ns->lbaf.ms) {
        nvme_zone_reset_cb(iocb, 0);
        return;
    }

    moff = nvme_moff(ns, iocb->zone->d.zslba);
    count = nvme_m2b(ns, ns->zone_size);

    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
                                        BDRV_REQ_MAY_UNMAP,
                                        nvme_zone_reset_cb, iocb);
    return;
}

static void nvme_zone_reset_cb(void *opaque, int ret)
{
    NvmeZoneResetAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

    if (iocb->zone) {
        nvme_zrm_reset(ns, iocb->zone);

        if (!iocb->all) {
            goto done;
        }
    }

    while (iocb->idx < ns->num_zones) {
        NvmeZone *zone = &ns->zone_array[iocb->idx++];

        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EMPTY:
            if (!iocb->all) {
                goto done;
            }

            continue;

        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        case NVME_ZONE_STATE_CLOSED:
        case NVME_ZONE_STATE_FULL:
            iocb->zone = zone;
            break;

        default:
            continue;
        }

        trace_pci_nvme_zns_zone_reset(zone->d.zslba);

        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
                                            nvme_l2b(ns, zone->d.zslba),
                                            nvme_l2b(ns, ns->zone_size),
                                            BDRV_REQ_MAY_UNMAP,
                                            nvme_zone_reset_epilogue_cb,
                                            iocb);
        return;
    }

done:
    iocb->aiocb = NULL;
    if (iocb->bh) {
        qemu_bh_schedule(iocb->bh);
    }
}

static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    NvmeZone *zone;
    NvmeZoneResetAIOCB *iocb;
    uint8_t *zd_ext;
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint64_t slba = 0;
    uint32_t zone_idx = 0;
    uint16_t status;
    uint8_t action;
    bool all;
    enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;

    action = dw13 & 0xff;
    all = !!(dw13 & 0x100);

    req->status = NVME_SUCCESS;

    if (!all) {
        status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
        if (status) {
            return status;
        }
    }

    zone = &ns->zone_array[zone_idx];
    if (slba != zone->d.zslba) {
        trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    switch (action) {

    case NVME_ZONE_ACTION_OPEN:
        if (all) {
            proc_mask = NVME_PROC_CLOSED_ZONES;
        }
        trace_pci_nvme_open_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
        break;

    case NVME_ZONE_ACTION_CLOSE:
        if (all) {
            proc_mask = NVME_PROC_OPENED_ZONES;
        }
        trace_pci_nvme_close_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
        break;

    case NVME_ZONE_ACTION_FINISH:
        if (all) {
            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
        }
        trace_pci_nvme_finish_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
        break;

    case NVME_ZONE_ACTION_RESET:
        trace_pci_nvme_reset_zone(slba, zone_idx, all);

        iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
                           nvme_misc_cb, req);

        iocb->req = req;
        iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
        iocb->ret = 0;
        iocb->all = all;
        iocb->idx = zone_idx;
        iocb->zone = NULL;

        req->aiocb = &iocb->common;
        nvme_zone_reset_cb(iocb, 0);

        return NVME_NO_COMPLETE;

    case NVME_ZONE_ACTION_OFFLINE:
        if (all) {
            proc_mask = NVME_PROC_READ_ONLY_ZONES;
        }
        trace_pci_nvme_offline_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
        break;

    case NVME_ZONE_ACTION_SET_ZD_EXT:
        trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
        if (all || !ns->params.zd_extension_size) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
        zd_ext = nvme_get_zd_extension(ns, zone_idx);
        status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
        if (status) {
            trace_pci_nvme_err_zd_extension_map_error(zone_idx);
            return status;
        }

        status = nvme_set_zd_ext(ns, zone);
        if (status == NVME_SUCCESS) {
            trace_pci_nvme_zd_extension_set(zone_idx);
            return status;
        }
        break;

    default:
        trace_pci_nvme_err_invalid_mgmt_action(action);
        status = NVME_INVALID_FIELD;
    }

    if (status == NVME_ZONE_INVAL_TRANSITION) {
        trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
                                                         zone->d.za);
    }
    if (status) {
        status |= NVME_DNR;
    }

    return status;
}

static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
{
    NvmeZoneState zs = nvme_get_zone_state(zl);

    switch (zafs) {
    case NVME_ZONE_REPORT_ALL:
        return true;
    case NVME_ZONE_REPORT_EMPTY:
        return zs == NVME_ZONE_STATE_EMPTY;
    case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
        return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
    case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
        return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
    case NVME_ZONE_REPORT_CLOSED:
        return zs == NVME_ZONE_STATE_CLOSED;
    case NVME_ZONE_REPORT_FULL:
        return zs == NVME_ZONE_STATE_FULL;
    case NVME_ZONE_REPORT_READ_ONLY:
        return zs == NVME_ZONE_STATE_READ_ONLY;
    case NVME_ZONE_REPORT_OFFLINE:
        return zs == NVME_ZONE_STATE_OFFLINE;
    default:
        return false;
    }
}

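/*
 * Zone Management Receive (report zones). The response buffer is sized from
 * CDW12 (number of dwords) and holds a report header followed by one zone
 * descriptor (plus the zone descriptor extension, if extended reporting is
 * requested) for every zone matching the filter in CDW13. The "partial" bit
 * controls whether the zone count in the header is capped at the number of
 * descriptors that fit in the buffer.
 */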
static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;

    uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint32_t zone_idx, zra, zrasf, partial;
    uint64_t max_zones, nr_zones = 0;
    uint16_t status;
    uint64_t slba;
    NvmeZoneDescr *z;
    NvmeZone *zone;
    NvmeZoneReportHeader *header;
    void *buf, *buf_p;
    size_t zone_entry_sz;
    int i;

    req->status = NVME_SUCCESS;

    status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
    if (status) {
        return status;
    }

    zra = dw13 & 0xff;
    if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    zrasf = (dw13 >> 8) & 0xff;
    if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (data_size < sizeof(NvmeZoneReportHeader)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    status = nvme_check_mdts(n, data_size);
    if (status) {
        return status;
    }

    partial = (dw13 >> 16) & 0x01;

    zone_entry_sz = sizeof(NvmeZoneDescr);
    if (zra == NVME_ZONE_REPORT_EXTENDED) {
        zone_entry_sz += ns->params.zd_extension_size;
    }

    max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
    buf = g_malloc0(data_size);

    zone = &ns->zone_array[zone_idx];
    for (i = zone_idx; i < ns->num_zones; i++) {
        if (partial && nr_zones >= max_zones) {
            break;
        }
        if (nvme_zone_matches_filter(zrasf, zone++)) {
            nr_zones++;
        }
    }
    header = (NvmeZoneReportHeader *)buf;
    header->nr_zones = cpu_to_le64(nr_zones);

    buf_p = buf + sizeof(NvmeZoneReportHeader);
    for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
        zone = &ns->zone_array[zone_idx];
        if (nvme_zone_matches_filter(zrasf, zone)) {
            z = (NvmeZoneDescr *)buf_p;
            buf_p += sizeof(NvmeZoneDescr);

            z->zt = zone->d.zt;
            z->zs = zone->d.zs;
            z->zcap = cpu_to_le64(zone->d.zcap);
            z->zslba = cpu_to_le64(zone->d.zslba);
            z->za = zone->d.za;

            if (nvme_wp_is_valid(zone)) {
                z->wp = cpu_to_le64(zone->d.wp);
            } else {
                z->wp = cpu_to_le64(~0ULL);
            }

            if (zra == NVME_ZONE_REPORT_EXTENDED) {
                if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
                    memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
                           ns->params.zd_extension_size);
                }
                buf_p += ns->params.zd_extension_size;
            }

            max_zones--;
        }
    }

    status = nvme_c2h(n, (uint8_t *)buf, data_size, req);

    g_free(buf);

    return status;
}

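/*
 * I/O command dispatch. Flush is special-cased before the namespace lookup
 * because it may use the broadcast NSID; all other opcodes are validated
 * against the namespace's command-set effects table (ns->iocs) first.
 */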
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);

    trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
                          req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));

    if (!nvme_nsid_valid(n, nsid)) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    /*
     * In the base NVM command set, Flush may apply to all namespaces
     * (indicated by an NSID of FFFFFFFFh). If multiple I/O command sets are
     * enabled (TP 4056, Namespace Types), a broadcast NSID cannot be
     * associated with a single command set, so the opcode alone is
     * ambiguous. Since this controller only supports command sets that
     * include the NVM command set, it is safe to interpret opcode 0h with
     * the broadcast NSID as the NVM Flush command and handle it before the
     * namespace lookup below.
     */
    if (req->cmd.opcode == NVME_CMD_FLUSH) {
        return nvme_flush(n, req);
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    if (ns->status) {
        return ns->status;
    }

    if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
        return NVME_INVALID_FIELD;
    }

    req->ns = ns;

    switch (req->cmd.opcode) {
    case NVME_CMD_WRITE_ZEROES:
        return nvme_write_zeroes(n, req);
    case NVME_CMD_ZONE_APPEND:
        return nvme_zone_append(n, req);
    case NVME_CMD_WRITE:
        return nvme_write(n, req);
    case NVME_CMD_READ:
        return nvme_read(n, req);
    case NVME_CMD_COMPARE:
        return nvme_compare(n, req);
    case NVME_CMD_DSM:
        return nvme_dsm(n, req);
    case NVME_CMD_VERIFY:
        return nvme_verify(n, req);
    case NVME_CMD_COPY:
        return nvme_copy(n, req);
    case NVME_CMD_ZONE_MGMT_SEND:
        return nvme_zone_mgmt_send(n, req);
    case NVME_CMD_ZONE_MGMT_RECV:
        return nvme_zone_mgmt_recv(n, req);
    default:
        assert(false);
    }

    return NVME_INVALID_OPCODE | NVME_DNR;
}

static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
    n->sq[sq->sqid] = NULL;
    timer_free(sq->timer);
    g_free(sq->io_req);
    if (sq->sqid) {
        g_free(sq);
    }
}

static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
    NvmeRequest *r, *next;
    NvmeSQueue *sq;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
        trace_pci_nvme_err_invalid_del_sq(qid);
        return NVME_INVALID_QID | NVME_DNR;
    }

    trace_pci_nvme_del_sq(qid);

    sq = n->sq[qid];
    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
        r = QTAILQ_FIRST(&sq->out_req_list);
        assert(r->aiocb);
        blk_aio_cancel(r->aiocb);
    }

    assert(QTAILQ_EMPTY(&sq->out_req_list));

    if (!nvme_check_cqid(n, sq->cqid)) {
        cq = n->cq[sq->cqid];
        QTAILQ_REMOVE(&cq->sq_list, sq, entry);

        nvme_post_cqes(cq);
        QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
            if (r->sq == sq) {
                QTAILQ_REMOVE(&cq->req_list, r, entry);
                QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
            }
        }
    }

    nvme_free_sq(sq, n);
    return NVME_SUCCESS;
}

static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t sqid, uint16_t cqid, uint16_t size)
{
    int i;
    NvmeCQueue *cq;

    sq->ctrl = n;
    sq->dma_addr = dma_addr;
    sq->sqid = sqid;
    sq->size = size;
    sq->cqid = cqid;
    sq->head = sq->tail = 0;
    sq->io_req = g_new0(NvmeRequest, sq->size);

    QTAILQ_INIT(&sq->req_list);
    QTAILQ_INIT(&sq->out_req_list);
    for (i = 0; i < sq->size; i++) {
        sq->io_req[i].sq = sq;
        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
    }
    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);

    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
    n->sq[sqid] = sq;
}

static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeSQueue *sq;
    NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;

    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t sqid = le16_to_cpu(c->sqid);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->sq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);

    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
        trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
                 n->sq[sqid] != NULL)) {
        trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
        trace_pci_nvme_err_invalid_create_sq_size(qsize);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (unlikely(prp1 & (n->page_size - 1))) {
        trace_pci_nvme_err_invalid_create_sq_addr(prp1);
        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
    }
    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
        trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    sq = g_malloc0(sizeof(*sq));
    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
    return NVME_SUCCESS;
}

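/*
 * Helpers for the SMART / Health Information log page. The specification
 * counts data units in thousands of 512-byte units, rounded up; that is the
 * format nvme_smart_info() converts these raw byte and op counters into.
 */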
struct nvme_stats {
    uint64_t units_read;
    uint64_t units_written;
    uint64_t read_commands;
    uint64_t write_commands;
};

static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
{
    BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);

    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
    stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
    stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
}

static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    struct nvme_stats stats = { 0 };
    NvmeSmartLog smart = { 0 };
    uint32_t trans_len;
    NvmeNamespace *ns;
    time_t current_ms;

    if (off >= sizeof(smart)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nsid != 0xffffffff) {
        ns = nvme_ns(n, nsid);
        if (!ns) {
            return NVME_INVALID_NSID | NVME_DNR;
        }
        nvme_set_blk_stats(ns, &stats);
    } else {
        int i;

        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }
            nvme_set_blk_stats(ns, &stats);
        }
    }

    trans_len = MIN(sizeof(smart) - off, buf_len);
    smart.critical_warning = n->smart_critical_warning;

    smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
                                                        1000));
    smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
                                                           1000));
    smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
    smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);

    smart.temperature = cpu_to_le16(n->temperature);

    if ((n->temperature >= n->features.temp_thresh_hi) ||
        (n->temperature <= n->features.temp_thresh_low)) {
        smart.critical_warning |= NVME_SMART_TEMPERATURE;
    }

    current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    smart.power_on_hours[0] =
        cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_SMART);
    }

    return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
}

static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
                                 NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeFwSlotInfoLog fw_log = {
        .afi = 0x1,
    };

    if (off >= sizeof(fw_log)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
    trans_len = MIN(sizeof(fw_log) - off, buf_len);

    return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
}

static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeErrorLog errlog;

    if (off >= sizeof(errlog)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
    }

    memset(&errlog, 0x0, sizeof(errlog));
    trans_len = MIN(sizeof(errlog) - off, buf_len);

    return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
}

static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                    uint64_t off, NvmeRequest *req)
{
    uint32_t nslist[1024];
    uint32_t trans_len;
    int i = 0;
    uint32_t nsid;

    if (off >= sizeof(nslist)) {
        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    memset(nslist, 0x0, sizeof(nslist));
    trans_len = MIN(sizeof(nslist) - off, buf_len);

    while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
            NVME_CHANGED_NSID_SIZE) {
        /*
         * If more than 1024 namespaces have changed, the first entry in the
         * log page is set to FFFFFFFFh and the remainder zeroed, as required
         * by the specification.
         */
        if (i == ARRAY_SIZE(nslist)) {
            memset(nslist, 0x0, sizeof(nslist));
            nslist[0] = 0xffffffff;
            break;
        }

        nslist[i++] = nsid;
        clear_bit(nsid, n->changed_nsids);
    }

    /*
     * Clear all remaining entries in the bitmap if the overflow marker was
     * set; the loop above broke out before consuming them.
     */
    if (nslist[0] == 0xffffffff) {
        bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
    }

    return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
}

static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
                                 uint64_t off, NvmeRequest *req)
{
    NvmeEffectsLog log = {};
    const uint32_t *src_iocs = NULL;
    uint32_t trans_len;

    if (off >= sizeof(log)) {
        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
    case NVME_CC_CSS_NVM:
        src_iocs = nvme_cse_iocs_nvm;
        /* fall through */
    case NVME_CC_CSS_ADMIN_ONLY:
        break;
    case NVME_CC_CSS_CSI:
        switch (csi) {
        case NVME_CSI_NVM:
            src_iocs = nvme_cse_iocs_nvm;
            break;
        case NVME_CSI_ZONED:
            src_iocs = nvme_cse_iocs_zoned;
            break;
        }
    }

    memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));

    if (src_iocs) {
        memcpy(log.iocs, src_iocs, sizeof(log.iocs));
    }

    trans_len = MIN(sizeof(log) - off, buf_len);

    return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
}

static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;

    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t dw12 = le32_to_cpu(cmd->cdw12);
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint8_t lid = dw10 & 0xff;
    uint8_t lsp = (dw10 >> 8) & 0xf;
    uint8_t rae = (dw10 >> 15) & 0x1;
    uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
    uint32_t numdl, numdu;
    uint64_t off, lpol, lpou;
    size_t len;
    uint16_t status;

    numdl = (dw10 >> 16);
    numdu = (dw11 & 0xffff);
    lpol = dw12;
    lpou = dw13;

    len = (((numdu << 16) | numdl) + 1) << 2;
    off = (lpou << 32ULL) | lpol;

    if (off & 0x3) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;
    }

    switch (lid) {
    case NVME_LOG_ERROR_INFO:
        return nvme_error_info(n, rae, len, off, req);
    case NVME_LOG_SMART_INFO:
        return nvme_smart_info(n, rae, len, off, req);
    case NVME_LOG_FW_SLOT_INFO:
        return nvme_fw_log_info(n, len, off, req);
    case NVME_LOG_CHANGED_NSLIST:
        return nvme_changed_nslist(n, rae, len, off, req);
    case NVME_LOG_CMD_EFFECTS:
        return nvme_cmd_effects(n, csi, len, off, req);
    default:
        trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
        return NVME_INVALID_FIELD | NVME_DNR;
    }
}

static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
    n->cq[cq->cqid] = NULL;
    timer_free(cq->timer);
    if (msix_enabled(&n->parent_obj)) {
        msix_vector_unuse(&n->parent_obj, cq->vector);
    }
    if (cq->cqid) {
        g_free(cq);
    }
}

static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
        trace_pci_nvme_err_invalid_del_cq_cqid(qid);
        return NVME_INVALID_CQID | NVME_DNR;
    }

    cq = n->cq[qid];
    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
        trace_pci_nvme_err_invalid_del_cq_notempty(qid);
        return NVME_INVALID_QUEUE_DEL;
    }

    if (cq->irq_enabled && cq->tail != cq->head) {
        n->cq_pending--;
    }

    nvme_irq_deassert(n, cq);
    trace_pci_nvme_del_cq(qid);
    nvme_free_cq(cq, n);
    return NVME_SUCCESS;
}

static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t cqid, uint16_t vector, uint16_t size,
                         uint16_t irq_enabled)
{
    int ret;

    if (msix_enabled(&n->parent_obj)) {
        ret = msix_vector_use(&n->parent_obj, vector);
        assert(ret == 0);
    }
    cq->ctrl = n;
    cq->cqid = cqid;
    cq->size = size;
    cq->dma_addr = dma_addr;
    cq->phase = 1;
    cq->irq_enabled = irq_enabled;
    cq->vector = vector;
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
    n->cq[cqid] = cq;
    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}

static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCQueue *cq;
    NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t vector = le16_to_cpu(c->irq_vector);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->cq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
                             NVME_CQ_FLAGS_IEN(qflags) != 0);

    if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
                 n->cq[cqid] != NULL)) {
        trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
        trace_pci_nvme_err_invalid_create_cq_size(qsize);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (unlikely(prp1 & (n->page_size - 1))) {
        trace_pci_nvme_err_invalid_create_cq_addr(prp1);
        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
    }
    if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
        trace_pci_nvme_err_invalid_create_cq_vector(vector);
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (unlikely(vector >= n->params.msix_qsize)) {
        trace_pci_nvme_err_invalid_create_cq_vector(vector);
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
        trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    cq = g_malloc0(sizeof(*cq));
    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
                 NVME_CQ_FLAGS_IEN(qflags));

    /*
     * It is only required to set qs_created when creating a completion
     * queue; creating a submission queue without a matching completion
     * queue will fail.
     */
    n->qs_created = true;
    return NVME_SUCCESS;
}

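/*
 * Several Identify CNS values must return an all-zeroes data structure
 * rather than an error when the targeted namespace does not exist or does
 * not match the requested command set; this helper produces that reply.
 */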
static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};

    return nvme_c2h(n, id, sizeof(id), req);
}

static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_identify_ctrl();

    return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
}

static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
    NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;

    trace_pci_nvme_identify_ctrl_csi(c->csi);

    switch (c->csi) {
    case NVME_CSI_NVM:
        id_nvm->vsl = n->params.vsl;
        id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
        break;

    case NVME_CSI_ZONED:
        ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
        break;

    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return nvme_c2h(n, id, sizeof(id), req);
}

static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);

    trace_pci_nvme_identify_ns(nsid);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        if (!active) {
            ns = nvme_subsys_ns(n->subsys, nsid);
            if (!ns) {
                return nvme_rpt_empty_id_struct(n, req);
            }
        } else {
            return nvme_rpt_empty_id_struct(n, req);
        }
    }

    if (active || ns->csi == NVME_CSI_NVM) {
        return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
    }

    return NVME_INVALID_CMD_SET | NVME_DNR;
}

static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
                                        bool attached)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint16_t min_id = le16_to_cpu(c->ctrlid);
    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
    uint16_t *ids = &list[1];
    NvmeNamespace *ns;
    NvmeCtrl *ctrl;
    int cntlid, nr_ids = 0;

    trace_pci_nvme_identify_ctrl_list(c->cns, min_id);

    if (!n->subsys) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (attached) {
        if (nsid == NVME_NSID_BROADCAST) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ns = nvme_subsys_ns(n->subsys, nsid);
        if (!ns) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
        ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
        if (!ctrl) {
            continue;
        }

        if (attached && !nvme_ns(ctrl, nsid)) {
            continue;
        }

        ids[nr_ids++] = cntlid;
    }

    list[0] = nr_ids;

    return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
}

static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
                                     bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);

    trace_pci_nvme_identify_ns_csi(nsid, c->csi);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        if (!active) {
            ns = nvme_subsys_ns(n->subsys, nsid);
            if (!ns) {
                return nvme_rpt_empty_id_struct(n, req);
            }
        } else {
            return nvme_rpt_empty_id_struct(n, req);
        }
    }

    if (c->csi == NVME_CSI_NVM) {
        return nvme_rpt_empty_id_struct(n, req);
    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
        return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
                        req);
    }

    return NVME_INVALID_FIELD | NVME_DNR;
}

static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
                                     bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t min_nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);
    uint32_t *list_ptr = (uint32_t *)list;
    int i, j = 0;

    trace_pci_nvme_identify_nslist(min_nsid);

    /*
     * Both 0xffffffff (NVME_NSID_BROADCAST) and 0xfffffffe are invalid
     * values for CDW10: the returned list only contains namespace ids
     * strictly greater than min_nsid, and no valid nsid exceeds 0xfffffffe.
     */
    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            if (!active) {
                ns = nvme_subsys_ns(n->subsys, i);
                if (!ns) {
                    continue;
                }
            } else {
                continue;
            }
        }
        if (ns->params.nsid <= min_nsid) {
            continue;
        }
        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
        if (j == data_len / sizeof(uint32_t)) {
            break;
        }
    }

    return nvme_c2h(n, list, data_len, req);
}

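/*
 * Command set specific variant of nvme_identify_nslist(): only namespaces
 * whose I/O command set matches the CSI field are returned (the
 * NVME_ID_CNS_CS_NS_ACTIVE_LIST and NVME_ID_CNS_CS_NS_PRESENT_LIST cases).
 */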
static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
                                         bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t min_nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);
    uint32_t *list_ptr = (uint32_t *)list;
    int i, j = 0;

    trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);

    /*
     * Same restriction as in nvme_identify_nslist(): 0xffffffff and
     * 0xfffffffe are invalid starting values.
     */
    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            if (!active) {
                ns = nvme_subsys_ns(n->subsys, i);
                if (!ns) {
                    continue;
                }
            } else {
                continue;
            }
        }
        if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
            continue;
        }
        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
        if (j == data_len / sizeof(uint32_t)) {
            break;
        }
    }

    return nvme_c2h(n, list, data_len, req);
}

static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    uint8_t *pos = list;
    struct {
        NvmeIdNsDescr hdr;
        uint8_t v[NVME_NIDL_UUID];
    } QEMU_PACKED uuid = {};
    struct {
        NvmeIdNsDescr hdr;
        uint64_t v;
    } QEMU_PACKED eui64 = {};
    struct {
        NvmeIdNsDescr hdr;
        uint8_t v;
    } QEMU_PACKED csi = {};

    trace_pci_nvme_identify_ns_descr_list(nsid);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    /*
     * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
     * provide a valid Namespace UUID in the Namespace Identification
     * Descriptor data structure. A UUID descriptor is therefore always
     * reported; the EUI-64 descriptor is only included when one has been
     * configured.
     */
    uuid.hdr.nidt = NVME_NIDT_UUID;
    uuid.hdr.nidl = NVME_NIDL_UUID;
    memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
    memcpy(pos, &uuid, sizeof(uuid));
    pos += sizeof(uuid);

    if (ns->params.eui64) {
        eui64.hdr.nidt = NVME_NIDT_EUI64;
        eui64.hdr.nidl = NVME_NIDL_EUI64;
        eui64.v = cpu_to_be64(ns->params.eui64);
        memcpy(pos, &eui64, sizeof(eui64));
        pos += sizeof(eui64);
    }

    csi.hdr.nidt = NVME_NIDT_CSI;
    csi.hdr.nidl = NVME_NIDL_CSI;
    csi.v = ns->csi;
    memcpy(pos, &csi, sizeof(csi));
    pos += sizeof(csi);

    return nvme_c2h(n, list, sizeof(list), req);
}

static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);

    trace_pci_nvme_identify_cmd_set();

    NVME_SET_CSI(*list, NVME_CSI_NVM);
    NVME_SET_CSI(*list, NVME_CSI_ZONED);

    return nvme_c2h(n, list, data_len, req);
}

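/*
 * Identify dispatch: the CNS field selects which identify data structure to
 * return; unsupported values are rejected with Invalid Field in Command.
 */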
static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;

    trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
                            c->csi);

    switch (c->cns) {
    case NVME_ID_CNS_NS:
        return nvme_identify_ns(n, req, true);
    case NVME_ID_CNS_NS_PRESENT:
        return nvme_identify_ns(n, req, false);
    case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
        return nvme_identify_ctrl_list(n, req, true);
    case NVME_ID_CNS_CTRL_LIST:
        return nvme_identify_ctrl_list(n, req, false);
    case NVME_ID_CNS_CS_NS:
        return nvme_identify_ns_csi(n, req, true);
    case NVME_ID_CNS_CS_NS_PRESENT:
        return nvme_identify_ns_csi(n, req, false);
    case NVME_ID_CNS_CTRL:
        return nvme_identify_ctrl(n, req);
    case NVME_ID_CNS_CS_CTRL:
        return nvme_identify_ctrl_csi(n, req);
    case NVME_ID_CNS_NS_ACTIVE_LIST:
        return nvme_identify_nslist(n, req, true);
    case NVME_ID_CNS_NS_PRESENT_LIST:
        return nvme_identify_nslist(n, req, false);
    case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
        return nvme_identify_nslist_csi(n, req, true);
    case NVME_ID_CNS_CS_NS_PRESENT_LIST:
        return nvme_identify_nslist_csi(n, req, false);
    case NVME_ID_CNS_NS_DESCR_LIST:
        return nvme_identify_ns_descr_list(n, req);
    case NVME_ID_CNS_IO_COMMAND_SET:
        return nvme_identify_cmd_set(n, req);
    default:
        trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
        return NVME_INVALID_FIELD | NVME_DNR;
    }
}

static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
{
    uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;

    req->cqe.result = 1;
    if (nvme_check_sqid(n, sqid)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

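/*
 * Timestamp bookkeeping: the host supplied timestamp is stored along with
 * the QEMU virtual clock time at which it was set; nvme_get_timestamp()
 * reconstructs the current value by adding the elapsed virtual time.
 */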
static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
{
    trace_pci_nvme_setfeat_timestamp(ts);

    n->host_timestamp = le64_to_cpu(ts);
    n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
}

static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
{
    uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;

    union nvme_timestamp {
        struct {
            uint64_t timestamp:48;
            uint64_t sync:1;
            uint64_t origin:3;
            uint64_t rsvd1:12;
        };
        uint64_t all;
    };

    union nvme_timestamp ts;
    ts.all = 0;
    ts.timestamp = n->host_timestamp + elapsed_time;

    /* If the host timestamp is non-zero, set the timestamp origin */
    ts.origin = n->host_timestamp ? 0x01 : 0x00;

    trace_pci_nvme_getfeat_timestamp(ts.all);

    return cpu_to_le64(ts.all);
}

static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
    uint64_t timestamp = nvme_get_timestamp(n);

    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
}

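/*
 * Get Features: the select field chooses between the current value, the
 * default, the saved value (nothing is saveable, so this falls back to the
 * defaults) and the feature capabilities.
 */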
static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t nsid = le32_to_cpu(cmd->nsid);
    uint32_t result;
    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
    NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
    uint16_t iv;
    NvmeNamespace *ns;
    int i;

    static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
        [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
    };

    trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);

    if (!nvme_feature_support[fid]) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
        if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
            /*
             * The Reservation Notification Mask and Reservation Persistence
             * features require a status code of Invalid Field in Command
             * when the NSID is 0xffffffff; since these features are not
             * supported, Invalid Namespace or Format is returned here, as
             * for all other per-namespace features.
             */
            return NVME_INVALID_NSID | NVME_DNR;
        }

        if (!nvme_ns(n, nsid)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    switch (sel) {
    case NVME_GETFEAT_SELECT_CURRENT:
        break;
    case NVME_GETFEAT_SELECT_SAVED:
        /* no features are saveable by the controller; fallthrough */
    case NVME_GETFEAT_SELECT_DEFAULT:
        goto defaults;
    case NVME_GETFEAT_SELECT_CAP:
        result = nvme_feature_cap[fid];
        goto out;
    }

    switch (fid) {
    case NVME_TEMPERATURE_THRESHOLD:
        result = 0;

        /*
         * The controller only implements the Composite Temperature sensor,
         * so return 0 for all other sensors.
         */
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
            goto out;
        }

        switch (NVME_TEMP_THSEL(dw11)) {
        case NVME_TEMP_THSEL_OVER:
            result = n->features.temp_thresh_hi;
            goto out;
        case NVME_TEMP_THSEL_UNDER:
            result = n->features.temp_thresh_low;
            goto out;
        }

        return NVME_INVALID_FIELD | NVME_DNR;
    case NVME_ERROR_RECOVERY:
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        ns = nvme_ns(n, nsid);
        if (unlikely(!ns)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        result = ns->features.err_rec;
        goto out;
    case NVME_VOLATILE_WRITE_CACHE:
        result = 0;
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }

            result = blk_enable_write_cache(ns->blkconf.blk);
            if (result) {
                break;
            }
        }
        trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
        goto out;
    case NVME_ASYNCHRONOUS_EVENT_CONF:
        result = n->features.async_config;
        goto out;
    case NVME_TIMESTAMP:
        return nvme_get_feature_timestamp(n, req);
    default:
        break;
    }

defaults:
    switch (fid) {
    case NVME_TEMPERATURE_THRESHOLD:
        result = 0;

        /*
         * The controller only implements the Composite Temperature sensor,
         * so return 0 for all other sensors.
         */
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
            break;
        }

        if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
            result = NVME_TEMPERATURE_WARNING;
        }

        break;
    case NVME_NUMBER_OF_QUEUES:
        result = (n->params.max_ioqpairs - 1) |
            ((n->params.max_ioqpairs - 1) << 16);
        trace_pci_nvme_getfeat_numq(result);
        break;
    case NVME_INTERRUPT_VECTOR_CONF:
        iv = dw11 & 0xffff;
        if (iv >= n->params.max_ioqpairs + 1) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        result = iv;
        if (iv == n->admin_cq.vector) {
            result |= NVME_INTVC_NOCOALESCING;
        }
        break;
    default:
        result = nvme_feature_default[fid];
        break;
    }

out:
    req->cqe.result = cpu_to_le32(result);
    return NVME_SUCCESS;
}

static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
    uint16_t ret;
    uint64_t timestamp;

    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
    if (ret) {
        return ret;
    }

    nvme_set_timestamp(n, timestamp);

    return NVME_SUCCESS;
}

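/*
 * Set Features: only features marked changeable in nvme_feature_cap[] can
 * be modified, and the save bit is rejected since no feature is saveable.
 */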
static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = NULL;

    NvmeCmd *cmd = &req->cmd;
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t nsid = le32_to_cpu(cmd->nsid);
    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
    uint8_t save = NVME_SETFEAT_SAVE(dw10);
    int i;

    trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);

    if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
        return NVME_FID_NOT_SAVEABLE | NVME_DNR;
    }

    if (!nvme_feature_support[fid]) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
        if (nsid != NVME_NSID_BROADCAST) {
            if (!nvme_nsid_valid(n, nsid)) {
                return NVME_INVALID_NSID | NVME_DNR;
            }

            ns = nvme_ns(n, nsid);
            if (unlikely(!ns)) {
                return NVME_INVALID_FIELD | NVME_DNR;
            }
        }
    } else if (nsid && nsid != NVME_NSID_BROADCAST) {
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
    }

    if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
    }

    switch (fid) {
    case NVME_TEMPERATURE_THRESHOLD:
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
            break;
        }

        switch (NVME_TEMP_THSEL(dw11)) {
        case NVME_TEMP_THSEL_OVER:
            n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
            break;
        case NVME_TEMP_THSEL_UNDER:
            n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
            break;
        default:
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        if ((n->temperature >= n->features.temp_thresh_hi) ||
            (n->temperature <= n->features.temp_thresh_low)) {
            nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
        }

        break;
    case NVME_ERROR_RECOVERY:
        if (nsid == NVME_NSID_BROADCAST) {
            for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
                ns = nvme_ns(n, i);

                if (!ns) {
                    continue;
                }

                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
                    ns->features.err_rec = dw11;
                }
            }

            break;
        }

        assert(ns);
        if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
            ns->features.err_rec = dw11;
        }
        break;
    case NVME_VOLATILE_WRITE_CACHE:
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }

            if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
                blk_flush(ns->blkconf.blk);
            }

            blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
        }

        break;

    case NVME_NUMBER_OF_QUEUES:
        if (n->qs_created) {
            return NVME_CMD_SEQ_ERROR | NVME_DNR;
        }

        /*
         * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for
         * NCQR and NSQR.
         */
        if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
                                    ((dw11 >> 16) & 0xffff) + 1,
                                    n->params.max_ioqpairs,
                                    n->params.max_ioqpairs);
        req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
                                      ((n->params.max_ioqpairs - 1) << 16));
        break;
    case NVME_ASYNCHRONOUS_EVENT_CONF:
        n->features.async_config = dw11;
        break;
    case NVME_TIMESTAMP:
        return nvme_set_feature_timestamp(n, req);
    case NVME_COMMAND_SET_PROFILE:
        if (dw11 & 0x1ff) {
            trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
            return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
        }
        break;
    default:
        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
    }
    return NVME_SUCCESS;
}

static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_aer(nvme_cid(req));

    if (n->outstanding_aers > n->params.aerl) {
        trace_pci_nvme_aer_aerl_exceeded();
        return NVME_AER_LIMIT_EXCEEDED;
    }

    n->aer_reqs[n->outstanding_aers] = req;
    n->outstanding_aers++;

    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }

    return NVME_NO_COMPLETE;
}

static void nvme_update_dmrsl(NvmeCtrl *n)
{
    int nsid;

    for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
        NvmeNamespace *ns = nvme_ns(n, nsid);
        if (!ns) {
            continue;
        }

        n->dmrsl = MIN_NON_ZERO(n->dmrsl,
                                BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
    }
}

static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
{
    uint32_t cc = ldl_le_p(&n->bar.cc);

    ns->iocs = nvme_cse_iocs_none;
    switch (ns->csi) {
    case NVME_CSI_NVM:
        if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
            ns->iocs = nvme_cse_iocs_nvm;
        }
        break;
    case NVME_CSI_ZONED:
        if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
            ns->iocs = nvme_cse_iocs_zoned;
        } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
            ns->iocs = nvme_cse_iocs_nvm;
        }
        break;
    }
}

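/*
 * Namespace Attachment: the host transfers a controller list, and each
 * listed controller attaches or detaches the namespace according to the
 * select field in CDW10. Every affected controller gets a Namespace
 * Attribute Changed notice queued.
 */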
static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    NvmeCtrl *ctrl;
    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint8_t sel = dw10 & 0xf;
    uint16_t *nr_ids = &list[0];
    uint16_t *ids = &list[1];
    uint16_t ret;
    int i;

    trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);

    if (!nvme_nsid_valid(n, nsid)) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_subsys_ns(n->subsys, nsid);
    if (!ns) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
    if (ret) {
        return ret;
    }

    if (!*nr_ids) {
        return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
    }

    *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
    for (i = 0; i < *nr_ids; i++) {
        ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
        if (!ctrl) {
            return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
        }

        switch (sel) {
        case NVME_NS_ATTACHMENT_ATTACH:
            if (nvme_ns(ctrl, nsid)) {
                return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
            }

            if (ns->attached && !ns->params.shared) {
                return NVME_NS_PRIVATE | NVME_DNR;
            }

            nvme_attach_ns(ctrl, ns);
            nvme_select_iocs_ns(ctrl, ns);

            break;

        case NVME_NS_ATTACHMENT_DETACH:
            if (!nvme_ns(ctrl, nsid)) {
                return NVME_NS_NOT_ATTACHED | NVME_DNR;
            }

            ctrl->namespaces[nsid] = NULL;
            ns->attached--;

            nvme_update_dmrsl(ctrl);

            break;

        default:
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        /*
         * Add namespace id to the changed namespace id list for event
         * clearing via the Get Log Page command.
         */
        if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
            nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
                               NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
                               NVME_LOG_CHANGED_NSLIST);
        }
    }

    return NVME_SUCCESS;
}

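/*
 * Format NVM is implemented as an asynchronous operation: the AIOCB below
 * tracks progress while each namespace is zeroed out one
 * blk_aio_pwrite_zeroes() chunk at a time, re-entering through a bottom
 * half between namespaces when the broadcast NSID is used.
 */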
typedef struct NvmeFormatAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    QEMUBH *bh;
    NvmeRequest *req;
    int ret;

    NvmeNamespace *ns;
    uint32_t nsid;
    bool broadcast;
    int64_t offset;
} NvmeFormatAIOCB;

static void nvme_format_bh(void *opaque);

static void nvme_format_cancel(BlockAIOCB *aiocb)
{
    NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
    }
}

static const AIOCBInfo nvme_format_aiocb_info = {
    .aiocb_size = sizeof(NvmeFormatAIOCB),
    .cancel_async = nvme_format_cancel,
    .get_aio_context = nvme_get_aio_context,
};

static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint8_t lbaf = dw10 & 0xf;
    uint8_t pi = (dw10 >> 5) & 0x7;
    uint8_t mset = (dw10 >> 4) & 0x1;
    uint8_t pil = (dw10 >> 8) & 0x1;

    trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);

    ns->id_ns.dps = (pil << 3) | pi;
    ns->id_ns.flbas = lbaf | (mset << 4);

    nvme_ns_init_format(ns);
}

static void nvme_format_ns_cb(void *opaque, int ret)
{
    NvmeFormatAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = iocb->ns;
    int bytes;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

    assert(ns);

    if (iocb->offset < ns->size) {
        bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);

        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
                                            bytes, BDRV_REQ_MAY_UNMAP,
                                            nvme_format_ns_cb, iocb);

        iocb->offset += bytes;
        return;
    }

    nvme_format_set(ns, &req->cmd);
    ns->status = 0x0;
    iocb->ns = NULL;
    iocb->offset = 0;

done:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}

static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
{
    if (ns->params.zoned) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (lbaf > ns->id_ns.nlbaf) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static void nvme_format_bh(void *opaque)
{
    NvmeFormatAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint8_t lbaf = dw10 & 0xf;
    uint8_t pi = (dw10 >> 5) & 0x7;
    uint16_t status;
    int i;

    if (iocb->ret < 0) {
        goto done;
    }

    if (iocb->broadcast) {
        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
            iocb->ns = nvme_ns(n, i);
            if (iocb->ns) {
                iocb->nsid = i;
                break;
            }
        }
    }

    if (!iocb->ns) {
        goto done;
    }

    status = nvme_format_check(iocb->ns, lbaf, pi);
    if (status) {
        req->status = status;
        goto done;
    }

    iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
    nvme_format_ns_cb(iocb, 0);
    return;

done:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_aio_unref(iocb);
}

static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeFormatAIOCB *iocb;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint16_t status;

    iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);

    iocb->req = req;
    iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
    iocb->ret = 0;
    iocb->ns = NULL;
    iocb->nsid = 0;
    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
    iocb->offset = 0;

    if (!iocb->broadcast) {
        if (!nvme_nsid_valid(n, nsid)) {
            status = NVME_INVALID_NSID | NVME_DNR;
            goto out;
        }

        iocb->ns = nvme_ns(n, nsid);
        if (!iocb->ns) {
            status = NVME_INVALID_FIELD | NVME_DNR;
            goto out;
        }
    }

    req->aiocb = &iocb->common;
    qemu_bh_schedule(iocb->bh);

    return NVME_NO_COMPLETE;

out:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);
    return status;
}

static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
                             nvme_adm_opc_str(req->cmd.opcode));

    if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
        trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    /* SGLs shall not be used for Admin commands in NVMe over PCIe */
    if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
        return NVME_INVALID_FIELD;
    }

    switch (req->cmd.opcode) {
    case NVME_ADM_CMD_DELETE_SQ:
        return nvme_del_sq(n, req);
    case NVME_ADM_CMD_CREATE_SQ:
        return nvme_create_sq(n, req);
    case NVME_ADM_CMD_GET_LOG_PAGE:
        return nvme_get_log(n, req);
    case NVME_ADM_CMD_DELETE_CQ:
        return nvme_del_cq(n, req);
    case NVME_ADM_CMD_CREATE_CQ:
        return nvme_create_cq(n, req);
    case NVME_ADM_CMD_IDENTIFY:
        return nvme_identify(n, req);
    case NVME_ADM_CMD_ABORT:
        return nvme_abort(n, req);
    case NVME_ADM_CMD_SET_FEATURES:
        return nvme_set_feature(n, req);
    case NVME_ADM_CMD_GET_FEATURES:
        return nvme_get_feature(n, req);
    case NVME_ADM_CMD_ASYNC_EV_REQ:
        return nvme_aer(n, req);
    case NVME_ADM_CMD_NS_ATTACHMENT:
        return nvme_ns_attachment(n, req);
    case NVME_ADM_CMD_FORMAT_NVM:
        return nvme_format(n, req);
    default:
        assert(false);
    }

    return NVME_INVALID_OPCODE | NVME_DNR;
}

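/*
 * Submission queue processing: fetch entries at the current head from guest
 * memory, pair each with a free NvmeRequest and dispatch to the admin or
 * I/O handler. A failed SQE read is treated as fatal and sets CSTS.CFS.
 */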
static void nvme_process_sq(void *opaque)
{
    NvmeSQueue *sq = opaque;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    uint16_t status;
    hwaddr addr;
    NvmeCmd cmd;
    NvmeRequest *req;

    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
        addr = sq->dma_addr + sq->head * n->sqe_size;
        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
            trace_pci_nvme_err_addr_read(addr);
            trace_pci_nvme_err_cfs();
            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
            break;
        }
        nvme_inc_sq_head(sq);

        req = QTAILQ_FIRST(&sq->req_list);
        QTAILQ_REMOVE(&sq->req_list, req, entry);
        QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
        nvme_req_clear(req);
        req->cqe.cid = cmd.cid;
        memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));

        status = sq->sqid ? nvme_io_cmd(n, req) :
            nvme_admin_cmd(n, req);
        if (status != NVME_NO_COMPLETE) {
            req->status = status;
            nvme_enqueue_req_completion(cq, req);
        }
    }
}

static void nvme_ctrl_reset(NvmeCtrl *n)
{
    NvmeNamespace *ns;
    int i;

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            continue;
        }

        nvme_ns_drain(ns);
    }

    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
        if (n->sq[i] != NULL) {
            nvme_free_sq(n->sq[i], n);
        }
    }
    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
        if (n->cq[i] != NULL) {
            nvme_free_cq(n->cq[i], n);
        }
    }

    while (!QTAILQ_EMPTY(&n->aer_queue)) {
        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
        QTAILQ_REMOVE(&n->aer_queue, event, entry);
        g_free(event);
    }

    n->aer_queued = 0;
    n->outstanding_aers = 0;
    n->qs_created = false;
}

static void nvme_ctrl_shutdown(NvmeCtrl *n)
{
    NvmeNamespace *ns;
    int i;

    if (n->pmr.dev) {
        memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
    }

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            continue;
        }

        nvme_ns_shutdown(ns);
    }
}

static void nvme_select_iocs(NvmeCtrl *n)
{
    NvmeNamespace *ns;
    int i;

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            continue;
        }

        nvme_select_iocs_ns(n, ns);
    }
}

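/*
 * Controller enable (CC.EN 0 -> 1): validate the admin queue attributes and
 * the CC fields against CAP and the identify limits before initializing the
 * admin queue pair; any violation fails the start and the controller
 * remains disabled.
 */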
static int nvme_start_ctrl(NvmeCtrl *n)
{
    uint64_t cap = ldq_le_p(&n->bar.cap);
    uint32_t cc = ldl_le_p(&n->bar.cc);
    uint32_t aqa = ldl_le_p(&n->bar.aqa);
    uint64_t asq = ldq_le_p(&n->bar.asq);
    uint64_t acq = ldq_le_p(&n->bar.acq);
    uint32_t page_bits = NVME_CC_MPS(cc) + 12;
    uint32_t page_size = 1 << page_bits;

    if (unlikely(n->cq[0])) {
        trace_pci_nvme_err_startfail_cq();
        return -1;
    }
    if (unlikely(n->sq[0])) {
        trace_pci_nvme_err_startfail_sq();
        return -1;
    }
    if (unlikely(asq & (page_size - 1))) {
        trace_pci_nvme_err_startfail_asq_misaligned(asq);
        return -1;
    }
    if (unlikely(acq & (page_size - 1))) {
        trace_pci_nvme_err_startfail_acq_misaligned(acq);
        return -1;
    }
    if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
        trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
        return -1;
    }
    if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
        trace_pci_nvme_err_startfail_page_too_small(
            NVME_CC_MPS(cc),
            NVME_CAP_MPSMIN(cap));
        return -1;
    }
    if (unlikely(NVME_CC_MPS(cc) >
                 NVME_CAP_MPSMAX(cap))) {
        trace_pci_nvme_err_startfail_page_too_large(
            NVME_CC_MPS(cc),
            NVME_CAP_MPSMAX(cap));
        return -1;
    }
    if (unlikely(NVME_CC_IOCQES(cc) <
                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
        trace_pci_nvme_err_startfail_cqent_too_small(
            NVME_CC_IOCQES(cc),
            NVME_CTRL_CQES_MIN(cap));
        return -1;
    }
    if (unlikely(NVME_CC_IOCQES(cc) >
                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
        trace_pci_nvme_err_startfail_cqent_too_large(
            NVME_CC_IOCQES(cc),
            NVME_CTRL_CQES_MAX(cap));
        return -1;
    }
    if (unlikely(NVME_CC_IOSQES(cc) <
                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
        trace_pci_nvme_err_startfail_sqent_too_small(
            NVME_CC_IOSQES(cc),
            NVME_CTRL_SQES_MIN(cap));
        return -1;
    }
    if (unlikely(NVME_CC_IOSQES(cc) >
                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
        trace_pci_nvme_err_startfail_sqent_too_large(
            NVME_CC_IOSQES(cc),
            NVME_CTRL_SQES_MAX(cap));
        return -1;
    }
    if (unlikely(!NVME_AQA_ASQS(aqa))) {
        trace_pci_nvme_err_startfail_asqent_sz_zero();
        return -1;
    }
    if (unlikely(!NVME_AQA_ACQS(aqa))) {
        trace_pci_nvme_err_startfail_acqent_sz_zero();
        return -1;
    }

    n->page_bits = page_bits;
    n->page_size = page_size;
    n->max_prp_ents = n->page_size / sizeof(uint64_t);
    n->cqe_size = 1 << NVME_CC_IOCQES(cc);
    n->sqe_size = 1 << NVME_CC_IOSQES(cc);
    nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
    nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);

    nvme_set_timestamp(n, 0ULL);

    QTAILQ_INIT(&n->aer_queue);

    nvme_select_iocs(n);

    return 0;
}

static void nvme_cmb_enable_regs(NvmeCtrl *n)
{
    uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
    uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);

    NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
    NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
    NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
    stl_le_p(&n->bar.cmbloc, cmbloc);

    NVME_CMBSZ_SET_SQS(cmbsz, 1);
    NVME_CMBSZ_SET_CQS(cmbsz, 0);
    NVME_CMBSZ_SET_LISTS(cmbsz, 1);
    NVME_CMBSZ_SET_RDS(cmbsz, 1);
    NVME_CMBSZ_SET_WDS(cmbsz, 1);
    NVME_CMBSZ_SET_SZU(cmbsz, 2);
    NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
    stl_le_p(&n->bar.cmbsz, cmbsz);
}

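/*
 * Handle MMIO writes to the controller registers in BAR0. Misaligned or
 * too-small accesses are logged as guest errors but, for now, are handled
 * anyway rather than ignored.
 */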
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
                           unsigned size)
{
    uint64_t cap = ldq_le_p(&n->bar.cap);
    uint32_t cc = ldl_le_p(&n->bar.cc);
    uint32_t intms = ldl_le_p(&n->bar.intms);
    uint32_t csts = ldl_le_p(&n->bar.csts);
    uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);

    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
                       "MMIO write not 32-bit aligned,"
                       " offset=0x%"PRIx64"", offset);
        /* should be ignored, fall through for now */
    }

    if (unlikely(size < sizeof(uint32_t))) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
                       "MMIO write smaller than 32-bits,"
                       " offset=0x%"PRIx64", size=%u",
                       offset, size);
        /* should be ignored, fall through for now */
    }

    switch (offset) {
    case NVME_REG_INTMS:
        if (unlikely(msix_enabled(&(n->parent_obj)))) {
            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
                           "undefined access to interrupt mask set"
                           " when MSI-X is enabled");
            /* should be ignored, fall through for now */
        }
        intms |= data;
        stl_le_p(&n->bar.intms, intms);
        n->bar.intmc = n->bar.intms;
        trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
        nvme_irq_check(n);
        break;
    case NVME_REG_INTMC:
        if (unlikely(msix_enabled(&(n->parent_obj)))) {
            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
                           "undefined access to interrupt mask clr"
                           " when MSI-X is enabled");
            /* should be ignored, fall through for now */
        }
        intms &= ~data;
        stl_le_p(&n->bar.intms, intms);
        n->bar.intmc = n->bar.intms;
        trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
        nvme_irq_check(n);
        break;
    case NVME_REG_CC:
        trace_pci_nvme_mmio_cfg(data & 0xffffffff);

        /* Windows first sends data, then sends enable bit */
        if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
            !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
        {
            cc = data;
        }

        if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
            cc = data;

            /* flush CC since nvme_start_ctrl() needs the value */
            stl_le_p(&n->bar.cc, cc);
            if (unlikely(nvme_start_ctrl(n))) {
                trace_pci_nvme_err_startfail();
                csts = NVME_CSTS_FAILED;
            } else {
                trace_pci_nvme_mmio_start_success();
                csts = NVME_CSTS_READY;
            }
        } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
            trace_pci_nvme_mmio_stopped();
            nvme_ctrl_reset(n);
            cc = 0;
            csts &= ~NVME_CSTS_READY;
        }

        if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
            trace_pci_nvme_mmio_shutdown_set();
            nvme_ctrl_shutdown(n);
            cc = data;
            csts |= NVME_CSTS_SHST_COMPLETE;
        } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
            trace_pci_nvme_mmio_shutdown_cleared();
            csts &= ~NVME_CSTS_SHST_COMPLETE;
            cc = data;
        }

        stl_le_p(&n->bar.cc, cc);
        stl_le_p(&n->bar.csts, csts);

        break;
    case NVME_REG_CSTS:
        if (data & (1 << 4)) {
            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
                           "attempted to W1C CSTS.NSSRO"
                           " but CAP.NSSRS is zero (not supported)");
        } else if (data != 0) {
            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
                           "attempted to set a read only bit"
                           " of controller status");
        }
        break;
    case NVME_REG_NSSR:
        if (data == 0x4e564d65) {
            trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
        } else {
            /* The spec says that writes of other values have no effect */
            return;
        }
        break;
    case NVME_REG_AQA:
        stl_le_p(&n->bar.aqa, data);
        trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
        break;
    case NVME_REG_ASQ:
        stn_le_p(&n->bar.asq, size, data);
        trace_pci_nvme_mmio_asqaddr(data);
        break;
    case NVME_REG_ASQ + 4:
        stl_le_p((uint8_t *)&n->bar.asq + 4, data);
        trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
        break;
    case NVME_REG_ACQ:
        trace_pci_nvme_mmio_acqaddr(data);
        stn_le_p(&n->bar.acq, size, data);
        break;
    case NVME_REG_ACQ + 4:
        stl_le_p((uint8_t *)&n->bar.acq + 4, data);
        trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
        break;
    case NVME_REG_CMBLOC:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
                       "invalid write to reserved CMBLOC"
                       " when CMBSZ is zero, ignored");
        return;
    case NVME_REG_CMBSZ:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
                       "invalid write to read only CMBSZ, ignored");
        return;
    case NVME_REG_CMBMSC:
        if (!NVME_CAP_CMBS(cap)) {
            return;
        }

        stn_le_p(&n->bar.cmbmsc, size, data);
        n->cmb.cmse = false;

        if (NVME_CMBMSC_CRE(data)) {
            nvme_cmb_enable_regs(n);

            if (NVME_CMBMSC_CMSE(data)) {
                uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
                hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
                if (cba + int128_get64(n->cmb.mem.size) < cba) {
                    uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
                    NVME_CMBSTS_SET_CBAI(cmbsts, 1);
                    stl_le_p(&n->bar.cmbsts, cmbsts);
                    return;
                }

                n->cmb.cba = cba;
                n->cmb.cmse = true;
            }
        } else {
            n->bar.cmbsz = 0;
            n->bar.cmbloc = 0;
        }

        return;
    case NVME_REG_CMBMSC + 4:
        stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
        return;

    case NVME_REG_PMRCAP:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
                       "invalid write to PMRCAP register, ignored");
        return;
    case NVME_REG_PMRCTL:
        if (!NVME_CAP_PMRS(cap)) {
            return;
        }

        stl_le_p(&n->bar.pmrctl, data);
        if (NVME_PMRCTL_EN(data)) {
            memory_region_set_enabled(&n->pmr.dev->mr, true);
            pmrsts = 0;
        } else {
            memory_region_set_enabled(&n->pmr.dev->mr, false);
            NVME_PMRSTS_SET_NRDY(pmrsts, 1);
            n->pmr.cmse = false;
        }
        stl_le_p(&n->bar.pmrsts, pmrsts);
        return;
    case NVME_REG_PMRSTS:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
                       "invalid write to PMRSTS register, ignored");
        return;
    case NVME_REG_PMREBS:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
                       "invalid write to PMREBS register, ignored");
        return;
    case NVME_REG_PMRSWTP:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
                       "invalid write to PMRSWTP register, ignored");
        return;
    case NVME_REG_PMRMSCL:
        if (!NVME_CAP_PMRS(cap)) {
            return;
        }

        stl_le_p(&n->bar.pmrmscl, data);
        n->pmr.cmse = false;

        if (NVME_PMRMSCL_CMSE(data)) {
            uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
            hwaddr cba = pmrmscu << 32 |
                (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
            if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
                NVME_PMRSTS_SET_CBAI(pmrsts, 1);
                stl_le_p(&n->bar.pmrsts, pmrsts);
                return;
            }

            n->pmr.cmse = true;
            n->pmr.cba = cba;
        }

        return;
    case NVME_REG_PMRMSCU:
        if (!NVME_CAP_PMRS(cap)) {
            return;
        }

        stl_le_p(&n->bar.pmrmscu, data);
        return;
    default:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
                       "invalid MMIO write,"
                       " offset=0x%"PRIx64", data=%"PRIx64"",
                       offset, data);
        break;
    }
}

static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    uint8_t *ptr = (uint8_t *)&n->bar;

    trace_pci_nvme_mmio_read(addr, size);

    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
                       "MMIO read not 32-bit aligned,"
                       " offset=0x%"PRIx64"", addr);
        /* should RAZ, fall through for now */
    } else if (unlikely(size < sizeof(uint32_t))) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
                       "MMIO read smaller than 32-bits,"
                       " offset=0x%"PRIx64"", addr);
        /* should RAZ, fall through for now */
    }

    if (addr > sizeof(n->bar) - size) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
                       "MMIO read beyond last register,"
                       " offset=0x%"PRIx64", returning 0", addr);

        return 0;
    }

    /*
     * When PMRWBM bit 1 is set, a read of PMRSTS must ensure that prior
     * writes to the persistent memory region have reached persistent media.
     */
    if (addr == NVME_REG_PMRSTS &&
        (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
        memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
    }

    return ldn_le_p(ptr + addr, size);
}

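/*
 * Doorbell writes: past the register file, doorbells alternate between
 * submission queue tail and completion queue head registers, one pair per
 * queue id. Writes to nonexistent queues or with out-of-range values are
 * ignored, but may post an asynchronous error event if an AER is
 * outstanding.
 */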
static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
{
    uint32_t qid;

    if (unlikely(addr & ((1 << 2) - 1))) {
        NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
                       "doorbell write not 32-bit aligned,"
                       " offset=0x%"PRIx64", ignoring", addr);
        return;
    }

    if (((addr - 0x1000) >> 2) & 1) {
        /* Completion queue doorbell write */

        uint16_t new_head = val & 0xffff;
        int start_sqs;
        NvmeCQueue *cq;

        qid = (addr - (0x1000 + (1 << 2))) >> 3;
        if (unlikely(nvme_check_cqid(n, qid))) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
                           "completion queue doorbell write"
                           " for nonexistent queue,"
                           " sqid=%"PRIu32", ignoring", qid);

            /*
             * NVM Express v1.3d, Section 4.1: if host software writes an
             * invalid value to the Submission Queue Tail Doorbell or
             * Completion Queue Head Doorbell register and an Asynchronous
             * Event Request command is outstanding, an asynchronous event is
             * posted to the Admin Completion Queue with a status code of
             * Invalid Doorbell Write Value.
             *
             * The spec also defines an "Invalid Doorbell Register" status
             * code, but never says when to use it; it seems reasonable to
             * use it in a similar scenario, i.e. for an invalid queue id.
             */
            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
                                   NVME_LOG_ERROR_INFO);
            }

            return;
        }

        cq = n->cq[qid];
        if (unlikely(new_head >= cq->size)) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
                           "completion queue doorbell write value"
                           " beyond queue size, sqid=%"PRIu32","
                           " new_head=%"PRIu16", ignoring",
                           qid, new_head);

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
                                   NVME_LOG_ERROR_INFO);
            }

            return;
        }

        trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);

        start_sqs = nvme_cq_full(cq) ? 1 : 0;
        cq->head = new_head;
        if (start_sqs) {
            NvmeSQueue *sq;
            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
                timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
            }
            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
        }

        if (cq->tail == cq->head) {
            if (cq->irq_enabled) {
                n->cq_pending--;
            }

            nvme_irq_deassert(n, cq);
        }
    } else {
        /* Submission queue doorbell write */

        uint16_t new_tail = val & 0xffff;
        NvmeSQueue *sq;

        qid = (addr - 0x1000) >> 3;
        if (unlikely(nvme_check_sqid(n, qid))) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
                           "submission queue doorbell write"
                           " for nonexistent queue,"
                           " sqid=%"PRIu32", ignoring", qid);

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
                                   NVME_LOG_ERROR_INFO);
            }

            return;
        }

        sq = n->sq[qid];
        if (unlikely(new_tail >= sq->size)) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
                           "submission queue doorbell write value"
                           " beyond queue size, sqid=%"PRIu32","
                           " new_tail=%"PRIu16", ignoring",
                           qid, new_tail);

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
                                   NVME_LOG_ERROR_INFO);
            }

            return;
        }

        trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);

        sq->tail = new_tail;
        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
    }
}

static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
                            unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;

    trace_pci_nvme_mmio_write(addr, data, size);

    if (addr < sizeof(n->bar)) {
        nvme_write_bar(n, addr, data, size);
    } else {
        nvme_process_db(n, addr, data);
    }
}

static const MemoryRegionOps nvme_mmio_ops = {
    .read = nvme_mmio_read,
    .write = nvme_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};

static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
                           unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    stn_le_p(&n->cmb.buf[addr], size, data);
}

static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    return ldn_le_p(&n->cmb.buf[addr], size);
}

static const MemoryRegionOps nvme_cmb_ops = {
    .read = nvme_cmb_read,
    .write = nvme_cmb_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};

static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
{
    NvmeParams *params = &n->params;

    if (params->num_queues) {
        warn_report("num_queues is deprecated; please use max_ioqpairs "
                    "instead");

        params->max_ioqpairs = params->num_queues - 1;
    }

    if (n->namespace.blkconf.blk && n->subsys) {
        error_setg(errp, "subsystem support is unavailable with legacy "
                   "namespace ('drive' property)");
        return;
    }

    if (params->max_ioqpairs < 1 ||
        params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
        error_setg(errp, "max_ioqpairs must be between 1 and %d",
                   NVME_MAX_IOQPAIRS);
        return;
    }

    if (params->msix_qsize < 1 ||
        params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
        error_setg(errp, "msix_qsize must be between 1 and %d",
                   PCI_MSIX_FLAGS_QSIZE + 1);
        return;
    }

    if (!params->serial) {
        error_setg(errp, "serial property not set");
        return;
    }

    if (n->pmr.dev) {
        if (host_memory_backend_is_mapped(n->pmr.dev)) {
            error_setg(errp, "can't use already busy memdev: %s",
                       object_get_canonical_path_component(OBJECT(n->pmr.dev)));
            return;
        }

        if (!is_power_of_2(n->pmr.dev->size)) {
            error_setg(errp, "pmr backend size needs to be power of 2 in size");
            return;
        }

        host_memory_backend_set_mapped(n->pmr.dev, true);
    }

    if (n->params.zasl > n->params.mdts) {
        error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
                   "than or equal to mdts (Maximum Data Transfer Size)");
        return;
    }

    if (!n->params.vsl) {
        error_setg(errp, "vsl must be non-zero");
        return;
    }
}

static void nvme_init_state(NvmeCtrl *n)
{
    /* add one to max_ioqpairs to account for the admin queue pair */
    n->reg_size = pow2ceil(sizeof(NvmeBar) +
                           2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
    n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
    n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
    n->temperature = NVME_TEMPERATURE;
    n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
    n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
}

static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
{
    uint64_t cmb_size = n->params.cmb_size_mb * MiB;
    uint64_t cap = ldq_le_p(&n->bar.cap);

    n->cmb.buf = g_malloc0(cmb_size);
    memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
                          "nvme-cmb", cmb_size);
    pci_register_bar(pci_dev, NVME_CMB_BIR,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);

    NVME_CAP_SET_CMBS(cap, 1);
    stq_le_p(&n->bar.cap, cap);

    if (n->params.legacy_cmb) {
        nvme_cmb_enable_regs(n);
        n->cmb.cmse = true;
    }
}

static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
{
    uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);

    NVME_PMRCAP_SET_RDS(pmrcap, 1);
    NVME_PMRCAP_SET_WDS(pmrcap, 1);
    NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
    /* PMRWBM bit 1: reads of PMRSTS ensure prior PMR writes are persistent */
    NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
    NVME_PMRCAP_SET_CMSS(pmrcap, 1);
    stl_le_p(&n->bar.pmrcap, pmrcap);

    pci_register_bar(pci_dev, NVME_PMR_BIR,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);

    memory_region_set_enabled(&n->pmr.dev->mr, false);
}

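/*
 * BAR0 layout: the register file is followed by the MSI-X table and PBA,
 * each aligned to 4 KiB, with the total BAR size rounded up to a power of
 * two.
 */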
static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
{
    uint8_t *pci_conf = pci_dev->config;
    uint64_t bar_size, msix_table_size, msix_pba_size;
    unsigned msix_table_offset, msix_pba_offset;
    int ret;

    Error *err = NULL;

    pci_conf[PCI_INTERRUPT_PIN] = 1;
    pci_config_set_prog_interface(pci_conf, 0x2);

    if (n->params.use_intel_id) {
        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
        pci_config_set_device_id(pci_conf, 0x5845);
    } else {
        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
        pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
    }

    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
    pcie_endpoint_cap_init(pci_dev, 0x80);

    bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
    msix_table_offset = bar_size;
    msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;

    bar_size += msix_table_size;
    bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
    msix_pba_offset = bar_size;
    msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;

    bar_size += msix_pba_size;
    bar_size = pow2ceil(bar_size);

    memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
                          n->reg_size);
    memory_region_add_subregion(&n->bar0, 0, &n->iomem);

    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
    ret = msix_init(pci_dev, n->params.msix_qsize,
                    &n->bar0, 0, msix_table_offset,
                    &n->bar0, 0, msix_pba_offset, 0, &err);
    if (ret < 0) {
        if (ret == -ENOTSUP) {
            warn_report_err(err);
        } else {
            error_propagate(errp, err);
            return ret;
        }
    }

    if (n->params.cmb_size_mb) {
        nvme_init_cmb(n, pci_dev);
    }

    if (n->pmr.dev) {
        nvme_init_pmr(n, pci_dev);
    }

    return 0;
}

static void nvme_init_subnqn(NvmeCtrl *n)
{
    NvmeSubsystem *subsys = n->subsys;
    NvmeIdCtrl *id = &n->id_ctrl;

    if (!subsys) {
        snprintf((char *)id->subnqn, sizeof(id->subnqn),
                 "nqn.2019-08.org.qemu:%s", n->params.serial);
    } else {
        pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
    }
}

static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
{
    NvmeIdCtrl *id = &n->id_ctrl;
    uint8_t *pci_conf = pci_dev->config;
    uint64_t cap = ldq_le_p(&n->bar.cap);

    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
    strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');

    id->cntlid = cpu_to_le16(n->cntlid);

    id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);

    id->rab = 6;

    if (n->params.use_intel_id) {
        id->ieee[0] = 0xb3;
        id->ieee[1] = 0x02;
        id->ieee[2] = 0x00;
    } else {
        id->ieee[0] = 0x00;
        id->ieee[1] = 0x54;
        id->ieee[2] = 0x52;
    }

    id->mdts = n->params.mdts;
    id->ver = cpu_to_le32(NVME_SPEC_VER);
    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
    id->cntrltype = 0x1;

    /*
     * Because the controller always completes the Abort command immediately,
     * there can never be more than one concurrently executing Abort command,
     * so this value is never used for anything. Note that there can easily
     * be many Abort commands in the queues, but they are not considered
     * "executing" until processed by nvme_abort.
     *
     * The value reported here is therefore arbitrary.
     */
    id->acl = 3;
    id->aerl = n->params.aerl;
    id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
    id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;

    /* recommended default value (~70 C) */
    id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
    id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);

    id->sqes = (0x6 << 4) | 0x6;
    id->cqes = (0x4 << 4) | 0x4;
    id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
                           NVME_ONCS_COMPARE | NVME_ONCS_COPY);

    /*
     * NOTE: If this device ever supports a command set that does NOT use
     * 0x0 as a Flush-equivalent operation, support for the broadcast NSID
     * in Flush commands should be removed.
     */
    id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;

    id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
                           NVME_CTRL_SGLS_BITBUCKET);

    nvme_init_subnqn(n);

    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);

    if (n->subsys) {
        id->cmic |= NVME_CMIC_MULTI_CTRL;
    }

    NVME_CAP_SET_MQES(cap, 0x7ff);
    NVME_CAP_SET_CQR(cap, 1);
    NVME_CAP_SET_TO(cap, 0xf);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
    NVME_CAP_SET_MPSMAX(cap, 4);
    NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
    NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
    stq_le_p(&n->bar.cap, cap);

    stl_le_p(&n->bar.vs, NVME_SPEC_VER);
    n->bar.intmc = n->bar.intms = 0;
}

static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
{
    int cntlid;

    if (!n->subsys) {
        return 0;
    }

    cntlid = nvme_subsys_register_ctrl(n, errp);
    if (cntlid < 0) {
        return -1;
    }

    n->cntlid = cntlid;

    return 0;
}

void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
{
    uint32_t nsid = ns->params.nsid;
    assert(nsid && nsid <= NVME_MAX_NAMESPACES);

    n->namespaces[nsid] = ns;
    ns->attached++;

    n->dmrsl = MIN_NON_ZERO(n->dmrsl,
                            BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
}

static void nvme_realize(PCIDevice *pci_dev, Error **errp)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeNamespace *ns;
    Error *local_err = NULL;

    nvme_check_constraints(n, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }

    qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
              &pci_dev->qdev, n->parent_obj.qdev.id);

    nvme_init_state(n);
    if (nvme_init_pci(n, pci_dev, errp)) {
        return;
    }

    if (nvme_init_subsys(n, errp)) {
        error_propagate(errp, local_err);
        return;
    }
    nvme_init_ctrl(n, pci_dev);

    /* setup a namespace if the controller drive property was given */
    if (n->namespace.blkconf.blk) {
        ns = &n->namespace;
        ns->params.nsid = 1;

        if (nvme_ns_setup(ns, errp)) {
            return;
        }

        nvme_attach_ns(n, ns);
    }
}

static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeNamespace *ns;
    int i;

    nvme_ctrl_reset(n);

    if (n->subsys) {
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (ns) {
                ns->attached--;
            }
        }

        nvme_subsys_unregister_ctrl(n->subsys, n);
    }

    g_free(n->cq);
    g_free(n->sq);
    g_free(n->aer_reqs);

    if (n->params.cmb_size_mb) {
        g_free(n->cmb.buf);
    }

    if (n->pmr.dev) {
        host_memory_backend_set_mapped(n->pmr.dev, false);
    }
    msix_uninit(pci_dev, &n->bar0, &n->bar0);
    memory_region_del_subregion(&n->bar0, &n->iomem);
}

static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
                     HostMemoryBackend *),
    DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
                     NvmeSubsystem *),
    DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
    DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
    DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
    DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
    DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                     params.auto_transition_zones, true),
    DEFINE_PROP_END_OF_LIST(),
};

static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value = n->smart_critical_warning;

    visit_type_uint8(v, name, &value, errp);
}

static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value, old_value, cap = 0, index, event;

    if (!visit_type_uint8(v, name, &value, errp)) {
        return;
    }

    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
    if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
        cap |= NVME_SMART_PMR_UNRELIABLE;
    }

    if ((value & cap) != value) {
        error_setg(errp, "unsupported smart critical warning bits: 0x%x",
                   value & ~cap);
        return;
    }

    old_value = n->smart_critical_warning;
    n->smart_critical_warning = value;

    /* only inject new bits of smart critical warning */
    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
        event = 1 << index;
        if (value & ~old_value & event)
            nvme_smart_event(n, event);
    }
}

static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};

static void nvme_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);

    pc->realize = nvme_realize;
    pc->exit = nvme_exit;
    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
    pc->revision = 2;

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    dc->desc = "Non-Volatile Memory Express";
    device_class_set_props(dc, nvme_props);
    dc->vmsd = &nvme_vmstate;
}

static void nvme_instance_init(Object *obj)
{
    NvmeCtrl *n = NVME(obj);

    device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
                                  "bootindex", "/namespace@1,0",
                                  DEVICE(obj));

    object_property_add(obj, "smart_critical_warning", "uint8",
                        nvme_get_smart_warning,
                        nvme_set_smart_warning, NULL, NULL);
}

static const TypeInfo nvme_info = {
    .name = TYPE_NVME,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .instance_init = nvme_instance_init,
    .class_init = nvme_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_PCIE_DEVICE },
        { }
    },
};

static const TypeInfo nvme_bus_info = {
    .name = TYPE_NVME_BUS,
    .parent = TYPE_BUS,
    .instance_size = sizeof(NvmeBus),
};

static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
    type_register_static(&nvme_bus_info);
}

type_init(nvme_register_types)