/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
#include "migration/vmstate.h"

#include "nvme.h"
#include "trace.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)

static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION] = true,
    [NVME_POWER_MANAGEMENT] = true,
    [NVME_TEMPERATURE_THRESHOLD] = true,
    [NVME_ERROR_RECOVERY] = true,
    [NVME_VOLATILE_WRITE_CACHE] = true,
    [NVME_NUMBER_OF_QUEUES] = true,
    [NVME_INTERRUPT_COALESCING] = true,
    [NVME_INTERRUPT_VECTOR_CONF] = true,
    [NVME_WRITE_ATOMICITY] = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
    [NVME_TIMESTAMP] = true,
    [NVME_COMMAND_SET_PROFILE] = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
    [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
};

static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
};

static void nvme_process_sq(void *opaque);

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);
}

static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
{
    if (QTAILQ_IN_USE(zone, entry)) {
        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_CLOSED:
            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_FULL:
            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
        default:
            ;
        }
    }

    nvme_set_zone_state(zone, state);

    switch (state) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_CLOSED:
        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_FULL:
        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
    case NVME_ZONE_STATE_READ_ONLY:
        break;
    default:
        zone->d.za = 0;
    }
}

/*
 * Check if we can open a zone without exceeding open/active limits.
 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
 */
static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
{
    if (ns->params.max_active_zones != 0 &&
        ns->nr_active_zones + act > ns->params.max_active_zones) {
        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    }
    if (ns->params.max_open_zones != 0 &&
        ns->nr_open_zones + opn > ns->params.max_open_zones) {
        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    if (!n->cmb.cmse) {
        return false;
    }

    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    hi = lo + int128_get64(n->cmb.mem.size);

    return addr >= lo && addr < hi;
}

static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    return &n->cmb.buf[addr - base];
}

static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi;

    if (!n->pmr.cmse) {
        return false;
    }

    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);

    return addr >= n->pmr.cba && addr < hi;
}

static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
{
    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}

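/*
 * nvme_addr_read/nvme_addr_write: if the address range falls entirely within
 * the CMB or PMR, access the backing memory directly; otherwise go through
 * PCI DMA. A nonzero return value indicates failure.
 */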
static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
        return 0;
    }

    return pci_dma_read(&n->parent_obj, addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(&n->parent_obj, addr, buf, size);
}

static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid &&
        (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}

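/* Advance the CQ tail, flipping the phase tag when the queue wraps. */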
static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

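/*
 * Recompute the level of the legacy (pin-based) interrupt: asserted when any
 * vector in irq_status is not masked by INTMS. With MSI-X enabled this is a
 * no-op.
 */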
static void nvme_irq_check(NvmeCtrl *n)
{
    uint32_t intms = ldl_le_p(&n->bar.intms);

    if (msix_enabled(&(n->parent_obj))) {
        return;
    }
    if (~intms & n->irq_status) {
        pci_irq_assert(&n->parent_obj);
    } else {
        pci_irq_deassert(&n->parent_obj);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            trace_pci_nvme_irq_msix(cq->vector);
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            trace_pci_nvme_irq_pin();
            assert(cq->vector < 32);
            n->irq_status |= 1 << cq->vector;
            nvme_irq_check(n);
        }
    } else {
        trace_pci_nvme_irq_masked();
    }
}

static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            return;
        } else {
            assert(cq->vector < 32);
            if (!n->cq_pending) {
                n->irq_status &= ~(1 << cq->vector);
            }
            nvme_irq_check(n);
        }
    }
}

static void nvme_req_clear(NvmeRequest *req)
{
    req->ns = NULL;
    req->opaque = NULL;
    req->aiocb = NULL;
    memset(&req->cqe, 0x0, sizeof(req->cqe));
    req->status = NVME_SUCCESS;
}

static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
    if (dma) {
        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
        sg->flags = NVME_SG_DMA;
    } else {
        qemu_iovec_init(&sg->iov, 0);
    }

    sg->flags |= NVME_SG_ALLOC;
}

static inline void nvme_sg_unmap(NvmeSg *sg)
{
    if (!(sg->flags & NVME_SG_ALLOC)) {
        return;
    }

    if (sg->flags & NVME_SG_DMA) {
        qemu_sglist_destroy(&sg->qsg);
    } else {
        qemu_iovec_destroy(&sg->iov);
    }

    memset(sg, 0x0, sizeof(*sg));
}

/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
{
    NvmeSg *dst = data;
    uint32_t trans_len, count = ns->lbasz;
    uint64_t offset = 0;
    bool dma = sg->flags & NVME_SG_DMA;
    size_t sge_len;
    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    int sg_idx = 0;

    assert(sg->flags & NVME_SG_ALLOC);

    while (sg_len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        trans_len = MIN(sg_len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dst) {
            if (dma) {
                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
                                trans_len);
            } else {
                qemu_iovec_add(&dst->iov,
                               sg->iov.iov[sg_idx].iov_base + offset,
                               trans_len);
            }
        }

        sg_len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            dst = (dst == data) ? mdata : data;
            count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
        }
    }
}

static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr_cmb(addr, len);

    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
{
    bool cmb = false, pmr = false;

    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr(addr, len);

    if (nvme_addr_is_cmb(n, addr)) {
        cmb = true;
    } else if (nvme_addr_is_pmr(n, addr)) {
        pmr = true;
    }

    if (cmb || pmr) {
        if (sg->flags & NVME_SG_DMA) {
            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
        }

        if (sg->iov.niov + 1 > IOV_MAX) {
            goto max_mappings_exceeded;
        }

        if (cmb) {
            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
        } else {
            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
        }
    }

    if (!(sg->flags & NVME_SG_DMA)) {
        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    }

    if (sg->qsg.nsg + 1 > IOV_MAX) {
        goto max_mappings_exceeded;
    }

    qemu_sglist_add(&sg->qsg, addr, len);

    return NVME_SUCCESS;

max_mappings_exceeded:
    NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
                   "number of mappings exceeds 1024 (IOV_MAX)");
    return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
}

static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
{
    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
}

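/*
 * Map a PRP1/PRP2 pair of length `len` into `sg`. PRP1 points at the first
 * (possibly offset) page; PRP2 is either a second page or, when the transfer
 * spans more than two pages, a pointer to a PRP list.
 */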
static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
                             uint64_t prp2, uint32_t len)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;
    uint16_t status;
    int ret;

    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));

    status = nvme_map_addr(n, sg, prp1, trans_len);
    if (status) {
        goto unmap;
    }

    len -= trans_len;
    if (len) {
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset. Calculate the number of entries in the first page of
             * the list based on that offset.
             */
            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
            if (ret) {
                trace_pci_nvme_err_addr_read(prp2);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == nents - 1 && len > n->page_size) {
                    if (unlikely(prp_ent & (n->page_size - 1))) {
                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    nents = MIN(nents, n->max_prp_ents);
                    prp_trans = nents * sizeof(uint64_t);
                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
                                         prp_trans);
                    if (ret) {
                        trace_pci_nvme_err_addr_read(prp_ent);
                        status = NVME_DATA_TRAS_ERROR;
                        goto unmap;
                    }
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (unlikely(prp_ent & (n->page_size - 1))) {
                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                status = nvme_map_addr(n, sg, prp_ent, trans_len);
                if (status) {
                    goto unmap;
                }

                len -= trans_len;
                i++;
            }
        } else {
            if (unlikely(prp2 & (n->page_size - 1))) {
                trace_pci_nvme_err_invalid_prp2_align(prp2);
                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                goto unmap;
            }
            status = nvme_map_addr(n, sg, prp2, len);
            if (status) {
                goto unmap;
            }
        }
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

/*
 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
 * number of bytes mapped in len.
 */
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                  NvmeSglDescriptor *segment, uint64_t nsgld,
                                  size_t *len, NvmeCmd *cmd)
{
    dma_addr_t addr, trans_len;
    uint32_t dlen;
    uint16_t status;

    for (int i = 0; i < nsgld; i++) {
        uint8_t type = NVME_SGL_TYPE(segment[i].type);

        switch (type) {
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            if (cmd->opcode == NVME_CMD_WRITE) {
                continue;
            }
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
            break;
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
        default:
            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
        }

        dlen = le32_to_cpu(segment[i].len);

        if (!dlen) {
            continue;
        }

        if (*len == 0) {
            /*
             * All data has been mapped, but the SGL contains additional
             * segments and/or descriptors. The controller might accept
             * ignoring the rest of the SGL.
             */
            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
                break;
            }

            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        trans_len = MIN(*len, dlen);

        if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
            goto next;
        }

        addr = le64_to_cpu(segment[i].addr);

        if (UINT64_MAX - addr < dlen) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        status = nvme_map_addr(n, sg, addr, trans_len);
        if (status) {
            return status;
        }

next:
        *len -= trans_len;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
                             size_t len, NvmeCmd *cmd)
{
    /*
     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
     * dynamically allocating a potentially huge SGL. The spec allows the SGL
     * to be larger (as in number of bytes required to describe the SGL
     * descriptors and segment chain) than the command transfer size, so it is
     * not bounded by MDTS.
     */
    const int SEG_CHUNK_SIZE = 256;

    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
    uint64_t nsgld;
    uint32_t seg_len;
    uint16_t status;
    hwaddr addr;
    int ret;

    sgld = &sgl;
    addr = le64_to_cpu(sgl.addr);

    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));

    /*
     * If the entire transfer can be described with a single data block it can
     * be mapped directly.
     */
    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
        if (status) {
            goto unmap;
        }

        goto out;
    }

    for (;;) {
        switch (NVME_SGL_TYPE(sgld->type)) {
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            break;
        default:
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        seg_len = le32_to_cpu(sgld->len);

        /* check the length of the (Last) Segment descriptor */
        if ((!seg_len || seg_len & 0xf) &&
            (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        if (UINT64_MAX - addr < seg_len) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        nsgld = seg_len / sizeof(NvmeSglDescriptor);

        while (nsgld > SEG_CHUNK_SIZE) {
            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
                trace_pci_nvme_err_addr_read(addr);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }

            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
                                       &len, cmd);
            if (status) {
                goto unmap;
            }

            nsgld -= SEG_CHUNK_SIZE;
            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
        }

        ret = nvme_addr_read(n, addr, segment, nsgld *
                             sizeof(NvmeSglDescriptor));
        if (ret) {
            trace_pci_nvme_err_addr_read(addr);
            status = NVME_DATA_TRAS_ERROR;
            goto unmap;
        }

        last_sgld = &segment[nsgld - 1];

        /*
         * If the segment ends with a Data Block or Bit Bucket Descriptor
         * Type, then we are done.
         */
        switch (NVME_SGL_TYPE(last_sgld->type)) {
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
            if (status) {
                goto unmap;
            }

            goto out;

        default:
            break;
        }

        /*
         * If the last descriptor was not a Data Block or Bit Bucket, then the
         * current segment must not be a Last Segment.
         */
        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        sgld = last_sgld;
        addr = le64_to_cpu(sgld->addr);

        /*
         * Do not map the last descriptor; it will be a Segment or Last
         * Segment descriptor and is handled by the next iteration.
         */
        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
        if (status) {
            goto unmap;
        }
    }

out:
    /* if there is any residual left in len, the SGL was too short */
    if (len) {
        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        goto unmap;
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd)
{
    uint64_t prp1, prp2;

    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
    case NVME_PSDT_PRP:
        prp1 = le64_to_cpu(cmd->dptr.prp1);
        prp2 = le64_to_cpu(cmd->dptr.prp2);

        return nvme_map_prp(n, sg, prp1, prp2, len);
    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
    case NVME_PSDT_SGL_MPTR_SGL:
        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
    default:
        return NVME_INVALID_FIELD;
    }
}

static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
{
    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    hwaddr mptr = le64_to_cpu(cmd->mptr);
    uint16_t status;

    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
        NvmeSglDescriptor sgl;

        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
            return NVME_DATA_TRAS_ERROR;
        }

        status = nvme_map_sgl(n, sg, sgl, len, cmd);
        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
        }

        return status;
    }

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
    status = nvme_map_addr(n, sg, mptr, len);
    if (status) {
        nvme_sg_unmap(sg);
    }

    return status;
}

static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
    size_t len = nvme_l2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
        NvmeSg sg;

        len += nvme_m2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, &req->sg, NULL);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    size_t len = nvme_m2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_l2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, NULL, &req->sg);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
}

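/*
 * Copy `len` bytes between `ptr` and the scatter/gather list, picking up
 * `bytes` bytes at a time and then skipping `skip_bytes`; used to separate
 * interleaved data and metadata in extended-LBA namespaces.
 */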
static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
{
    hwaddr addr;
    uint32_t trans_len, count = bytes;
    bool dma = sg->flags & NVME_SG_DMA;
    int64_t sge_len;
    int sg_idx = 0;
    int ret;

    assert(sg->flags & NVME_SG_ALLOC);

    while (len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        if (sge_len - offset < 0) {
            offset -= sge_len;
            sg_idx++;
            continue;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
            continue;
        }

        trans_len = MIN(len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dma) {
            addr = sg->qsg.sg[sg_idx].base + offset;
        } else {
            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
        }

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            ret = nvme_addr_read(n, addr, ptr, trans_len);
        } else {
            ret = nvme_addr_write(n, addr, ptr, trans_len);
        }

        if (ret) {
            return NVME_DATA_TRAS_ERROR;
        }

        ptr += trans_len;
        len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            count = bytes;
            offset += skip_bytes;
        }
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
                        NvmeTxDirection dir)
{
    assert(sg->flags & NVME_SG_ALLOC);

    if (sg->flags & NVME_SG_DMA) {
        uint64_t residual;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            residual = dma_buf_write(ptr, len, &sg->qsg);
        } else {
            residual = dma_buf_read(ptr, len, &sg->qsg);
        }

        if (unlikely(residual)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    } else {
        size_t bytes;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
        } else {
            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
        }

        if (unlikely(bytes != len)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
}

static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
}

uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
    bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);

    if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
                                   ns->lbaf.ms, 0, dir);
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
                                   ns->lbasz, ns->lbasz, dir);
    }

    nvme_sg_unmap(&req->sg);

    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                  cb, req);
    } else {
        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
                                  BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                   cb, req);
    } else {
        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

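/*
 * Post completion queue entries for finished requests to guest memory and
 * raise the interrupt if any entries were posted; completed requests are
 * recycled onto the submission queue's free list.
 */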
static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;
    bool pending = cq->head != cq->tail;
    int ret;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
                            sizeof(req->cqe));
        if (ret) {
            trace_pci_nvme_err_addr_write(addr);
            trace_pci_nvme_err_cfs();
            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
            break;
        }
        QTAILQ_REMOVE(&cq->req_list, req, entry);
        nvme_inc_cq_tail(cq);
        nvme_sg_unmap(&req->sg);
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    if (cq->tail != cq->head) {
        if (cq->irq_enabled && !pending) {
            n->cq_pending++;
        }

        nvme_irq_assert(n, cq);
    }
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
                                          le32_to_cpu(req->cqe.result),
                                          le32_to_cpu(req->cqe.dw1),
                                          req->status);

    if (req->status) {
        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
                                      req->status, req->cmd.opcode);
    }

    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}

static void nvme_process_aers(void *opaque)
{
    NvmeCtrl *n = opaque;
    NvmeAsyncEvent *event, *next;

    trace_pci_nvme_process_aers(n->aer_queued);

    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
        NvmeRequest *req;
        NvmeAerResult *result;

        /* can't post cqe if there is nothing to complete */
        if (!n->outstanding_aers) {
            trace_pci_nvme_no_outstanding_aers();
            break;
        }

        /* ignore if masked (cqe posted, but event not cleared) */
        if (n->aer_mask & (1 << event->result.event_type)) {
            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
            continue;
        }

        QTAILQ_REMOVE(&n->aer_queue, event, entry);
        n->aer_queued--;

        n->aer_mask |= 1 << event->result.event_type;
        n->outstanding_aers--;

        req = n->aer_reqs[n->outstanding_aers];

        result = (NvmeAerResult *) &req->cqe.result;
        result->event_type = event->result.event_type;
        result->event_info = event->result.event_info;
        result->log_page = event->result.log_page;
        g_free(event);

        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
                                    result->log_page);

        nvme_enqueue_req_completion(&n->admin_cq, req);
    }
}

static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
                               uint8_t event_info, uint8_t log_page)
{
    NvmeAsyncEvent *event;

    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);

    if (n->aer_queued == n->params.aer_max_queued) {
        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
        return;
    }

    event = g_new(NvmeAsyncEvent, 1);
    event->result = (NvmeAerResult) {
        .event_type = event_type,
        .event_info = event_info,
        .log_page = log_page,
    };

    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
    n->aer_queued++;

    nvme_process_aers(n);
}

static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
{
    uint8_t aer_info;

    /* do not post the event if it is disabled in the async event config */
    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
        return;
    }

    switch (event) {
    case NVME_SMART_SPARE:
        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
        break;
    case NVME_SMART_TEMPERATURE:
        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
        break;
    case NVME_SMART_RELIABILITY:
    case NVME_SMART_MEDIA_READ_ONLY:
    case NVME_SMART_FAILED_VOLATILE_MEDIA:
    case NVME_SMART_PMR_UNRELIABLE:
        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
        break;
    default:
        return;
    }

    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
}

static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
{
    n->aer_mask &= ~(1 << event_type);
    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }
}

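/*
 * The Maximum Data Transfer Size (MDTS) limit is 2^mdts units of the minimum
 * memory page size; an mdts of zero means no limit.
 */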
static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
{
    uint8_t mdts = n->params.mdts;

    if (mdts && len > n->page_size << mdts) {
        trace_pci_nvme_err_mdts(len);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
                                         uint32_t nlb)
{
    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);

    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
        trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
        return NVME_LBA_RANGE | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb, int flags)
{
    BlockDriverState *bs = blk_bs(ns->blkconf.blk);

    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    int ret;

    /*
     * `pnum` holds the number of contiguous bytes at `offset` that share the
     * same allocation status as the byte at `offset`. Loop over the range,
     * checking each extent against `flags`, until the whole range has been
     * covered.
     */

    do {
        bytes -= pnum;

        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        trace_pci_nvme_block_status(offset, bytes, pnum, ret,
                                    !!(ret & BDRV_BLOCK_ZERO));

        if (!(ret & flags)) {
            return 1;
        }

        offset += pnum;
    } while (pnum != bytes);

    return 0;
}

static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb)
{
    int ret;
    Error *err = NULL;

    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
    if (ret) {
        if (ret < 0) {
            error_setg_errno(&err, -ret, "unable to get block status");
            error_report_err(err);

            return NVME_INTERNAL_DEV_ERROR;
        }

        return NVME_DULB;
    }

    return NVME_SUCCESS;
}

static void nvme_aio_err(NvmeRequest *req, int ret)
{
    uint16_t status = NVME_SUCCESS;
    Error *local_err = NULL;

    switch (req->cmd.opcode) {
    case NVME_CMD_READ:
        status = NVME_UNRECOVERED_READ;
        break;
    case NVME_CMD_FLUSH:
    case NVME_CMD_WRITE:
    case NVME_CMD_WRITE_ZEROES:
    case NVME_CMD_ZONE_APPEND:
        status = NVME_WRITE_FAULT;
        break;
    default:
        status = NVME_INTERNAL_DEV_ERROR;
        break;
    }

    trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);

    error_setg_errno(&local_err, -ret, "aio failed");
    error_report_err(local_err);

    /*
     * Set the command status code to the first encountered error but allow a
     * subsequent Internal Device Error to trump it.
     */
    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
        return;
    }

    req->status = status;
}

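/*
 * Convert an LBA to a zone index: a shift when the zone size is a power of
 * two, a division otherwise.
 */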
static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
{
    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
                                    slba / ns->zone_size;
}

static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
{
    uint32_t zone_idx = nvme_zone_idx(ns, slba);

    if (zone_idx >= ns->num_zones) {
        return NULL;
    }

    return &ns->zone_array[zone_idx];
}

static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
{
    uint64_t zslba = zone->d.zslba;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_FULL:
        trace_pci_nvme_err_zone_is_full(zslba);
        return NVME_ZONE_FULL;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zslba);
        return NVME_ZONE_OFFLINE;
    case NVME_ZONE_STATE_READ_ONLY:
        trace_pci_nvme_err_zone_is_read_only(zslba);
        return NVME_ZONE_READ_ONLY;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
                                      uint64_t slba, uint32_t nlb)
{
    uint64_t zcap = nvme_zone_wr_boundary(zone);
    uint16_t status;

    status = nvme_check_zone_state_for_write(zone);
    if (status) {
        return status;
    }

    if (unlikely(slba != zone->w_ptr)) {
        trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
        return NVME_ZONE_INVALID_WRITE;
    }

    if (unlikely((slba + nlb) > zcap)) {
        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
        return NVME_ZONE_BOUNDARY_ERROR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_FULL:
    case NVME_ZONE_STATE_CLOSED:
    case NVME_ZONE_STATE_READ_ONLY:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
        return NVME_ZONE_OFFLINE;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
                                     uint32_t nlb)
{
    NvmeZone *zone;
    uint64_t bndry, end;
    uint16_t status;

    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    bndry = nvme_zone_rd_boundary(ns, zone);
    end = slba + nlb;

    status = nvme_check_zone_state_for_read(zone);
    if (status) {
        ;
    } else if (unlikely(end > bndry)) {
        if (!ns->params.cross_zone_read) {
            status = NVME_ZONE_BOUNDARY_ERROR;
        } else {
            /*
             * Read across zone boundary; check that all subsequent zones
             * that are read from are in a readable state.
             */
            do {
                zone++;
                status = nvme_check_zone_state_for_read(zone);
                if (status) {
                    break;
                }
            } while (end > nvme_zone_rd_boundary(ns, zone));
        }
    }

    return status;
}

static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_FULL:
        return NVME_SUCCESS;

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_EMPTY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_FULL:
        zone->w_ptr = zone->d.zslba;
        zone->d.wp = zone->w_ptr;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        /* fallthrough */
    case NVME_ZONE_STATE_EMPTY:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
{
    NvmeZone *zone;

    if (ns->params.max_open_zones &&
        ns->nr_open_zones == ns->params.max_open_zones) {
        zone = QTAILQ_FIRST(&ns->imp_open_zones);
        if (zone) {
            /*
             * Automatically close this implicitly open zone.
             */
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            nvme_zrm_close(ns, zone);
        }
    }
}

enum {
    NVME_ZRM_AUTO = 1 << 0,
};

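/*
 * Zone Resource Management: transition a zone towards the (Implicitly or
 * Explicitly) Opened state, accounting for active/open resource limits and
 * auto-closing an implicitly open zone if required.
 */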
static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
                                    NvmeZone *zone, int flags)
{
    int act = 0;
    uint16_t status;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
        act = 1;

        /* fallthrough */

    case NVME_ZONE_STATE_CLOSED:
        if (n->params.auto_transition_zones) {
            nvme_zrm_auto_transition_zone(ns);
        }
        status = nvme_aor_check(ns, act, 1);
        if (status) {
            return status;
        }

        if (act) {
            nvme_aor_inc_active(ns);
        }

        nvme_aor_inc_open(ns);

        if (flags & NVME_ZRM_AUTO) {
            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
            return NVME_SUCCESS;
        }

        /* fallthrough */

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        if (flags & NVME_ZRM_AUTO) {
            return NVME_SUCCESS;
        }

        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);

        /* fallthrough */

    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
                                     NvmeZone *zone)
{
    return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
}

static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
                                     NvmeZone *zone)
{
    return nvme_zrm_open_flags(n, ns, zone, 0);
}

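/* Advance the zone write pointer; transition the zone to Full at capacity. */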
static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
                                 uint32_t nlb)
{
    zone->d.wp += nlb;

    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
        nvme_zrm_finish(ns, zone);
    }
}

static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeZone *zone;
    uint64_t slba;
    uint32_t nlb;

    slba = le64_to_cpu(rw->slba);
    nlb = le16_to_cpu(rw->nlb) + 1;
    zone = nvme_get_zone_by_slba(ns, slba);
    assert(zone);

    nvme_advance_zone_wp(ns, zone, nlb);
}

static inline bool nvme_is_write(NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;

    return rw->opcode == NVME_CMD_WRITE ||
           rw->opcode == NVME_CMD_ZONE_APPEND ||
           rw->opcode == NVME_CMD_WRITE_ZEROES;
}

static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
{
    return qemu_get_aio_context();
}

static void nvme_misc_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;

    trace_pci_nvme_misc_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

void nvme_rw_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    if (ns->params.zoned && nvme_is_write(req)) {
        nvme_finalize_zoned_write(ns, req);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    if (ns->lbaf.ms) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
        uint64_t offset = nvme_moff(ns, slba);

        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
            size_t mlen = nvme_m2b(ns, nlb);

            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
                                               BDRV_REQ_MAY_UNMAP,
                                               nvme_rw_complete_cb, req);
            return;
        }

        if (nvme_ns_ext(ns) || req->cmd.mptr) {
            uint16_t status;

            nvme_sg_unmap(&req->sg);
            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
            if (status) {
                ret = -EFAULT;
                goto out;
            }

            if (req->cmd.opcode == NVME_CMD_READ) {
                return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
            }

            return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
        }
    }

out:
    nvme_rw_complete_cb(req, ret);
}

static void nvme_verify_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    uint16_t status;

    trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(stats, acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
                                       ctx->mdata.iov.size, slba);
        if (status) {
            req->status = status;
            goto out;
        }

        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                     ctx->mdata.bounce, ctx->mdata.iov.size,
                                     prinfo, slba, apptag, appmask, &reftag);
    }

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_mdata_in_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t mlen = nvme_m2b(ns, nlb);
    uint64_t offset = nvme_moff(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_reset(&ctx->mdata.iov);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                nvme_verify_cb, ctx);
    return;

out:
    nvme_verify_cb(ctx, ret);
}

struct nvme_compare_ctx {
    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } data;

    struct {
        QEMUIOVector iov;
        uint8_t *bounce;
    } mdata;
};

static void nvme_compare_mdata_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    uint16_t status = NVME_SUCCESS;

    trace_pci_nvme_compare_mdata_cb(nvme_cid(req));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->mdata.iov.size);

    status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
                               NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        uint64_t slba = le64_to_cpu(rw->slba);
        uint8_t *bufp;
        uint8_t *mbufp = ctx->mdata.bounce;
        uint8_t *end = mbufp + ctx->mdata.iov.size;
        int16_t pil = 0;

        status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
                                slba, apptag, appmask, &reftag);
        if (status) {
            req->status = status;
            goto out;
        }

        /*
         * When formatted with protection information, do not compare the DIF
         * tuple.
         */
        if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
            pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
        }

        for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms,
             mbufp += ns->lbaf.ms) {
            if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
                req->status = NVME_CMP_FAILURE;
                goto out;
            }
        }

        goto out;
    }

    if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_compare_data_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeCtrl *n = nvme_ctrl(req);
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    struct nvme_compare_ctx *ctx = req->opaque;
    g_autofree uint8_t *buf = NULL;
    uint16_t status;

    trace_pci_nvme_compare_data_cb(nvme_cid(req));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    buf = g_malloc(ctx->data.iov.size);

    status = nvme_bounce_data(n, buf, ctx->data.iov.size,
                              NVME_TX_DIRECTION_TO_DEVICE, req);
    if (status) {
        req->status = status;
        goto out;
    }

    if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
        req->status = NVME_CMP_FAILURE;
        goto out;
    }

    if (ns->lbaf.ms) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
        size_t mlen = nvme_m2b(ns, nlb);
        uint64_t offset = nvme_moff(ns, slba);

        ctx->mdata.bounce = g_malloc(mlen);

        qemu_iovec_init(&ctx->mdata.iov, 1);
        qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

        req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                    nvme_compare_mdata_cb, req);
        return;
    }

    block_acct_done(stats, acct);

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);
    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

typedef struct NvmeDSMAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    NvmeDsmRange *range;
    unsigned int nr;
    unsigned int idx;
} NvmeDSMAIOCB;

static void nvme_dsm_cancel(BlockAIOCB *aiocb)
{
    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);

    /* break nvme_dsm_cb (see below) */
    iocb->idx = iocb->nr;
    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
        iocb->aiocb = NULL;
    } else {
        /*
         * If there is no outstanding aiocb, the command has already run to
         * completion and only the bottom half remains to be scheduled.
         */
        assert(iocb->idx == iocb->nr);
    }
}

static const AIOCBInfo nvme_dsm_aiocb_info = {
    .aiocb_size = sizeof(NvmeDSMAIOCB),
    .cancel_async = nvme_dsm_cancel,
};

static void nvme_dsm_bh(void *opaque)
{
    NvmeDSMAIOCB *iocb = opaque;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);
}

static void nvme_dsm_cb(void *opaque, int ret);

static void nvme_dsm_md_cb(void *opaque, int ret)
{
    NvmeDSMAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeDsmRange *range;
    uint64_t slba;
    uint32_t nlb;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

    if (!ns->lbaf.ms) {
        nvme_dsm_cb(iocb, 0);
        return;
    }

    range = &iocb->range[iocb->idx - 1];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb);

    /*
     * Check that all blocks were discarded (zeroed); otherwise we do not
     * zero the metadata.
     */

    ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
    if (ret) {
        if (ret < 0) {
            iocb->ret = ret;
            goto done;
        }

        nvme_dsm_cb(iocb, 0);
        return;
    }

    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
                                        nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
                                        nvme_dsm_cb, iocb);
    return;

done:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}

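/*
 * Deallocate the next range in the DSM command: each discard completes via
 * nvme_dsm_md_cb, which in turn re-enters this function until all ranges
 * have been processed.
 */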
2290static void nvme_dsm_cb(void *opaque, int ret)
2291{
2292 NvmeDSMAIOCB *iocb = opaque;
2293 NvmeRequest *req = iocb->req;
2294 NvmeCtrl *n = nvme_ctrl(req);
2295 NvmeNamespace *ns = req->ns;
2296 NvmeDsmRange *range;
2297 uint64_t slba;
2298 uint32_t nlb;
2299
2300 if (ret < 0) {
2301 iocb->ret = ret;
2302 goto done;
2303 }
2304
2305next:
2306 if (iocb->idx == iocb->nr) {
2307 goto done;
2308 }
2309
2310 range = &iocb->range[iocb->idx++];
2311 slba = le64_to_cpu(range->slba);
2312 nlb = le32_to_cpu(range->nlb);
2313
2314 trace_pci_nvme_dsm_deallocate(slba, nlb);
2315
2316 if (nlb > n->dmrsl) {
2317 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2318 goto next;
2319 }
2320
2321 if (nvme_check_bounds(ns, slba, nlb)) {
2322 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2323 ns->id_ns.nsze);
2324 goto next;
2325 }
2326
2327 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2328 nvme_l2b(ns, nlb),
2329 nvme_dsm_md_cb, iocb);
2330 return;
2331
2332done:
2333 iocb->aiocb = NULL;
2334 qemu_bh_schedule(iocb->bh);
2335}
2336
2337static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2338{
2339 NvmeNamespace *ns = req->ns;
2340 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2341 uint32_t attr = le32_to_cpu(dsm->attributes);
2342 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2343 uint16_t status = NVME_SUCCESS;
2344
2345 trace_pci_nvme_dsm(nr, attr);
2346
2347 if (attr & NVME_DSMGMT_AD) {
2348 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2349 nvme_misc_cb, req);
2350
2351 iocb->req = req;
2352 iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2353 iocb->ret = 0;
2354 iocb->range = g_new(NvmeDsmRange, nr);
2355 iocb->nr = nr;
2356 iocb->idx = 0;
2357
2358 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2359 req);
2360 if (status) {
2361 return status;
2362 }
2363
2364 req->aiocb = &iocb->common;
2365 nvme_dsm_cb(iocb, 0);
2366
2367 return NVME_NO_COMPLETE;
2368 }
2369
2370 return status;
2371}
2372
static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t len = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint32_t reftag = le32_to_cpu(rw->reftag);
    NvmeBounceContext *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_check_prinfo(ns, prinfo, slba, reftag);
        if (status) {
            return status;
        }

        if (prinfo & NVME_PRINFO_PRACT) {
            return NVME_INVALID_PROT_INFO | NVME_DNR;
        }
    }

    if (len > n->page_size << n->params.vsl) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    ctx = g_new0(NvmeBounceContext, 1);
    ctx->req = req;

    ctx->data.bounce = g_malloc(len);

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);

    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
                     BLOCK_ACCT_READ);

    req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
                                nvme_verify_mdata_in_cb, ctx);
    return NVME_NO_COMPLETE;
}

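/*
 * The Copy command is implemented as a small state machine over the AIOCB
 * below: nvme_copy_cb reads the next source range into the bounce buffer,
 * nvme_copy_in_cb reads its metadata, nvme_copy_in_completed_cb runs the
 * end-to-end protection checks and writes the data out to the destination,
 * nvme_copy_out_cb writes the metadata, and nvme_copy_out_completed_cb
 * advances the write pointer and the range index before looping back.
 */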
typedef struct NvmeCopyAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    NvmeCopySourceRange *ranges;
    int nr;
    int idx;

    uint8_t *bounce;
    QEMUIOVector iov;
    struct {
        BlockAcctCookie read;
        BlockAcctCookie write;
    } acct;

    uint32_t reftag;
    uint64_t slba;

    NvmeZone *zone;
} NvmeCopyAIOCB;

static void nvme_copy_cancel(BlockAIOCB *aiocb)
{
    NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
        iocb->aiocb = NULL;
    }
}

static const AIOCBInfo nvme_copy_aiocb_info = {
    .aiocb_size = sizeof(NvmeCopyAIOCB),
    .cancel_async = nvme_copy_cancel,
};

static void nvme_copy_bh(void *opaque)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);

    if (iocb->idx != iocb->nr) {
        req->cqe.result = cpu_to_le32(iocb->idx);
    }

    qemu_iovec_destroy(&iocb->iov);
    g_free(iocb->bounce);

    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;

    if (iocb->ret < 0) {
        block_acct_failed(stats, &iocb->acct.read);
        block_acct_failed(stats, &iocb->acct.write);
    } else {
        block_acct_done(stats, &iocb->acct.read);
        block_acct_done(stats, &iocb->acct.write);
    }

    iocb->common.cb(iocb->common.opaque, iocb->ret);
    qemu_aio_unref(iocb);
}

static void nvme_copy_cb(void *opaque, int ret);

static void nvme_copy_out_completed_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
    uint32_t nlb = le32_to_cpu(range->nlb) + 1;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (ns->params.zoned) {
        nvme_advance_zone_wp(ns, iocb->zone, nlb);
    }

    iocb->idx++;
    iocb->slba += nlb;
out:
    nvme_copy_cb(iocb, iocb->ret);
}

static void nvme_copy_out_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range;
    uint32_t nlb;
    size_t mlen;
    uint8_t *mbounce;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (!ns->lbaf.ms) {
        nvme_copy_out_completed_cb(iocb, 0);
        return;
    }

    range = &iocb->ranges[iocb->idx];
    nlb = le32_to_cpu(range->nlb) + 1;

    mlen = nvme_m2b(ns, nlb);
    mbounce = iocb->bounce + nvme_l2b(ns, nlb);

    qemu_iovec_reset(&iocb->iov);
    qemu_iovec_add(&iocb->iov, mbounce, mlen);

    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
                                  &iocb->iov, 0, nvme_copy_out_completed_cb,
                                  iocb);

    return;

out:
    nvme_copy_cb(iocb, ret);
}

static void nvme_copy_in_completed_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range;
    uint32_t nlb;
    size_t len;
    uint16_t status;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    range = &iocb->ranges[iocb->idx];
    nlb = le32_to_cpu(range->nlb) + 1;
    len = nvme_l2b(ns, nlb);

    trace_pci_nvme_copy_out(iocb->slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;

        uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
        uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);

        uint16_t apptag = le16_to_cpu(range->apptag);
        uint16_t appmask = le16_to_cpu(range->appmask);
        uint32_t reftag = le32_to_cpu(range->reftag);

        uint64_t slba = le64_to_cpu(range->slba);
        size_t mlen = nvme_m2b(ns, nlb);
        uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);

        status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
                                slba, apptag, appmask, &reftag);
        if (status) {
            goto invalid;
        }

        apptag = le16_to_cpu(copy->apptag);
        appmask = le16_to_cpu(copy->appmask);

        if (prinfow & NVME_PRINFO_PRACT) {
            status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
            if (status) {
                goto invalid;
            }

            nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
                                        apptag, &iocb->reftag);
        } else {
            status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
                                    prinfow, iocb->slba, apptag, appmask,
                                    &iocb->reftag);
            if (status) {
                goto invalid;
            }
        }
    }

    status = nvme_check_bounds(ns, iocb->slba, nlb);
    if (status) {
        goto invalid;
    }

    if (ns->params.zoned) {
        status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
        if (status) {
            goto invalid;
        }

        iocb->zone->w_ptr += nlb;
    }

    qemu_iovec_reset(&iocb->iov);
    qemu_iovec_add(&iocb->iov, iocb->bounce, len);

    iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
                                  &iocb->iov, 0, nvme_copy_out_cb, iocb);

    return;

invalid:
    req->status = status;
    iocb->aiocb = NULL;
    if (iocb->bh) {
        qemu_bh_schedule(iocb->bh);
    }

    return;

out:
    nvme_copy_cb(iocb, ret);
}

static void nvme_copy_in_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range;
    uint64_t slba;
    uint32_t nlb;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (!ns->lbaf.ms) {
        nvme_copy_in_completed_cb(iocb, 0);
        return;
    }

    range = &iocb->ranges[iocb->idx];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb) + 1;

    qemu_iovec_reset(&iocb->iov);
    qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
                   nvme_m2b(ns, nlb));

    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
                                 &iocb->iov, 0, nvme_copy_in_completed_cb,
                                 iocb);
    return;

out:
    nvme_copy_cb(iocb, iocb->ret);
}

static void nvme_copy_cb(void *opaque, int ret)
{
    NvmeCopyAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    NvmeCopySourceRange *range;
    uint64_t slba;
    uint32_t nlb;
    size_t len;
    uint16_t status;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    } else if (iocb->ret < 0) {
        goto done;
    }

    if (iocb->idx == iocb->nr) {
        goto done;
    }

    range = &iocb->ranges[iocb->idx];
    slba = le64_to_cpu(range->slba);
    nlb = le32_to_cpu(range->nlb) + 1;
    len = nvme_l2b(ns, nlb);

    trace_pci_nvme_copy_source_range(slba, nlb);

    if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
        goto invalid;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        goto invalid;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }

    if (ns->params.zoned) {
        status = nvme_check_zone_read(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }

    qemu_iovec_reset(&iocb->iov);
    qemu_iovec_add(&iocb->iov, iocb->bounce, len);

    iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
                                 &iocb->iov, 0, nvme_copy_in_cb, iocb);
    return;

invalid:
    req->status = status;
done:
    iocb->aiocb = NULL;
    if (iocb->bh) {
        qemu_bh_schedule(iocb->bh);
    }
}

static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
    NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
                                      nvme_misc_cb, req);
    uint16_t nr = copy->nr + 1;
    uint8_t format = copy->control[0] & 0xf;
    uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
    uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);

    uint16_t status;

    trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);

    iocb->ranges = NULL;
    iocb->zone = NULL;

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
        ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
        status = NVME_INVALID_FIELD | NVME_DNR;
        goto invalid;
    }

    if (!(n->id_ctrl.ocfs & (1 << format))) {
        trace_pci_nvme_err_copy_invalid_format(format);
        status = NVME_INVALID_FIELD | NVME_DNR;
        goto invalid;
    }

    if (nr > ns->id_ns.msrc + 1) {
        status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
        goto invalid;
    }

    iocb->ranges = g_new(NvmeCopySourceRange, nr);

    status = nvme_h2c(n, (uint8_t *)iocb->ranges,
                      sizeof(NvmeCopySourceRange) * nr, req);
    if (status) {
        goto invalid;
    }

    iocb->slba = le64_to_cpu(copy->sdlba);

    if (ns->params.zoned) {
        iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
        if (!iocb->zone) {
            status = NVME_LBA_RANGE | NVME_DNR;
            goto invalid;
        }

        status = nvme_zrm_auto(n, ns, iocb->zone);
        if (status) {
            goto invalid;
        }
    }

    iocb->req = req;
    iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
    iocb->ret = 0;
    iocb->nr = nr;
    iocb->idx = 0;
    iocb->reftag = le32_to_cpu(copy->reftag);
    iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
                              ns->lbasz + ns->lbaf.ms);

    qemu_iovec_init(&iocb->iov, 1);

    block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
                     BLOCK_ACCT_READ);
    block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
                     BLOCK_ACCT_WRITE);

    req->aiocb = &iocb->common;
    nvme_copy_cb(iocb, 0);

    return NVME_NO_COMPLETE;

invalid:
    g_free(iocb->ranges);
    qemu_aio_unref(iocb);
    return status;
}

static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    size_t data_len = nvme_l2b(ns, nlb);
    size_t len = data_len;
    int64_t offset = nvme_l2b(ns, slba);
    struct nvme_compare_ctx *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
        return NVME_INVALID_PROT_INFO | NVME_DNR;
    }

    if (nvme_ns_ext(ns)) {
        len += nvme_m2b(ns, nlb);
    }

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    ctx = g_new(struct nvme_compare_ctx, 1);
    ctx->data.bounce = g_malloc(data_len);

    req->opaque = ctx;

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);

    block_acct_start(blk_get_stats(blk), &req->acct, data_len,
                     BLOCK_ACCT_READ);
    req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
                                nvme_compare_data_cb, req);

    return NVME_NO_COMPLETE;
}

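/*
 * Flush runs asynchronously over the AIOCB below. For a broadcast NSID
 * (FFFFFFFFh), nvme_flush_bh walks all attached namespaces and flushes them
 * one at a time through nvme_flush_ns_cb; otherwise only the single
 * addressed namespace is flushed.
 */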
typedef struct NvmeFlushAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    NvmeNamespace *ns;
    uint32_t nsid;
    bool broadcast;
} NvmeFlushAIOCB;

static void nvme_flush_cancel(BlockAIOCB *acb)
{
    NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
    }
}

static const AIOCBInfo nvme_flush_aiocb_info = {
    .aiocb_size = sizeof(NvmeFlushAIOCB),
    .cancel_async = nvme_flush_cancel,
    .get_aio_context = nvme_get_aio_context,
};

static void nvme_flush_ns_cb(void *opaque, int ret)
{
    NvmeFlushAIOCB *iocb = opaque;
    NvmeNamespace *ns = iocb->ns;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (ns) {
        trace_pci_nvme_flush_ns(iocb->nsid);

        iocb->ns = NULL;
        iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
        return;
    }

out:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}

static void nvme_flush_bh(void *opaque)
{
    NvmeFlushAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    int i;

    if (iocb->ret < 0) {
        goto done;
    }

    if (iocb->broadcast) {
        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
            iocb->ns = nvme_ns(n, i);
            if (iocb->ns) {
                iocb->nsid = i;
                break;
            }
        }
    }

    if (!iocb->ns) {
        goto done;
    }

    nvme_flush_ns_cb(iocb, 0);
    return;

done:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_aio_unref(iocb);
}

static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeFlushAIOCB *iocb;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint16_t status;

    iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);

    iocb->req = req;
    iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
    iocb->ret = 0;
    iocb->ns = NULL;
    iocb->nsid = 0;
    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);

    if (!iocb->broadcast) {
        if (!nvme_nsid_valid(n, nsid)) {
            status = NVME_INVALID_NSID | NVME_DNR;
            goto out;
        }

        iocb->ns = nvme_ns(n, nsid);
        if (!iocb->ns) {
            status = NVME_INVALID_FIELD | NVME_DNR;
            goto out;
        }

        iocb->nsid = nsid;
    }

    req->aiocb = &iocb->common;
    qemu_bh_schedule(iocb->bh);

    return NVME_NO_COMPLETE;

out:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);

    return status;
}

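/*
 * Read path: after the MDTS, bounds, zone and deallocated-block checks,
 * reads on protected namespaces are routed through nvme_dif_rw; everything
 * else maps the data pointer and issues the block-layer read directly.
 */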
static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = prinfo & NVME_PRINFO_PRACT;

            if (pract && ns->lbaf.ms == 8) {
                mapped_size = data_size;
            }
        }
    }

    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);

    status = nvme_check_mdts(n, mapped_size);
    if (status) {
        goto invalid;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        goto invalid;
    }

    if (ns->params.zoned) {
        status = nvme_check_zone_read(ns, slba, nlb);
        if (status) {
            trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
            goto invalid;
        }
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    status = nvme_map_data(n, nlb, req);
    if (status) {
        goto invalid;
    }

    data_offset = nvme_l2b(ns, slba);

    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                     BLOCK_ACCT_READ);
    nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
    return NVME_NO_COMPLETE;

invalid:
    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
    return status | NVME_DNR;
}

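/*
 * Common write path for Write, Write Zeroes (wrz) and Zone Append (append).
 * For zone appends the SLBA is rewritten to the zone's write pointer and,
 * depending on the protection type, the reference tag may be remapped
 * relative to the zone start (the PIREMAP control bit).
 */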
static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
                              bool wrz)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint8_t prinfo = NVME_RW_PRINFO(ctrl);
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    NvmeZone *zone;
    NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = prinfo & NVME_PRINFO_PRACT;

            if (pract && ns->lbaf.ms == 8) {
                mapped_size -= nvme_m2b(ns, nlb);
            }
        }
    }

    trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
                         nvme_nsid(ns), nlb, mapped_size, slba);

    if (!wrz) {
        status = nvme_check_mdts(n, mapped_size);
        if (status) {
            goto invalid;
        }
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        goto invalid;
    }

    if (ns->params.zoned) {
        zone = nvme_get_zone_by_slba(ns, slba);
        assert(zone);

        if (append) {
            bool piremap = !!(ctrl & NVME_RW_PIREMAP);

            if (unlikely(slba != zone->d.zslba)) {
                trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
                status = NVME_INVALID_FIELD;
                goto invalid;
            }

            if (n->params.zasl &&
                data_size > (uint64_t)n->page_size << n->params.zasl) {
                trace_pci_nvme_err_zasl(data_size);
                return NVME_INVALID_FIELD | NVME_DNR;
            }

            slba = zone->w_ptr;
            rw->slba = cpu_to_le64(slba);
            res->slba = cpu_to_le64(slba);

            switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            case NVME_ID_NS_DPS_TYPE_1:
                if (!piremap) {
                    return NVME_INVALID_PROT_INFO | NVME_DNR;
                }

                /* fallthrough */

            case NVME_ID_NS_DPS_TYPE_2:
                if (piremap) {
                    uint32_t reftag = le32_to_cpu(rw->reftag);
                    rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
                }

                break;

            case NVME_ID_NS_DPS_TYPE_3:
                if (piremap) {
                    return NVME_INVALID_PROT_INFO | NVME_DNR;
                }

                break;
            }
        }

        status = nvme_check_zone_write(ns, zone, slba, nlb);
        if (status) {
            goto invalid;
        }

        status = nvme_zrm_auto(n, ns, zone);
        if (status) {
            goto invalid;
        }

        zone->w_ptr += nlb;
    }

    data_offset = nvme_l2b(ns, slba);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    if (!wrz) {
        status = nvme_map_data(n, nlb, req);
        if (status) {
            goto invalid;
        }

        block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                         BLOCK_ACCT_WRITE);
        nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
    } else {
        req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
                                           BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
                                           req);
    }

    return NVME_NO_COMPLETE;

invalid:
    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
    return status | NVME_DNR;
}

static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, false, false);
}

static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, false, true);
}

static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, true, false);
}

static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
                                            uint64_t *slba, uint32_t *zone_idx)
{
    uint32_t dw10 = le32_to_cpu(c->cdw10);
    uint32_t dw11 = le32_to_cpu(c->cdw11);

    if (!ns->params.zoned) {
        trace_pci_nvme_err_invalid_opc(c->opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    *slba = ((uint64_t)dw11) << 32 | dw10;
    if (unlikely(*slba >= ns->id_ns.nsze)) {
        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
        *slba = 0;
        return NVME_LBA_RANGE | NVME_DNR;
    }

    *zone_idx = nvme_zone_idx(ns, *slba);
    assert(*zone_idx < ns->num_zones);

    return NVME_SUCCESS;
}

typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
                                 NvmeRequest *);

enum NvmeZoneProcessingMask {
    NVME_PROC_CURRENT_ZONE    = 0,
    NVME_PROC_OPENED_ZONES    = 1 << 0,
    NVME_PROC_CLOSED_ZONES    = 1 << 1,
    NVME_PROC_READ_ONLY_ZONES = 1 << 2,
    NVME_PROC_FULL_ZONES      = 1 << 3,
};

static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
                               NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_open(nvme_ctrl(req), ns, zone);
}

static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
                                NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_close(ns, zone);
}

static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
                                 NvmeZoneState state, NvmeRequest *req)
{
    return nvme_zrm_finish(ns, zone);
}

static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
                                  NvmeZoneState state, NvmeRequest *req)
{
    switch (state) {
    case NVME_ZONE_STATE_READ_ONLY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);

        /* fallthrough */

    case NVME_ZONE_STATE_OFFLINE:
        return NVME_SUCCESS;
    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
{
    uint16_t status;
    uint8_t state = nvme_get_zone_state(zone);

    if (state == NVME_ZONE_STATE_EMPTY) {
        status = nvme_aor_check(ns, 1, 0);
        if (status) {
            return status;
        }
        nvme_aor_inc_active(ns);
        zone->d.za |= NVME_ZA_ZD_EXT_VALID;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        return NVME_SUCCESS;
    }

    return NVME_ZONE_INVAL_TRANSITION;
}

static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
                                    enum NvmeZoneProcessingMask proc_mask,
                                    op_handler_t op_hndlr, NvmeRequest *req)
{
    uint16_t status = NVME_SUCCESS;
    NvmeZoneState zs = nvme_get_zone_state(zone);
    bool proc_zone;

    switch (zs) {
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
        break;
    case NVME_ZONE_STATE_CLOSED:
        proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
        break;
    case NVME_ZONE_STATE_READ_ONLY:
        proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
        break;
    case NVME_ZONE_STATE_FULL:
        proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
        break;
    default:
        proc_zone = false;
    }

    if (proc_zone) {
        status = op_hndlr(ns, zone, zs, req);
    }

    return status;
}

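/*
 * Apply a zone operation either to the current zone only (empty proc_mask)
 * or, for the "Select All" variants, to every zone in the states selected by
 * proc_mask. Read-only zones are not tracked on a list, so they are found by
 * scanning the full zone array.
 */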
static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
                                enum NvmeZoneProcessingMask proc_mask,
                                op_handler_t op_hndlr, NvmeRequest *req)
{
    NvmeZone *next;
    uint16_t status = NVME_SUCCESS;
    int i;

    if (!proc_mask) {
        status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
    } else {
        if (proc_mask & NVME_PROC_CLOSED_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
        if (proc_mask & NVME_PROC_OPENED_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }

            QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
        if (proc_mask & NVME_PROC_FULL_ZONES) {
            QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }

        if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
            for (i = 0; i < ns->num_zones; i++, zone++) {
                status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
                                             req);
                if (status && status != NVME_NO_COMPLETE) {
                    goto out;
                }
            }
        }
    }

out:
    return status;
}

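/*
 * Zone reset state machine: nvme_zone_reset_cb walks the zone array (all
 * zones for "Select All", otherwise just the addressed one), zeroing each
 * resettable zone's data; nvme_zone_reset_epilogue_cb then zeroes the
 * metadata region, if any, before the loop continues.
 */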
typedef struct NvmeZoneResetAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    bool all;
    int idx;
    NvmeZone *zone;
} NvmeZoneResetAIOCB;

static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
{
    NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;

    iocb->idx = ns->num_zones;

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
        iocb->aiocb = NULL;
    }
}

static const AIOCBInfo nvme_zone_reset_aiocb_info = {
    .aiocb_size = sizeof(NvmeZoneResetAIOCB),
    .cancel_async = nvme_zone_reset_cancel,
};

static void nvme_zone_reset_bh(void *opaque)
{
    NvmeZoneResetAIOCB *iocb = opaque;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);
}

static void nvme_zone_reset_cb(void *opaque, int ret);

static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
{
    NvmeZoneResetAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;
    int64_t moff;
    int count;

    if (ret < 0) {
        nvme_zone_reset_cb(iocb, ret);
        return;
    }

    if (!ns->lbaf.ms) {
        nvme_zone_reset_cb(iocb, 0);
        return;
    }

    moff = nvme_moff(ns, iocb->zone->d.zslba);
    count = nvme_m2b(ns, ns->zone_size);

    iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
                                        BDRV_REQ_MAY_UNMAP,
                                        nvme_zone_reset_cb, iocb);
    return;
}

static void nvme_zone_reset_cb(void *opaque, int ret)
{
    NvmeZoneResetAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeNamespace *ns = req->ns;

    if (ret < 0) {
        iocb->ret = ret;
        goto done;
    }

    if (iocb->zone) {
        nvme_zrm_reset(ns, iocb->zone);

        if (!iocb->all) {
            goto done;
        }
    }

    while (iocb->idx < ns->num_zones) {
        NvmeZone *zone = &ns->zone_array[iocb->idx++];

        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EMPTY:
            if (!iocb->all) {
                goto done;
            }

            continue;

        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        case NVME_ZONE_STATE_CLOSED:
        case NVME_ZONE_STATE_FULL:
            iocb->zone = zone;
            break;

        default:
            continue;
        }

        trace_pci_nvme_zns_zone_reset(zone->d.zslba);

        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
                                            nvme_l2b(ns, zone->d.zslba),
                                            nvme_l2b(ns, ns->zone_size),
                                            BDRV_REQ_MAY_UNMAP,
                                            nvme_zone_reset_epilogue_cb,
                                            iocb);
        return;
    }

done:
    iocb->aiocb = NULL;
    if (iocb->bh) {
        qemu_bh_schedule(iocb->bh);
    }
}

static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    NvmeZone *zone;
    NvmeZoneResetAIOCB *iocb;
    uint8_t *zd_ext;
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint64_t slba = 0;
    uint32_t zone_idx = 0;
    uint16_t status;
    uint8_t action;
    bool all;
    enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;

    action = dw13 & 0xff;
    all = !!(dw13 & 0x100);

    req->status = NVME_SUCCESS;

    if (!all) {
        status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
        if (status) {
            return status;
        }
    }

    zone = &ns->zone_array[zone_idx];
    if (slba != zone->d.zslba) {
        trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    switch (action) {

    case NVME_ZONE_ACTION_OPEN:
        if (all) {
            proc_mask = NVME_PROC_CLOSED_ZONES;
        }
        trace_pci_nvme_open_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
        break;

    case NVME_ZONE_ACTION_CLOSE:
        if (all) {
            proc_mask = NVME_PROC_OPENED_ZONES;
        }
        trace_pci_nvme_close_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
        break;

    case NVME_ZONE_ACTION_FINISH:
        if (all) {
            proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
        }
        trace_pci_nvme_finish_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
        break;

    case NVME_ZONE_ACTION_RESET:
        trace_pci_nvme_reset_zone(slba, zone_idx, all);

        iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
                           nvme_misc_cb, req);

        iocb->req = req;
        iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
        iocb->ret = 0;
        iocb->all = all;
        iocb->idx = zone_idx;
        iocb->zone = NULL;

        req->aiocb = &iocb->common;
        nvme_zone_reset_cb(iocb, 0);

        return NVME_NO_COMPLETE;

    case NVME_ZONE_ACTION_OFFLINE:
        if (all) {
            proc_mask = NVME_PROC_READ_ONLY_ZONES;
        }
        trace_pci_nvme_offline_zone(slba, zone_idx, all);
        status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
        break;

    case NVME_ZONE_ACTION_SET_ZD_EXT:
        trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
        if (all || !ns->params.zd_extension_size) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
        zd_ext = nvme_get_zd_extension(ns, zone_idx);
        status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
        if (status) {
            trace_pci_nvme_err_zd_extension_map_error(zone_idx);
            return status;
        }

        status = nvme_set_zd_ext(ns, zone);
        if (status == NVME_SUCCESS) {
            trace_pci_nvme_zd_extension_set(zone_idx);
            return status;
        }
        break;

    default:
        trace_pci_nvme_err_invalid_mgmt_action(action);
        status = NVME_INVALID_FIELD;
    }

    if (status == NVME_ZONE_INVAL_TRANSITION) {
        trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
                                                         zone->d.za);
    }
    if (status) {
        status |= NVME_DNR;
    }

    return status;
}

static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
{
    NvmeZoneState zs = nvme_get_zone_state(zl);

    switch (zafs) {
    case NVME_ZONE_REPORT_ALL:
        return true;
    case NVME_ZONE_REPORT_EMPTY:
        return zs == NVME_ZONE_STATE_EMPTY;
    case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
        return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
    case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
        return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
    case NVME_ZONE_REPORT_CLOSED:
        return zs == NVME_ZONE_STATE_CLOSED;
    case NVME_ZONE_REPORT_FULL:
        return zs == NVME_ZONE_STATE_FULL;
    case NVME_ZONE_REPORT_READ_ONLY:
        return zs == NVME_ZONE_STATE_READ_ONLY;
    case NVME_ZONE_REPORT_OFFLINE:
        return zs == NVME_ZONE_STATE_OFFLINE;
    default:
        return false;
    }
}

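/*
 * Zone Management Receive (report zones): a first pass counts the zones
 * matching the filter so the header can report the total, and a second pass
 * fills in the zone descriptors (plus descriptor extensions for extended
 * reports) up to the buffer capacity.
 */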
static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;

    uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint32_t zone_idx, zra, zrasf, partial;
    uint64_t max_zones, nr_zones = 0;
    uint16_t status;
    uint64_t slba;
    NvmeZoneDescr *z;
    NvmeZone *zone;
    NvmeZoneReportHeader *header;
    void *buf, *buf_p;
    size_t zone_entry_sz;
    int i;

    req->status = NVME_SUCCESS;

    status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
    if (status) {
        return status;
    }

    zra = dw13 & 0xff;
    if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    zrasf = (dw13 >> 8) & 0xff;
    if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (data_size < sizeof(NvmeZoneReportHeader)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    status = nvme_check_mdts(n, data_size);
    if (status) {
        return status;
    }

    partial = (dw13 >> 16) & 0x01;

    zone_entry_sz = sizeof(NvmeZoneDescr);
    if (zra == NVME_ZONE_REPORT_EXTENDED) {
        zone_entry_sz += ns->params.zd_extension_size;
    }

    max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
    buf = g_malloc0(data_size);

    zone = &ns->zone_array[zone_idx];
    for (i = zone_idx; i < ns->num_zones; i++) {
        if (partial && nr_zones >= max_zones) {
            break;
        }
        if (nvme_zone_matches_filter(zrasf, zone++)) {
            nr_zones++;
        }
    }
    header = (NvmeZoneReportHeader *)buf;
    header->nr_zones = cpu_to_le64(nr_zones);

    buf_p = buf + sizeof(NvmeZoneReportHeader);
    for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
        zone = &ns->zone_array[zone_idx];
        if (nvme_zone_matches_filter(zrasf, zone)) {
            z = (NvmeZoneDescr *)buf_p;
            buf_p += sizeof(NvmeZoneDescr);

            z->zt = zone->d.zt;
            z->zs = zone->d.zs;
            z->zcap = cpu_to_le64(zone->d.zcap);
            z->zslba = cpu_to_le64(zone->d.zslba);
            z->za = zone->d.za;

            if (nvme_wp_is_valid(zone)) {
                z->wp = cpu_to_le64(zone->d.wp);
            } else {
                z->wp = cpu_to_le64(~0ULL);
            }

            if (zra == NVME_ZONE_REPORT_EXTENDED) {
                if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
                    memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
                           ns->params.zd_extension_size);
                }
                buf_p += ns->params.zd_extension_size;
            }

            max_zones--;
        }
    }

    status = nvme_c2h(n, (uint8_t *)buf, data_size, req);

    g_free(buf);

    return status;
}

static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);

    trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
                          req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));

    if (!nvme_nsid_valid(n, nsid)) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    /*
     * Handle Flush before resolving the namespace: Flush may carry the
     * broadcast NSID (FFFFFFFFh), in which case it applies to all attached
     * namespaces rather than a single one. All command sets supported here
     * (NVM and Zoned) include the NVM Flush command, so an NVM Flush is
     * always performed.
     */
    if (req->cmd.opcode == NVME_CMD_FLUSH) {
        return nvme_flush(n, req);
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    if (ns->status) {
        return ns->status;
    }

    req->ns = ns;

    switch (req->cmd.opcode) {
    case NVME_CMD_WRITE_ZEROES:
        return nvme_write_zeroes(n, req);
    case NVME_CMD_ZONE_APPEND:
        return nvme_zone_append(n, req);
    case NVME_CMD_WRITE:
        return nvme_write(n, req);
    case NVME_CMD_READ:
        return nvme_read(n, req);
    case NVME_CMD_COMPARE:
        return nvme_compare(n, req);
    case NVME_CMD_DSM:
        return nvme_dsm(n, req);
    case NVME_CMD_VERIFY:
        return nvme_verify(n, req);
    case NVME_CMD_COPY:
        return nvme_copy(n, req);
    case NVME_CMD_ZONE_MGMT_SEND:
        return nvme_zone_mgmt_send(n, req);
    case NVME_CMD_ZONE_MGMT_RECV:
        return nvme_zone_mgmt_recv(n, req);
    default:
        assert(false);
    }

    return NVME_INVALID_OPCODE | NVME_DNR;
}

static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
    n->sq[sq->sqid] = NULL;
    timer_free(sq->timer);
    g_free(sq->io_req);
    if (sq->sqid) {
        g_free(sq);
    }
}

static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
    NvmeRequest *r, *next;
    NvmeSQueue *sq;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
        trace_pci_nvme_err_invalid_del_sq(qid);
        return NVME_INVALID_QID | NVME_DNR;
    }

    trace_pci_nvme_del_sq(qid);

    sq = n->sq[qid];
    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
        r = QTAILQ_FIRST(&sq->out_req_list);
        assert(r->aiocb);
        blk_aio_cancel(r->aiocb);
    }

    assert(QTAILQ_EMPTY(&sq->out_req_list));

    if (!nvme_check_cqid(n, sq->cqid)) {
        cq = n->cq[sq->cqid];
        QTAILQ_REMOVE(&cq->sq_list, sq, entry);

        nvme_post_cqes(cq);
        QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
            if (r->sq == sq) {
                QTAILQ_REMOVE(&cq->req_list, r, entry);
                QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
            }
        }
    }

    nvme_free_sq(sq, n);
    return NVME_SUCCESS;
}

static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t sqid, uint16_t cqid, uint16_t size)
{
    int i;
    NvmeCQueue *cq;

    sq->ctrl = n;
    sq->dma_addr = dma_addr;
    sq->sqid = sqid;
    sq->size = size;
    sq->cqid = cqid;
    sq->head = sq->tail = 0;
    sq->io_req = g_new0(NvmeRequest, sq->size);

    QTAILQ_INIT(&sq->req_list);
    QTAILQ_INIT(&sq->out_req_list);
    for (i = 0; i < sq->size; i++) {
        sq->io_req[i].sq = sq;
        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
    }
    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);

    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
    n->sq[sqid] = sq;
}

static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeSQueue *sq;
    NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;

    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t sqid = le16_to_cpu(c->sqid);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->sq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);

    if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
        trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
                 n->sq[sqid] != NULL)) {
        trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
        trace_pci_nvme_err_invalid_create_sq_size(qsize);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (unlikely(prp1 & (n->page_size - 1))) {
        trace_pci_nvme_err_invalid_create_sq_addr(prp1);
        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
    }
    if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
        trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    sq = g_malloc0(sizeof(*sq));
    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
    return NVME_SUCCESS;
}

struct nvme_stats {
    uint64_t units_read;
    uint64_t units_written;
    uint64_t read_commands;
    uint64_t write_commands;
};

static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
{
    BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);

    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
    stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
    stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
}

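/*
 * SMART / Health log page. Statistics are aggregated from the block backend
 * of either the addressed namespace or, for the broadcast NSID, all attached
 * namespaces. The spec counts data units in units of 1000 512-byte blocks,
 * hence the DIV_ROUND_UP by 1000 below.
 */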
static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    struct nvme_stats stats = { 0 };
    NvmeSmartLog smart = { 0 };
    uint32_t trans_len;
    NvmeNamespace *ns;
    time_t current_ms;

    if (off >= sizeof(smart)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nsid != 0xffffffff) {
        ns = nvme_ns(n, nsid);
        if (!ns) {
            return NVME_INVALID_NSID | NVME_DNR;
        }
        nvme_set_blk_stats(ns, &stats);
    } else {
        int i;

        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }
            nvme_set_blk_stats(ns, &stats);
        }
    }

    trans_len = MIN(sizeof(smart) - off, buf_len);
    smart.critical_warning = n->smart_critical_warning;

    smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
                                                        1000));
    smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
                                                           1000));
    smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
    smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);

    smart.temperature = cpu_to_le16(n->temperature);

    if ((n->temperature >= n->features.temp_thresh_hi) ||
        (n->temperature <= n->features.temp_thresh_low)) {
        smart.critical_warning |= NVME_SMART_TEMPERATURE;
    }

    current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    smart.power_on_hours[0] =
        cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_SMART);
    }

    return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
}

static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
                                 NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeFwSlotInfoLog fw_log = {
        .afi = 0x1,
    };

    if (off >= sizeof(fw_log)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
    trans_len = MIN(sizeof(fw_log) - off, buf_len);

    return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
}

static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeErrorLog errlog;

    if (off >= sizeof(errlog)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
    }

    memset(&errlog, 0x0, sizeof(errlog));
    trans_len = MIN(sizeof(errlog) - off, buf_len);

    return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
}

static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                    uint64_t off, NvmeRequest *req)
{
    uint32_t nslist[1024];
    uint32_t trans_len;
    int i = 0;
    uint32_t nsid;

    if (off >= sizeof(nslist)) {
        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    memset(nslist, 0x0, sizeof(nslist));
    trans_len = MIN(sizeof(nslist) - off, buf_len);

    while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
            NVME_CHANGED_NSID_SIZE) {
        /*
         * If more than 1024 namespaces have changed, the first entry of the
         * log page is set to FFFFFFFFh and the remaining entries are zeroed,
         * as required by the spec.
         */
        if (i == ARRAY_SIZE(nslist)) {
            memset(nslist, 0x0, sizeof(nslist));
            nslist[0] = 0xffffffff;
            break;
        }

        nslist[i++] = nsid;
        clear_bit(nsid, n->changed_nsids);
    }

    /*
     * Clear all remaining changed-namespace bits if the log overflowed and
     * was truncated to the FFFFFFFFh marker above.
     */
    if (nslist[0] == 0xffffffff) {
        bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
    }

    return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
}

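/*
 * Commands Supported and Effects log page. The admin command effects are
 * always reported; the I/O command effects depend on which command set is
 * selected in CC.CSS (and, when all supported sets are enabled, on the CSI
 * given in the command).
 */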
static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
                                 uint64_t off, NvmeRequest *req)
{
    NvmeEffectsLog log = {};
    const uint32_t *src_iocs = NULL;
    uint32_t trans_len;

    if (off >= sizeof(log)) {
        trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
    case NVME_CC_CSS_NVM:
        src_iocs = nvme_cse_iocs_nvm;
        /* fall through */
    case NVME_CC_CSS_ADMIN_ONLY:
        break;
    case NVME_CC_CSS_CSI:
        switch (csi) {
        case NVME_CSI_NVM:
            src_iocs = nvme_cse_iocs_nvm;
            break;
        case NVME_CSI_ZONED:
            src_iocs = nvme_cse_iocs_zoned;
            break;
        }
    }

    memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));

    if (src_iocs) {
        memcpy(log.iocs, src_iocs, sizeof(log.iocs));
    }

    trans_len = MIN(sizeof(log) - off, buf_len);

    return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
}

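/*
 * Get Log Page dispatch: the transfer length and offset are assembled from
 * the NUMDL/NUMDU and LPOL/LPOU fields (NUMD is a 0's based dword count,
 * hence the +1 and <<2 below), validated against MDTS and routed to the
 * handler for the requested log identifier.
 */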
static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;

    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t dw12 = le32_to_cpu(cmd->cdw12);
    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
    uint8_t lid = dw10 & 0xff;
    uint8_t lsp = (dw10 >> 8) & 0xf;
    uint8_t rae = (dw10 >> 15) & 0x1;
    uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
    uint32_t numdl, numdu;
    uint64_t off, lpol, lpou;
    size_t len;
    uint16_t status;

    numdl = (dw10 >> 16);
    numdu = (dw11 & 0xffff);
    lpol = dw12;
    lpou = dw13;

    len = (((numdu << 16) | numdl) + 1) << 2;
    off = (lpou << 32ULL) | lpol;

    if (off & 0x3) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;
    }

    switch (lid) {
    case NVME_LOG_ERROR_INFO:
        return nvme_error_info(n, rae, len, off, req);
    case NVME_LOG_SMART_INFO:
        return nvme_smart_info(n, rae, len, off, req);
    case NVME_LOG_FW_SLOT_INFO:
        return nvme_fw_log_info(n, len, off, req);
    case NVME_LOG_CHANGED_NSLIST:
        return nvme_changed_nslist(n, rae, len, off, req);
    case NVME_LOG_CMD_EFFECTS:
        return nvme_cmd_effects(n, csi, len, off, req);
    default:
        trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
        return NVME_INVALID_FIELD | NVME_DNR;
    }
}

static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
    n->cq[cq->cqid] = NULL;
    timer_free(cq->timer);
    if (msix_enabled(&n->parent_obj)) {
        msix_vector_unuse(&n->parent_obj, cq->vector);
    }
    if (cq->cqid) {
        g_free(cq);
    }
}

static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (unlikely(!qid || nvme_check_cqid(n, qid))) {
        trace_pci_nvme_err_invalid_del_cq_cqid(qid);
        return NVME_INVALID_CQID | NVME_DNR;
    }

    cq = n->cq[qid];
    if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
        trace_pci_nvme_err_invalid_del_cq_notempty(qid);
        return NVME_INVALID_QUEUE_DEL;
    }

    if (cq->irq_enabled && cq->tail != cq->head) {
        n->cq_pending--;
    }

    nvme_irq_deassert(n, cq);
    trace_pci_nvme_del_cq(qid);
    nvme_free_cq(cq, n);
    return NVME_SUCCESS;
}

static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
                         uint16_t cqid, uint16_t vector, uint16_t size,
                         uint16_t irq_enabled)
{
    int ret;

    if (msix_enabled(&n->parent_obj)) {
        ret = msix_vector_use(&n->parent_obj, vector);
        assert(ret == 0);
    }
    cq->ctrl = n;
    cq->cqid = cqid;
    cq->size = size;
    cq->dma_addr = dma_addr;
    cq->phase = 1;
    cq->irq_enabled = irq_enabled;
    cq->vector = vector;
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
    n->cq[cqid] = cq;
    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}

static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCQueue *cq;
    NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t vector = le16_to_cpu(c->irq_vector);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->cq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
                             NVME_CQ_FLAGS_IEN(qflags) != 0);

    if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
                 n->cq[cqid] != NULL)) {
        trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
        trace_pci_nvme_err_invalid_create_cq_size(qsize);
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (unlikely(prp1 & (n->page_size - 1))) {
        trace_pci_nvme_err_invalid_create_cq_addr(prp1);
        return NVME_INVALID_PRP_OFFSET | NVME_DNR;
    }
    if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
        trace_pci_nvme_err_invalid_create_cq_vector(vector);
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (unlikely(vector >= n->params.msix_qsize)) {
        trace_pci_nvme_err_invalid_create_cq_vector(vector);
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
        trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    cq = g_malloc0(sizeof(*cq));
    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
                 NVME_CQ_FLAGS_IEN(qflags));

    /*
     * It is only required to set qs_created when creating a completion
     * queue; creating a submission queue without a matching completion
     * queue will fail.
     */
    n->qs_created = true;
    return NVME_SUCCESS;
}

static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};

    return nvme_c2h(n, id, sizeof(id), req);
}

static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_identify_ctrl();

    return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
}

static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
    NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;

    trace_pci_nvme_identify_ctrl_csi(c->csi);

    switch (c->csi) {
    case NVME_CSI_NVM:
        id_nvm->vsl = n->params.vsl;
        id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
        break;

    case NVME_CSI_ZONED:
        ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
        break;

    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return nvme_c2h(n, id, sizeof(id), req);
}

static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);

    trace_pci_nvme_identify_ns(nsid);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        if (!active) {
            ns = nvme_subsys_ns(n->subsys, nsid);
            if (!ns) {
                return nvme_rpt_empty_id_struct(n, req);
            }
        } else {
            return nvme_rpt_empty_id_struct(n, req);
        }
    }

    if (active || ns->csi == NVME_CSI_NVM) {
        return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
    }

    return NVME_INVALID_CMD_SET | NVME_DNR;
}

static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
                                        bool attached)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint16_t min_id = le16_to_cpu(c->ctrlid);
    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
    uint16_t *ids = &list[1];
    NvmeNamespace *ns;
    NvmeCtrl *ctrl;
    int cntlid, nr_ids = 0;

    trace_pci_nvme_identify_ctrl_list(c->cns, min_id);

    if (!n->subsys) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (attached) {
        if (nsid == NVME_NSID_BROADCAST) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        ns = nvme_subsys_ns(n->subsys, nsid);
        if (!ns) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
        ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
        if (!ctrl) {
            continue;
        }

        if (attached && !nvme_ns(ctrl, nsid)) {
            continue;
        }

        ids[nr_ids++] = cntlid;
    }

    list[0] = nr_ids;

    return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
}

static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
                                     bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t nsid = le32_to_cpu(c->nsid);

    trace_pci_nvme_identify_ns_csi(nsid, c->csi);

    if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        if (!active) {
            ns = nvme_subsys_ns(n->subsys, nsid);
            if (!ns) {
                return nvme_rpt_empty_id_struct(n, req);
            }
        } else {
            return nvme_rpt_empty_id_struct(n, req);
        }
    }

    if (c->csi == NVME_CSI_NVM) {
        return nvme_rpt_empty_id_struct(n, req);
    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
        return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
                        req);
    }

    return NVME_INVALID_FIELD | NVME_DNR;
}

static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
                                     bool active)
{
    NvmeNamespace *ns;
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint32_t min_nsid = le32_to_cpu(c->nsid);
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
    static const int data_len = sizeof(list);
    uint32_t *list_ptr = (uint32_t *)list;
    int i, j = 0;

    trace_pci_nvme_identify_nslist(min_nsid);

    /*
     * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
     * since the Active Namespace ID List should return namespaces with ids
     * *higher* than the NSID specified in the command.
     */
    if (min_nsid >= NVME_NSID_BROADCAST - 1) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
        ns = nvme_ns(n, i);
        if (!ns) {
            if (!active) {
                ns = nvme_subsys_ns(n->subsys, i);
                if (!ns) {
                    continue;
                }
            } else {
                continue;
            }
        }
        if (ns->params.nsid <= min_nsid) {
            continue;
        }
        list_ptr[j++] = cpu_to_le32(ns->params.nsid);
        if (j == data_len / sizeof(uint32_t)) {
            break;
        }
    }

    return nvme_c2h(n, list, data_len, req);
}

4613static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4614 bool active)
4615{
4616 NvmeNamespace *ns;
4617 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4618 uint32_t min_nsid = le32_to_cpu(c->nsid);
4619 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4620 static const int data_len = sizeof(list);
4621 uint32_t *list_ptr = (uint32_t *)list;
4622 int i, j = 0;
4623
4624 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4625
    /*
     * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFFEh are invalid.
     */
4629 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4630 return NVME_INVALID_NSID | NVME_DNR;
4631 }
4632
4633 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4634 return NVME_INVALID_FIELD | NVME_DNR;
4635 }
4636
4637 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4638 ns = nvme_ns(n, i);
4639 if (!ns) {
4640 if (!active) {
4641 ns = nvme_subsys_ns(n->subsys, i);
4642 if (!ns) {
4643 continue;
4644 }
4645 } else {
4646 continue;
4647 }
4648 }
4649 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4650 continue;
4651 }
4652 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4653 if (j == data_len / sizeof(uint32_t)) {
4654 break;
4655 }
4656 }
4657
4658 return nvme_c2h(n, list, data_len, req);
4659}
4660
4661static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4662{
4663 NvmeNamespace *ns;
4664 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4665 uint32_t nsid = le32_to_cpu(c->nsid);
4666 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4667 uint8_t *pos = list;
4668 struct {
4669 NvmeIdNsDescr hdr;
4670 uint8_t v[NVME_NIDL_UUID];
4671 } QEMU_PACKED uuid = {};
4672 struct {
4673 NvmeIdNsDescr hdr;
4674 uint64_t v;
4675 } QEMU_PACKED eui64 = {};
4676 struct {
4677 NvmeIdNsDescr hdr;
4678 uint8_t v;
4679 } QEMU_PACKED csi = {};
4680
4681 trace_pci_nvme_identify_ns_descr_list(nsid);
4682
4683 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4684 return NVME_INVALID_NSID | NVME_DNR;
4685 }
4686
4687 ns = nvme_ns(n, nsid);
4688 if (unlikely(!ns)) {
4689 return NVME_INVALID_FIELD | NVME_DNR;
4690 }
4691
    /*
     * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
     * provide a valid Namespace UUID in the Namespace Identification
     * Descriptor data structure. QEMU does not yet support setting NGUID.
     */
4697 uuid.hdr.nidt = NVME_NIDT_UUID;
4698 uuid.hdr.nidl = NVME_NIDL_UUID;
4699 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4700 memcpy(pos, &uuid, sizeof(uuid));
4701 pos += sizeof(uuid);
4702
4703 if (ns->params.eui64) {
4704 eui64.hdr.nidt = NVME_NIDT_EUI64;
4705 eui64.hdr.nidl = NVME_NIDL_EUI64;
4706 eui64.v = cpu_to_be64(ns->params.eui64);
4707 memcpy(pos, &eui64, sizeof(eui64));
4708 pos += sizeof(eui64);
4709 }
4710
4711 csi.hdr.nidt = NVME_NIDT_CSI;
4712 csi.hdr.nidl = NVME_NIDL_CSI;
4713 csi.v = ns->csi;
4714 memcpy(pos, &csi, sizeof(csi));
4715 pos += sizeof(csi);
4716
4717 return nvme_c2h(n, list, sizeof(list), req);
4718}
4719
4720static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4721{
4722 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4723 static const int data_len = sizeof(list);
4724
4725 trace_pci_nvme_identify_cmd_set();
4726
4727 NVME_SET_CSI(*list, NVME_CSI_NVM);
4728 NVME_SET_CSI(*list, NVME_CSI_ZONED);
4729
4730 return nvme_c2h(n, list, data_len, req);
4731}
4732
4733static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4734{
4735 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4736
4737 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4738 c->csi);
4739
4740 switch (c->cns) {
4741 case NVME_ID_CNS_NS:
4742 return nvme_identify_ns(n, req, true);
4743 case NVME_ID_CNS_NS_PRESENT:
4744 return nvme_identify_ns(n, req, false);
4745 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4746 return nvme_identify_ctrl_list(n, req, true);
4747 case NVME_ID_CNS_CTRL_LIST:
4748 return nvme_identify_ctrl_list(n, req, false);
4749 case NVME_ID_CNS_CS_NS:
4750 return nvme_identify_ns_csi(n, req, true);
4751 case NVME_ID_CNS_CS_NS_PRESENT:
4752 return nvme_identify_ns_csi(n, req, false);
4753 case NVME_ID_CNS_CTRL:
4754 return nvme_identify_ctrl(n, req);
4755 case NVME_ID_CNS_CS_CTRL:
4756 return nvme_identify_ctrl_csi(n, req);
4757 case NVME_ID_CNS_NS_ACTIVE_LIST:
4758 return nvme_identify_nslist(n, req, true);
4759 case NVME_ID_CNS_NS_PRESENT_LIST:
4760 return nvme_identify_nslist(n, req, false);
4761 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4762 return nvme_identify_nslist_csi(n, req, true);
4763 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4764 return nvme_identify_nslist_csi(n, req, false);
4765 case NVME_ID_CNS_NS_DESCR_LIST:
4766 return nvme_identify_ns_descr_list(n, req);
4767 case NVME_ID_CNS_IO_COMMAND_SET:
4768 return nvme_identify_cmd_set(n, req);
4769 default:
4770 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4771 return NVME_INVALID_FIELD | NVME_DNR;
4772 }
4773}
4774
4775static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4776{
4777 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4778
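    /*
     * Dword 0 bit 0 set to 1 means the command was not aborted; this
     * controller completes commands synchronously, so there is never a
     * command in flight to abort.
     */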
4779 req->cqe.result = 1;
4780 if (nvme_check_sqid(n, sqid)) {
4781 return NVME_INVALID_FIELD | NVME_DNR;
4782 }
4783
4784 return NVME_SUCCESS;
4785}
4786
4787static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4788{
4789 trace_pci_nvme_setfeat_timestamp(ts);
4790
4791 n->host_timestamp = le64_to_cpu(ts);
4792 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4793}
4794
4795static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4796{
4797 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4798 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4799
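    /*
     * The Timestamp data structure: a 48-bit timestamp in milliseconds, a
     * synch bit and a 3-bit origin field (see the Timestamp feature in the
     * spec).
     */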
4800 union nvme_timestamp {
4801 struct {
4802 uint64_t timestamp:48;
4803 uint64_t sync:1;
4804 uint64_t origin:3;
4805 uint64_t rsvd1:12;
4806 };
4807 uint64_t all;
4808 };
4809
4810 union nvme_timestamp ts;
4811 ts.all = 0;
4812 ts.timestamp = n->host_timestamp + elapsed_time;
4813
    /* If the host timestamp is non-zero, set the timestamp origin */
4815 ts.origin = n->host_timestamp ? 0x01 : 0x00;
4816
4817 trace_pci_nvme_getfeat_timestamp(ts.all);
4818
4819 return cpu_to_le64(ts.all);
4820}
4821
4822static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4823{
4824 uint64_t timestamp = nvme_get_timestamp(n);
4825
    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4827}
4828
4829static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4830{
4831 NvmeCmd *cmd = &req->cmd;
4832 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4833 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4834 uint32_t nsid = le32_to_cpu(cmd->nsid);
4835 uint32_t result;
4836 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4837 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4838 uint16_t iv;
4839 NvmeNamespace *ns;
4840 int i;
4841
4842 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4843 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4844 };
4845
4846 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4847
4848 if (!nvme_feature_support[fid]) {
4849 return NVME_INVALID_FIELD | NVME_DNR;
4850 }
4851
4852 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4853 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
            /*
             * The Reservation Notification Mask and Reservation Persistence
             * features require a status code of Invalid Field in Command when
             * the NSID is FFFFFFFFh. Since the device does not support the
             * Reservation Notification Mask and Reservation Persistence
             * features, we can always return Invalid Namespace or Format as
             * we are not checking for any particular feature.
             */
4861 return NVME_INVALID_NSID | NVME_DNR;
4862 }
4863
4864 if (!nvme_ns(n, nsid)) {
4865 return NVME_INVALID_FIELD | NVME_DNR;
4866 }
4867 }
4868
4869 switch (sel) {
4870 case NVME_GETFEAT_SELECT_CURRENT:
4871 break;
4872 case NVME_GETFEAT_SELECT_SAVED:
        /* no features are saveable by the controller; fallthrough */
4874 case NVME_GETFEAT_SELECT_DEFAULT:
4875 goto defaults;
4876 case NVME_GETFEAT_SELECT_CAP:
4877 result = nvme_feature_cap[fid];
4878 goto out;
4879 }
4880
4881 switch (fid) {
4882 case NVME_TEMPERATURE_THRESHOLD:
4883 result = 0;
        /*
         * The controller only implements the Composite Temperature sensor,
         * so return 0 for the rest of the sensors.
         */
4889 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4890 goto out;
4891 }
4892
4893 switch (NVME_TEMP_THSEL(dw11)) {
4894 case NVME_TEMP_THSEL_OVER:
4895 result = n->features.temp_thresh_hi;
4896 goto out;
4897 case NVME_TEMP_THSEL_UNDER:
4898 result = n->features.temp_thresh_low;
4899 goto out;
4900 }
4901
4902 return NVME_INVALID_FIELD | NVME_DNR;
4903 case NVME_ERROR_RECOVERY:
4904 if (!nvme_nsid_valid(n, nsid)) {
4905 return NVME_INVALID_NSID | NVME_DNR;
4906 }
4907
4908 ns = nvme_ns(n, nsid);
4909 if (unlikely(!ns)) {
4910 return NVME_INVALID_FIELD | NVME_DNR;
4911 }
4912
4913 result = ns->features.err_rec;
4914 goto out;
4915 case NVME_VOLATILE_WRITE_CACHE:
4916 result = 0;
4917 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4918 ns = nvme_ns(n, i);
4919 if (!ns) {
4920 continue;
4921 }
4922
4923 result = blk_enable_write_cache(ns->blkconf.blk);
4924 if (result) {
4925 break;
4926 }
4927 }
4928 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4929 goto out;
4930 case NVME_ASYNCHRONOUS_EVENT_CONF:
4931 result = n->features.async_config;
4932 goto out;
4933 case NVME_TIMESTAMP:
4934 return nvme_get_feature_timestamp(n, req);
4935 default:
4936 break;
4937 }
4938
4939defaults:
4940 switch (fid) {
4941 case NVME_TEMPERATURE_THRESHOLD:
4942 result = 0;
4943
4944 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4945 break;
4946 }
4947
4948 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4949 result = NVME_TEMPERATURE_WARNING;
4950 }
4951
4952 break;
4953 case NVME_NUMBER_OF_QUEUES:
4954 result = (n->params.max_ioqpairs - 1) |
4955 ((n->params.max_ioqpairs - 1) << 16);
4956 trace_pci_nvme_getfeat_numq(result);
4957 break;
4958 case NVME_INTERRUPT_VECTOR_CONF:
4959 iv = dw11 & 0xffff;
4960 if (iv >= n->params.max_ioqpairs + 1) {
4961 return NVME_INVALID_FIELD | NVME_DNR;
4962 }
4963
4964 result = iv;
4965 if (iv == n->admin_cq.vector) {
4966 result |= NVME_INTVC_NOCOALESCING;
4967 }
4968 break;
4969 default:
4970 result = nvme_feature_default[fid];
4971 break;
4972 }
4973
4974out:
4975 req->cqe.result = cpu_to_le32(result);
4976 return NVME_SUCCESS;
4977}
4978
4979static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4980{
4981 uint16_t ret;
4982 uint64_t timestamp;
4983
    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4985 if (ret) {
4986 return ret;
4987 }
4988
4989 nvme_set_timestamp(n, timestamp);
4990
4991 return NVME_SUCCESS;
4992}
4993
4994static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4995{
4996 NvmeNamespace *ns = NULL;
4997
4998 NvmeCmd *cmd = &req->cmd;
4999 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5000 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5001 uint32_t nsid = le32_to_cpu(cmd->nsid);
5002 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5003 uint8_t save = NVME_SETFEAT_SAVE(dw10);
5004 int i;
5005
5006 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5007
5008 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5009 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5010 }
5011
5012 if (!nvme_feature_support[fid]) {
5013 return NVME_INVALID_FIELD | NVME_DNR;
5014 }
5015
5016 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5017 if (nsid != NVME_NSID_BROADCAST) {
5018 if (!nvme_nsid_valid(n, nsid)) {
5019 return NVME_INVALID_NSID | NVME_DNR;
5020 }
5021
5022 ns = nvme_ns(n, nsid);
5023 if (unlikely(!ns)) {
5024 return NVME_INVALID_FIELD | NVME_DNR;
5025 }
5026 }
5027 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5028 if (!nvme_nsid_valid(n, nsid)) {
5029 return NVME_INVALID_NSID | NVME_DNR;
5030 }
5031
5032 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5033 }
5034
5035 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5036 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5037 }
5038
5039 switch (fid) {
5040 case NVME_TEMPERATURE_THRESHOLD:
5041 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5042 break;
5043 }
5044
5045 switch (NVME_TEMP_THSEL(dw11)) {
5046 case NVME_TEMP_THSEL_OVER:
5047 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5048 break;
5049 case NVME_TEMP_THSEL_UNDER:
5050 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5051 break;
5052 default:
5053 return NVME_INVALID_FIELD | NVME_DNR;
5054 }
5055
5056 if ((n->temperature >= n->features.temp_thresh_hi) ||
5057 (n->temperature <= n->features.temp_thresh_low)) {
5058 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
5059 }
5060
5061 break;
5062 case NVME_ERROR_RECOVERY:
5063 if (nsid == NVME_NSID_BROADCAST) {
5064 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5065 ns = nvme_ns(n, i);
5066
5067 if (!ns) {
5068 continue;
5069 }
5070
5071 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5072 ns->features.err_rec = dw11;
5073 }
5074 }
5075
5076 break;
5077 }
5078
5079 assert(ns);
5080 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5081 ns->features.err_rec = dw11;
5082 }
5083 break;
5084 case NVME_VOLATILE_WRITE_CACHE:
5085 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5086 ns = nvme_ns(n, i);
5087 if (!ns) {
5088 continue;
5089 }
5090
5091 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5092 blk_flush(ns->blkconf.blk);
5093 }
5094
5095 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5096 }
5097
5098 break;
5099
5100 case NVME_NUMBER_OF_QUEUES:
5101 if (n->qs_created) {
5102 return NVME_CMD_SEQ_ERROR | NVME_DNR;
5103 }
5104
        /*
         * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for
         * NCQR and NSQR.
         */
5109 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5110 return NVME_INVALID_FIELD | NVME_DNR;
5111 }
5112
5113 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5114 ((dw11 >> 16) & 0xffff) + 1,
5115 n->params.max_ioqpairs,
5116 n->params.max_ioqpairs);
5117 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
5118 ((n->params.max_ioqpairs - 1) << 16));
5119 break;
5120 case NVME_ASYNCHRONOUS_EVENT_CONF:
5121 n->features.async_config = dw11;
5122 break;
5123 case NVME_TIMESTAMP:
5124 return nvme_set_feature_timestamp(n, req);
5125 case NVME_COMMAND_SET_PROFILE:
5126 if (dw11 & 0x1ff) {
5127 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5128 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5129 }
5130 break;
5131 default:
5132 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5133 }
5134 return NVME_SUCCESS;
5135}
5136
5137static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5138{
5139 trace_pci_nvme_aer(nvme_cid(req));
5140
5141 if (n->outstanding_aers > n->params.aerl) {
5142 trace_pci_nvme_aer_aerl_exceeded();
5143 return NVME_AER_LIMIT_EXCEEDED;
5144 }
5145
5146 n->aer_reqs[n->outstanding_aers] = req;
5147 n->outstanding_aers++;
5148
5149 if (!QTAILQ_EMPTY(&n->aer_queue)) {
5150 nvme_process_aers(n);
5151 }
5152
5153 return NVME_NO_COMPLETE;
5154}
5155
5156static void nvme_update_dmrsl(NvmeCtrl *n)
5157{
5158 int nsid;
5159
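    /*
     * Recompute DMRSL (Dataset Management Range Size Limit) as the smallest
     * per-range limit, in logical blocks, across all attached namespaces.
     */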
5160 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5161 NvmeNamespace *ns = nvme_ns(n, nsid);
5162 if (!ns) {
5163 continue;
5164 }
5165
5166 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5167 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5168 }
5169}
5170
5171static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5172{
5173 uint32_t cc = ldl_le_p(&n->bar.cc);
5174
5175 ns->iocs = nvme_cse_iocs_none;
5176 switch (ns->csi) {
5177 case NVME_CSI_NVM:
5178 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5179 ns->iocs = nvme_cse_iocs_nvm;
5180 }
5181 break;
5182 case NVME_CSI_ZONED:
5183 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5184 ns->iocs = nvme_cse_iocs_zoned;
5185 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5186 ns->iocs = nvme_cse_iocs_nvm;
5187 }
5188 break;
5189 }
5190}
5191
5192static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5193{
5194 NvmeNamespace *ns;
5195 NvmeCtrl *ctrl;
5196 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5197 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5198 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
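    /* SEL (bits 3:0 of dword 10): 0h attaches the namespace, 1h detaches */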
5199 bool attach = !(dw10 & 0xf);
5200 uint16_t *nr_ids = &list[0];
5201 uint16_t *ids = &list[1];
5202 uint16_t ret;
5203 int i;
5204
5205 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5206
5207 if (!nvme_nsid_valid(n, nsid)) {
5208 return NVME_INVALID_NSID | NVME_DNR;
5209 }
5210
5211 ns = nvme_subsys_ns(n->subsys, nsid);
5212 if (!ns) {
5213 return NVME_INVALID_FIELD | NVME_DNR;
5214 }
5215
    ret = nvme_h2c(n, (uint8_t *)list, sizeof(list), req);
5217 if (ret) {
5218 return ret;
5219 }
5220
5221 if (!*nr_ids) {
5222 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5223 }
5224
5225 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5226 for (i = 0; i < *nr_ids; i++) {
5227 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5228 if (!ctrl) {
5229 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5230 }
5231
5232 if (attach) {
5233 if (nvme_ns(ctrl, nsid)) {
5234 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5235 }
5236
5237 if (ns->attached && !ns->params.shared) {
5238 return NVME_NS_PRIVATE | NVME_DNR;
5239 }
5240
5241 nvme_attach_ns(ctrl, ns);
5242 nvme_select_iocs_ns(ctrl, ns);
5243 } else {
5244 if (!nvme_ns(ctrl, nsid)) {
5245 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5246 }
5247
5248 ctrl->namespaces[nsid] = NULL;
5249 ns->attached--;
5250
5251 nvme_update_dmrsl(ctrl);
5252 }
5253
        /*
         * Add namespace id to the changed namespace id list for event
         * clearing via the Get Log Page command.
         */
5258 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5259 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5260 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5261 NVME_LOG_CHANGED_NSLIST);
5262 }
5263 }
5264
5265 return NVME_SUCCESS;
5266}
5267
5268typedef struct NvmeFormatAIOCB {
5269 BlockAIOCB common;
5270 BlockAIOCB *aiocb;
5271 QEMUBH *bh;
5272 NvmeRequest *req;
5273 int ret;
5274
5275 NvmeNamespace *ns;
5276 uint32_t nsid;
5277 bool broadcast;
5278 int64_t offset;
5279} NvmeFormatAIOCB;
5280
5281static void nvme_format_bh(void *opaque);
5282
5283static void nvme_format_cancel(BlockAIOCB *aiocb)
5284{
5285 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5286
5287 if (iocb->aiocb) {
5288 blk_aio_cancel_async(iocb->aiocb);
5289 }
5290}
5291
5292static const AIOCBInfo nvme_format_aiocb_info = {
5293 .aiocb_size = sizeof(NvmeFormatAIOCB),
5294 .cancel_async = nvme_format_cancel,
5295 .get_aio_context = nvme_get_aio_context,
5296};
5297
5298static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
5299{
5300 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5301 uint8_t lbaf = dw10 & 0xf;
5302 uint8_t pi = (dw10 >> 5) & 0x7;
5303 uint8_t mset = (dw10 >> 4) & 0x1;
5304 uint8_t pil = (dw10 >> 8) & 0x1;
5305
5306 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5307
5308 ns->id_ns.dps = (pil << 3) | pi;
5309 ns->id_ns.flbas = lbaf | (mset << 4);
5310
5311 nvme_ns_init_format(ns);
5312}
5313
5314static void nvme_format_ns_cb(void *opaque, int ret)
5315{
5316 NvmeFormatAIOCB *iocb = opaque;
5317 NvmeRequest *req = iocb->req;
5318 NvmeNamespace *ns = iocb->ns;
5319 int bytes;
5320
5321 if (ret < 0) {
5322 iocb->ret = ret;
5323 goto done;
5324 }
5325
5326 assert(ns);
5327
5328 if (iocb->offset < ns->size) {
5329 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5330
5331 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5332 bytes, BDRV_REQ_MAY_UNMAP,
5333 nvme_format_ns_cb, iocb);
5334
5335 iocb->offset += bytes;
5336 return;
5337 }
5338
5339 nvme_format_set(ns, &req->cmd);
5340 ns->status = 0x0;
5341 iocb->ns = NULL;
5342 iocb->offset = 0;
5343
5344done:
5345 iocb->aiocb = NULL;
5346 qemu_bh_schedule(iocb->bh);
5347}
5348
5349static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5350{
5351 if (ns->params.zoned) {
5352 return NVME_INVALID_FORMAT | NVME_DNR;
5353 }
5354
5355 if (lbaf > ns->id_ns.nlbaf) {
5356 return NVME_INVALID_FORMAT | NVME_DNR;
5357 }
5358
5359 if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
5360 return NVME_INVALID_FORMAT | NVME_DNR;
5361 }
5362
5363 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5364 return NVME_INVALID_FIELD | NVME_DNR;
5365 }
5366
5367 return NVME_SUCCESS;
5368}
5369
5370static void nvme_format_bh(void *opaque)
5371{
5372 NvmeFormatAIOCB *iocb = opaque;
5373 NvmeRequest *req = iocb->req;
5374 NvmeCtrl *n = nvme_ctrl(req);
5375 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5376 uint8_t lbaf = dw10 & 0xf;
5377 uint8_t pi = (dw10 >> 5) & 0x7;
5378 uint16_t status;
5379 int i;
5380
5381 if (iocb->ret < 0) {
5382 goto done;
5383 }
5384
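    /* on a broadcast format, advance to the next attached namespace */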
5385 if (iocb->broadcast) {
5386 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5387 iocb->ns = nvme_ns(n, i);
5388 if (iocb->ns) {
5389 iocb->nsid = i;
5390 break;
5391 }
5392 }
5393 }
5394
5395 if (!iocb->ns) {
5396 goto done;
5397 }
5398
5399 status = nvme_format_check(iocb->ns, lbaf, pi);
5400 if (status) {
5401 req->status = status;
5402 goto done;
5403 }
5404
5405 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5406 nvme_format_ns_cb(iocb, 0);
5407 return;
5408
5409done:
5410 qemu_bh_delete(iocb->bh);
5411 iocb->bh = NULL;
5412
5413 iocb->common.cb(iocb->common.opaque, iocb->ret);
5414
5415 qemu_aio_unref(iocb);
5416}
5417
5418static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5419{
5420 NvmeFormatAIOCB *iocb;
5421 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5422 uint16_t status;
5423
5424 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5425
5426 iocb->req = req;
5427 iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5428 iocb->ret = 0;
5429 iocb->ns = NULL;
5430 iocb->nsid = 0;
5431 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5432 iocb->offset = 0;
5433
5434 if (!iocb->broadcast) {
5435 if (!nvme_nsid_valid(n, nsid)) {
5436 status = NVME_INVALID_NSID | NVME_DNR;
5437 goto out;
5438 }
5439
5440 iocb->ns = nvme_ns(n, nsid);
5441 if (!iocb->ns) {
5442 status = NVME_INVALID_FIELD | NVME_DNR;
5443 goto out;
5444 }
5445 }
5446
5447 req->aiocb = &iocb->common;
5448 qemu_bh_schedule(iocb->bh);
5449
5450 return NVME_NO_COMPLETE;
5451
5452out:
5453 qemu_bh_delete(iocb->bh);
5454 iocb->bh = NULL;
5455 qemu_aio_unref(iocb);
5456 return status;
5457}
5458
5459static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5460{
5461 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5462 nvme_adm_opc_str(req->cmd.opcode));
5463
5464 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5465 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5466 return NVME_INVALID_OPCODE | NVME_DNR;
5467 }
5468
    /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5470 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5471 return NVME_INVALID_FIELD | NVME_DNR;
5472 }
5473
5474 switch (req->cmd.opcode) {
5475 case NVME_ADM_CMD_DELETE_SQ:
5476 return nvme_del_sq(n, req);
5477 case NVME_ADM_CMD_CREATE_SQ:
5478 return nvme_create_sq(n, req);
5479 case NVME_ADM_CMD_GET_LOG_PAGE:
5480 return nvme_get_log(n, req);
5481 case NVME_ADM_CMD_DELETE_CQ:
5482 return nvme_del_cq(n, req);
5483 case NVME_ADM_CMD_CREATE_CQ:
5484 return nvme_create_cq(n, req);
5485 case NVME_ADM_CMD_IDENTIFY:
5486 return nvme_identify(n, req);
5487 case NVME_ADM_CMD_ABORT:
5488 return nvme_abort(n, req);
5489 case NVME_ADM_CMD_SET_FEATURES:
5490 return nvme_set_feature(n, req);
5491 case NVME_ADM_CMD_GET_FEATURES:
5492 return nvme_get_feature(n, req);
5493 case NVME_ADM_CMD_ASYNC_EV_REQ:
5494 return nvme_aer(n, req);
5495 case NVME_ADM_CMD_NS_ATTACHMENT:
5496 return nvme_ns_attachment(n, req);
5497 case NVME_ADM_CMD_FORMAT_NVM:
5498 return nvme_format(n, req);
5499 default:
5500 assert(false);
5501 }
5502
5503 return NVME_INVALID_OPCODE | NVME_DNR;
5504}
5505
5506static void nvme_process_sq(void *opaque)
5507{
5508 NvmeSQueue *sq = opaque;
5509 NvmeCtrl *n = sq->ctrl;
5510 NvmeCQueue *cq = n->cq[sq->cqid];
5511
5512 uint16_t status;
5513 hwaddr addr;
5514 NvmeCmd cmd;
5515 NvmeRequest *req;
5516
5517 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5518 addr = sq->dma_addr + sq->head * n->sqe_size;
5519 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5520 trace_pci_nvme_err_addr_read(addr);
5521 trace_pci_nvme_err_cfs();
5522 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
5523 break;
5524 }
5525 nvme_inc_sq_head(sq);
5526
5527 req = QTAILQ_FIRST(&sq->req_list);
5528 QTAILQ_REMOVE(&sq->req_list, req, entry);
5529 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5530 nvme_req_clear(req);
5531 req->cqe.cid = cmd.cid;
5532 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5533
5534 status = sq->sqid ? nvme_io_cmd(n, req) :
5535 nvme_admin_cmd(n, req);
5536 if (status != NVME_NO_COMPLETE) {
5537 req->status = status;
5538 nvme_enqueue_req_completion(cq, req);
5539 }
5540 }
5541}
5542
5543static void nvme_ctrl_reset(NvmeCtrl *n)
5544{
5545 NvmeNamespace *ns;
5546 int i;
5547
5548 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5549 ns = nvme_ns(n, i);
5550 if (!ns) {
5551 continue;
5552 }
5553
5554 nvme_ns_drain(ns);
5555 }
5556
5557 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5558 if (n->sq[i] != NULL) {
5559 nvme_free_sq(n->sq[i], n);
5560 }
5561 }
5562 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5563 if (n->cq[i] != NULL) {
5564 nvme_free_cq(n->cq[i], n);
5565 }
5566 }
5567
5568 while (!QTAILQ_EMPTY(&n->aer_queue)) {
5569 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5570 QTAILQ_REMOVE(&n->aer_queue, event, entry);
5571 g_free(event);
5572 }
5573
5574 n->aer_queued = 0;
5575 n->outstanding_aers = 0;
5576 n->qs_created = false;
5577}
5578
5579static void nvme_ctrl_shutdown(NvmeCtrl *n)
5580{
5581 NvmeNamespace *ns;
5582 int i;
5583
5584 if (n->pmr.dev) {
5585 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5586 }
5587
5588 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5589 ns = nvme_ns(n, i);
5590 if (!ns) {
5591 continue;
5592 }
5593
5594 nvme_ns_shutdown(ns);
5595 }
5596}
5597
5598static void nvme_select_iocs(NvmeCtrl *n)
5599{
5600 NvmeNamespace *ns;
5601 int i;
5602
5603 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5604 ns = nvme_ns(n, i);
5605 if (!ns) {
5606 continue;
5607 }
5608
5609 nvme_select_iocs_ns(n, ns);
5610 }
5611}
5612
5613static int nvme_start_ctrl(NvmeCtrl *n)
5614{
5615 uint64_t cap = ldq_le_p(&n->bar.cap);
5616 uint32_t cc = ldl_le_p(&n->bar.cc);
5617 uint32_t aqa = ldl_le_p(&n->bar.aqa);
5618 uint64_t asq = ldq_le_p(&n->bar.asq);
5619 uint64_t acq = ldq_le_p(&n->bar.acq);
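    /* the host-selected memory page size is 2^(12 + CC.MPS) bytes */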
5620 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
5621 uint32_t page_size = 1 << page_bits;
5622
5623 if (unlikely(n->cq[0])) {
5624 trace_pci_nvme_err_startfail_cq();
5625 return -1;
5626 }
5627 if (unlikely(n->sq[0])) {
5628 trace_pci_nvme_err_startfail_sq();
5629 return -1;
5630 }
5631 if (unlikely(!asq)) {
5632 trace_pci_nvme_err_startfail_nbarasq();
5633 return -1;
5634 }
5635 if (unlikely(!acq)) {
5636 trace_pci_nvme_err_startfail_nbaracq();
5637 return -1;
5638 }
5639 if (unlikely(asq & (page_size - 1))) {
5640 trace_pci_nvme_err_startfail_asq_misaligned(asq);
5641 return -1;
5642 }
5643 if (unlikely(acq & (page_size - 1))) {
5644 trace_pci_nvme_err_startfail_acq_misaligned(acq);
5645 return -1;
5646 }
5647 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
5648 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
5649 return -1;
5650 }
5651 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
5652 trace_pci_nvme_err_startfail_page_too_small(
5653 NVME_CC_MPS(cc),
5654 NVME_CAP_MPSMIN(cap));
5655 return -1;
5656 }
5657 if (unlikely(NVME_CC_MPS(cc) >
5658 NVME_CAP_MPSMAX(cap))) {
5659 trace_pci_nvme_err_startfail_page_too_large(
5660 NVME_CC_MPS(cc),
5661 NVME_CAP_MPSMAX(cap));
5662 return -1;
5663 }
5664 if (unlikely(NVME_CC_IOCQES(cc) <
5665 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5666 trace_pci_nvme_err_startfail_cqent_too_small(
5667 NVME_CC_IOCQES(cc),
                    NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
5669 return -1;
5670 }
5671 if (unlikely(NVME_CC_IOCQES(cc) >
5672 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5673 trace_pci_nvme_err_startfail_cqent_too_large(
5674 NVME_CC_IOCQES(cc),
                    NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
5676 return -1;
5677 }
5678 if (unlikely(NVME_CC_IOSQES(cc) <
5679 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5680 trace_pci_nvme_err_startfail_sqent_too_small(
5681 NVME_CC_IOSQES(cc),
                    NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
5683 return -1;
5684 }
5685 if (unlikely(NVME_CC_IOSQES(cc) >
5686 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5687 trace_pci_nvme_err_startfail_sqent_too_large(
5688 NVME_CC_IOSQES(cc),
                    NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
5690 return -1;
5691 }
5692 if (unlikely(!NVME_AQA_ASQS(aqa))) {
5693 trace_pci_nvme_err_startfail_asqent_sz_zero();
5694 return -1;
5695 }
5696 if (unlikely(!NVME_AQA_ACQS(aqa))) {
5697 trace_pci_nvme_err_startfail_acqent_sz_zero();
5698 return -1;
5699 }
5700
5701 n->page_bits = page_bits;
5702 n->page_size = page_size;
5703 n->max_prp_ents = n->page_size / sizeof(uint64_t);
5704 n->cqe_size = 1 << NVME_CC_IOCQES(cc);
5705 n->sqe_size = 1 << NVME_CC_IOSQES(cc);
5706 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
5707 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
5708
5709 nvme_set_timestamp(n, 0ULL);
5710
5711 QTAILQ_INIT(&n->aer_queue);
5712
5713 nvme_select_iocs(n);
5714
5715 return 0;
5716}
5717
5718static void nvme_cmb_enable_regs(NvmeCtrl *n)
5719{
5720 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
5721 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
5722
5723 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
5724 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
5725 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
5726 stl_le_p(&n->bar.cmbloc, cmbloc);
5727
5728 NVME_CMBSZ_SET_SQS(cmbsz, 1);
5729 NVME_CMBSZ_SET_CQS(cmbsz, 0);
5730 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
5731 NVME_CMBSZ_SET_RDS(cmbsz, 1);
5732 NVME_CMBSZ_SET_WDS(cmbsz, 1);
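    /* an SZU value of 2 selects a 1 MiB granularity for SZ */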
5733 NVME_CMBSZ_SET_SZU(cmbsz, 2);
5734 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
5735 stl_le_p(&n->bar.cmbsz, cmbsz);
5736}
5737
5738static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5739 unsigned size)
5740{
5741 uint64_t cap = ldq_le_p(&n->bar.cap);
5742 uint32_t cc = ldl_le_p(&n->bar.cc);
5743 uint32_t intms = ldl_le_p(&n->bar.intms);
5744 uint32_t csts = ldl_le_p(&n->bar.csts);
5745 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
5746
5747 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5748 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5749 "MMIO write not 32-bit aligned,"
5750 " offset=0x%"PRIx64"", offset);
        /* should be ignored, fall through for now */
5752 }
5753
5754 if (unlikely(size < sizeof(uint32_t))) {
5755 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5756 "MMIO write smaller than 32-bits,"
5757 " offset=0x%"PRIx64", size=%u",
5758 offset, size);
        /* should be ignored, fall through for now */
5760 }
5761
5762 switch (offset) {
5763 case NVME_REG_INTMS:
5764 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5765 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5766 "undefined access to interrupt mask set"
5767 " when MSI-X is enabled");
            /* should be ignored, fall through for now */
5769 }
5770 intms |= data;
5771 stl_le_p(&n->bar.intms, intms);
5772 n->bar.intmc = n->bar.intms;
5773 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
5774 nvme_irq_check(n);
5775 break;
5776 case NVME_REG_INTMC:
5777 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5778 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5779 "undefined access to interrupt mask clr"
5780 " when MSI-X is enabled");
            /* should be ignored, fall through for now */
5782 }
5783 intms &= ~data;
5784 stl_le_p(&n->bar.intms, intms);
5785 n->bar.intmc = n->bar.intms;
5786 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
5787 nvme_irq_check(n);
5788 break;
5789 case NVME_REG_CC:
5790 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
        /* Windows first sends data, then sends the enable bit */
        if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
            !NVME_CC_SHN(data) && !NVME_CC_SHN(cc)) {
5796 cc = data;
5797 }
5798
5799 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
5800 cc = data;
            /* flush CC since nvme_start_ctrl() needs the value */
5803 stl_le_p(&n->bar.cc, cc);
5804 if (unlikely(nvme_start_ctrl(n))) {
5805 trace_pci_nvme_err_startfail();
5806 csts = NVME_CSTS_FAILED;
5807 } else {
5808 trace_pci_nvme_mmio_start_success();
5809 csts = NVME_CSTS_READY;
5810 }
5811 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
5812 trace_pci_nvme_mmio_stopped();
5813 nvme_ctrl_reset(n);
5814 cc = 0;
5815 csts &= ~NVME_CSTS_READY;
5816 }
5817
5818 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
5819 trace_pci_nvme_mmio_shutdown_set();
5820 nvme_ctrl_shutdown(n);
5821 cc = data;
5822 csts |= NVME_CSTS_SHST_COMPLETE;
5823 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
5824 trace_pci_nvme_mmio_shutdown_cleared();
5825 csts &= ~NVME_CSTS_SHST_COMPLETE;
5826 cc = data;
5827 }
5828
5829 stl_le_p(&n->bar.cc, cc);
5830 stl_le_p(&n->bar.csts, csts);
5831
5832 break;
5833 case NVME_REG_CSTS:
5834 if (data & (1 << 4)) {
5835 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5836 "attempted to W1C CSTS.NSSRO"
5837 " but CAP.NSSRS is zero (not supported)");
5838 } else if (data != 0) {
5839 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5840 "attempted to set a read only bit"
5841 " of controller status");
5842 }
5843 break;
5844 case NVME_REG_NSSR:
5845 if (data == 0x4e564d65) {
5846 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5847 } else {
            /* The spec says that writes of other values have no effect */
5849 return;
5850 }
5851 break;
5852 case NVME_REG_AQA:
5853 stl_le_p(&n->bar.aqa, data);
5854 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5855 break;
5856 case NVME_REG_ASQ:
5857 stn_le_p(&n->bar.asq, size, data);
5858 trace_pci_nvme_mmio_asqaddr(data);
5859 break;
5860 case NVME_REG_ASQ + 4:
5861 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
5862 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
5863 break;
5864 case NVME_REG_ACQ:
5865 trace_pci_nvme_mmio_acqaddr(data);
5866 stn_le_p(&n->bar.acq, size, data);
5867 break;
5868 case NVME_REG_ACQ + 4:
5869 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
5870 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
5871 break;
5872 case NVME_REG_CMBLOC:
5873 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5874 "invalid write to reserved CMBLOC"
5875 " when CMBSZ is zero, ignored");
5876 return;
5877 case NVME_REG_CMBSZ:
5878 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5879 "invalid write to read only CMBSZ, ignored");
5880 return;
5881 case NVME_REG_CMBMSC:
5882 if (!NVME_CAP_CMBS(cap)) {
5883 return;
5884 }
5885
5886 stn_le_p(&n->bar.cmbmsc, size, data);
5887 n->cmb.cmse = false;
5888
5889 if (NVME_CMBMSC_CRE(data)) {
5890 nvme_cmb_enable_regs(n);
5891
5892 if (NVME_CMBMSC_CMSE(data)) {
5893 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
5894 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
5895 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5896 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
5897 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
5898 stl_le_p(&n->bar.cmbsts, cmbsts);
5899 return;
5900 }
5901
5902 n->cmb.cba = cba;
5903 n->cmb.cmse = true;
5904 }
5905 } else {
5906 n->bar.cmbsz = 0;
5907 n->bar.cmbloc = 0;
5908 }
5909
5910 return;
5911 case NVME_REG_CMBMSC + 4:
5912 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
5913 return;
5914
5915 case NVME_REG_PMRCAP:
5916 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5917 "invalid write to PMRCAP register, ignored");
5918 return;
5919 case NVME_REG_PMRCTL:
5920 if (!NVME_CAP_PMRS(cap)) {
5921 return;
5922 }
5923
5924 stl_le_p(&n->bar.pmrctl, data);
5925 if (NVME_PMRCTL_EN(data)) {
5926 memory_region_set_enabled(&n->pmr.dev->mr, true);
5927 pmrsts = 0;
5928 } else {
5929 memory_region_set_enabled(&n->pmr.dev->mr, false);
5930 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
5931 n->pmr.cmse = false;
5932 }
5933 stl_le_p(&n->bar.pmrsts, pmrsts);
5934 return;
5935 case NVME_REG_PMRSTS:
5936 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5937 "invalid write to PMRSTS register, ignored");
5938 return;
5939 case NVME_REG_PMREBS:
5940 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5941 "invalid write to PMREBS register, ignored");
5942 return;
5943 case NVME_REG_PMRSWTP:
5944 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5945 "invalid write to PMRSWTP register, ignored");
5946 return;
5947 case NVME_REG_PMRMSCL:
5948 if (!NVME_CAP_PMRS(cap)) {
5949 return;
5950 }
5951
5952 stl_le_p(&n->bar.pmrmscl, data);
5953 n->pmr.cmse = false;
5954
5955 if (NVME_PMRMSCL_CMSE(data)) {
5956 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
5957 hwaddr cba = pmrmscu << 32 |
5958 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
5959 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5960 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
5961 stl_le_p(&n->bar.pmrsts, pmrsts);
5962 return;
5963 }
5964
5965 n->pmr.cmse = true;
5966 n->pmr.cba = cba;
5967 }
5968
5969 return;
5970 case NVME_REG_PMRMSCU:
5971 if (!NVME_CAP_PMRS(cap)) {
5972 return;
5973 }
5974
5975 stl_le_p(&n->bar.pmrmscu, data);
5976 return;
5977 default:
5978 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5979 "invalid MMIO write,"
5980 " offset=0x%"PRIx64", data=%"PRIx64"",
5981 offset, data);
5982 break;
5983 }
5984}
5985
5986static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5987{
5988 NvmeCtrl *n = (NvmeCtrl *)opaque;
5989 uint8_t *ptr = (uint8_t *)&n->bar;
5990
5991 trace_pci_nvme_mmio_read(addr, size);
5992
5993 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5994 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5995 "MMIO read not 32-bit aligned,"
5996 " offset=0x%"PRIx64"", addr);
5997
5998 } else if (unlikely(size < sizeof(uint32_t))) {
5999 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
6000 "MMIO read smaller than 32-bits,"
6001 " offset=0x%"PRIx64"", addr);
6002
6003 }
6004
6005 if (addr > sizeof(n->bar) - size) {
6006 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6007 "MMIO read beyond last register,"
6008 " offset=0x%"PRIx64", returning 0", addr);
6009
6010 return 0;
6011 }
6012
    /*
     * When PMRWBM bit 1 is set, a read from PMRSTS must ensure that prior
     * writes have made it to persistent media.
     */
6018 if (addr == NVME_REG_PMRSTS &&
6019 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6020 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6021 }
6022
6023 return ldn_le_p(ptr + addr, size);
6024}
6025
6026static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6027{
6028 uint32_t qid;
6029
6030 if (unlikely(addr & ((1 << 2) - 1))) {
6031 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6032 "doorbell write not 32-bit aligned,"
6033 " offset=0x%"PRIx64", ignoring", addr);
6034 return;
6035 }
6036
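    /*
     * With CAP.DSTRD zero, doorbells are packed 4 bytes apart from offset
     * 0x1000: the tail doorbell for SQ y sits at 0x1000 + 8 * y and the
     * head doorbell for CQ y at 0x1000 + 8 * y + 4.
     */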
6037 if (((addr - 0x1000) >> 2) & 1) {
        /* Completion queue doorbell write */

6040 uint16_t new_head = val & 0xffff;
6041 int start_sqs;
6042 NvmeCQueue *cq;
6043
6044 qid = (addr - (0x1000 + (1 << 2))) >> 3;
6045 if (unlikely(nvme_check_cqid(n, qid))) {
6046 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6047 "completion queue doorbell write"
6048 " for nonexistent queue,"
                           " cqid=%"PRIu32", ignoring", qid);
6050
            /*
             * NVM Express v1.3d, Section 4.1 states: "If host software
             * writes an invalid value to the Submission Queue Tail Doorbell
             * or Completion Queue Head Doorbell register and an Asynchronous
             * Event Request command is outstanding, then an asynchronous
             * event is posted to the Admin Completion Queue with a status
             * code of Invalid Doorbell Write Value."
             */

6064 if (n->outstanding_aers) {
6065 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6066 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6067 NVME_LOG_ERROR_INFO);
6068 }
6069
6070 return;
6071 }
6072
6073 cq = n->cq[qid];
6074 if (unlikely(new_head >= cq->size)) {
6075 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6076 "completion queue doorbell write value"
                           " beyond queue size, cqid=%"PRIu32","
6078 " new_head=%"PRIu16", ignoring",
6079 qid, new_head);
6080
6081 if (n->outstanding_aers) {
6082 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6083 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6084 NVME_LOG_ERROR_INFO);
6085 }
6086
6087 return;
6088 }
6089
6090 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6091
6092 start_sqs = nvme_cq_full(cq) ? 1 : 0;
6093 cq->head = new_head;
6094 if (start_sqs) {
6095 NvmeSQueue *sq;
6096 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6097 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6098 }
6099 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6100 }
6101
6102 if (cq->tail == cq->head) {
6103 if (cq->irq_enabled) {
6104 n->cq_pending--;
6105 }
6106
6107 nvme_irq_deassert(n, cq);
6108 }
6109 } else {
        /* Submission queue doorbell write */

6112 uint16_t new_tail = val & 0xffff;
6113 NvmeSQueue *sq;
6114
6115 qid = (addr - 0x1000) >> 3;
6116 if (unlikely(nvme_check_sqid(n, qid))) {
6117 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6118 "submission queue doorbell write"
6119 " for nonexistent queue,"
6120 " sqid=%"PRIu32", ignoring", qid);
6121
6122 if (n->outstanding_aers) {
6123 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6124 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6125 NVME_LOG_ERROR_INFO);
6126 }
6127
6128 return;
6129 }
6130
6131 sq = n->sq[qid];
6132 if (unlikely(new_tail >= sq->size)) {
6133 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6134 "submission queue doorbell write value"
6135 " beyond queue size, sqid=%"PRIu32","
6136 " new_tail=%"PRIu16", ignoring",
6137 qid, new_tail);
6138
6139 if (n->outstanding_aers) {
6140 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6141 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6142 NVME_LOG_ERROR_INFO);
6143 }
6144
6145 return;
6146 }
6147
6148 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6149
6150 sq->tail = new_tail;
6151 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6152 }
6153}
6154
6155static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6156 unsigned size)
6157{
6158 NvmeCtrl *n = (NvmeCtrl *)opaque;
6159
6160 trace_pci_nvme_mmio_write(addr, data, size);
6161
6162 if (addr < sizeof(n->bar)) {
6163 nvme_write_bar(n, addr, data, size);
6164 } else {
6165 nvme_process_db(n, addr, data);
6166 }
6167}
6168
6169static const MemoryRegionOps nvme_mmio_ops = {
6170 .read = nvme_mmio_read,
6171 .write = nvme_mmio_write,
6172 .endianness = DEVICE_LITTLE_ENDIAN,
6173 .impl = {
6174 .min_access_size = 2,
6175 .max_access_size = 8,
6176 },
6177};
6178
6179static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
6180 unsigned size)
6181{
6182 NvmeCtrl *n = (NvmeCtrl *)opaque;
6183 stn_le_p(&n->cmb.buf[addr], size, data);
6184}
6185
6186static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
6187{
6188 NvmeCtrl *n = (NvmeCtrl *)opaque;
6189 return ldn_le_p(&n->cmb.buf[addr], size);
6190}
6191
6192static const MemoryRegionOps nvme_cmb_ops = {
6193 .read = nvme_cmb_read,
6194 .write = nvme_cmb_write,
6195 .endianness = DEVICE_LITTLE_ENDIAN,
6196 .impl = {
6197 .min_access_size = 1,
6198 .max_access_size = 8,
6199 },
6200};
6201
6202static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
6203{
6204 NvmeParams *params = &n->params;
6205
6206 if (params->num_queues) {
6207 warn_report("num_queues is deprecated; please use max_ioqpairs "
6208 "instead");
6209
6210 params->max_ioqpairs = params->num_queues - 1;
6211 }
6212
6213 if (n->namespace.blkconf.blk && n->subsys) {
6214 error_setg(errp, "subsystem support is unavailable with legacy "
6215 "namespace ('drive' property)");
6216 return;
6217 }
6218
6219 if (params->max_ioqpairs < 1 ||
6220 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
6221 error_setg(errp, "max_ioqpairs must be between 1 and %d",
6222 NVME_MAX_IOQPAIRS);
6223 return;
6224 }
6225
6226 if (params->msix_qsize < 1 ||
6227 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
6228 error_setg(errp, "msix_qsize must be between 1 and %d",
6229 PCI_MSIX_FLAGS_QSIZE + 1);
6230 return;
6231 }
6232
6233 if (!params->serial) {
6234 error_setg(errp, "serial property not set");
6235 return;
6236 }
6237
6238 if (n->pmr.dev) {
6239 if (host_memory_backend_is_mapped(n->pmr.dev)) {
6240 error_setg(errp, "can't use already busy memdev: %s",
6241 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
6242 return;
6243 }
6244
6245 if (!is_power_of_2(n->pmr.dev->size)) {
6246 error_setg(errp, "pmr backend size needs to be power of 2 in size");
6247 return;
6248 }
6249
6250 host_memory_backend_set_mapped(n->pmr.dev, true);
6251 }
6252
6253 if (n->params.zasl > n->params.mdts) {
6254 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
6255 "than or equal to mdts (Maximum Data Transfer Size)");
6256 return;
6257 }
6258
6259 if (!n->params.vsl) {
6260 error_setg(errp, "vsl must be non-zero");
6261 return;
6262 }
6263}
6264
6265static void nvme_init_state(NvmeCtrl *n)
6266{
    /* add one to max_ioqpairs to account for the admin queue pair */
6268 n->reg_size = pow2ceil(sizeof(NvmeBar) +
6269 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
6270 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
6271 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
6272 n->temperature = NVME_TEMPERATURE;
6273 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
6274 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6275 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
6276}
6277
6278static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
6279{
6280 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
6281 uint64_t cap = ldq_le_p(&n->bar.cap);
6282
6283 n->cmb.buf = g_malloc0(cmb_size);
6284 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
6285 "nvme-cmb", cmb_size);
6286 pci_register_bar(pci_dev, NVME_CMB_BIR,
6287 PCI_BASE_ADDRESS_SPACE_MEMORY |
6288 PCI_BASE_ADDRESS_MEM_TYPE_64 |
6289 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
6290
6291 NVME_CAP_SET_CMBS(cap, 1);
6292 stq_le_p(&n->bar.cap, cap);
6293
6294 if (n->params.legacy_cmb) {
6295 nvme_cmb_enable_regs(n);
6296 n->cmb.cmse = true;
6297 }
6298}
6299
6300static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
6301{
6302 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
6303
6304 NVME_PMRCAP_SET_RDS(pmrcap, 1);
6305 NVME_PMRCAP_SET_WDS(pmrcap, 1);
6306 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
6307
6308 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
6309 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
6310 stl_le_p(&n->bar.pmrcap, pmrcap);
6311
6312 pci_register_bar(pci_dev, NVME_PMR_BIR,
6313 PCI_BASE_ADDRESS_SPACE_MEMORY |
6314 PCI_BASE_ADDRESS_MEM_TYPE_64 |
6315 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
6316
6317 memory_region_set_enabled(&n->pmr.dev->mr, false);
6318}
6319
6320static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
6321{
6322 uint8_t *pci_conf = pci_dev->config;
6323 uint64_t bar_size, msix_table_size, msix_pba_size;
6324 unsigned msix_table_offset, msix_pba_offset;
6325 int ret;
6326
6327 Error *err = NULL;
6328
6329 pci_conf[PCI_INTERRUPT_PIN] = 1;
6330 pci_config_set_prog_interface(pci_conf, 0x2);
6331
6332 if (n->params.use_intel_id) {
6333 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6334 pci_config_set_device_id(pci_conf, 0x5845);
6335 } else {
6336 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6337 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6338 }
6339
6340 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6341 pcie_endpoint_cap_init(pci_dev, 0x80);
6342
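    /*
     * The NVMe register set, the MSI-X table and the MSI-X PBA share BAR0;
     * the table and PBA are placed after the registers, each aligned to
     * 4 KiB, and the final BAR size is rounded up to a power of two.
     */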
6343 bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6344 msix_table_offset = bar_size;
6345 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6346
6347 bar_size += msix_table_size;
6348 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6349 msix_pba_offset = bar_size;
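    /* one pending bit per vector, rounded up to a whole number of QWORDs */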
6350 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6351
6352 bar_size += msix_pba_size;
6353 bar_size = pow2ceil(bar_size);
6354
6355 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6356 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6357 n->reg_size);
6358 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6359
6360 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6361 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6362 ret = msix_init(pci_dev, n->params.msix_qsize,
6363 &n->bar0, 0, msix_table_offset,
6364 &n->bar0, 0, msix_pba_offset, 0, &err);
6365 if (ret < 0) {
6366 if (ret == -ENOTSUP) {
6367 warn_report_err(err);
6368 } else {
6369 error_propagate(errp, err);
6370 return ret;
6371 }
6372 }
6373
6374 if (n->params.cmb_size_mb) {
6375 nvme_init_cmb(n, pci_dev);
6376 }
6377
6378 if (n->pmr.dev) {
6379 nvme_init_pmr(n, pci_dev);
6380 }
6381
6382 return 0;
6383}
6384
6385static void nvme_init_subnqn(NvmeCtrl *n)
6386{
6387 NvmeSubsystem *subsys = n->subsys;
6388 NvmeIdCtrl *id = &n->id_ctrl;
6389
6390 if (!subsys) {
6391 snprintf((char *)id->subnqn, sizeof(id->subnqn),
6392 "nqn.2019-08.org.qemu:%s", n->params.serial);
6393 } else {
        pstrcpy((char *)id->subnqn, sizeof(id->subnqn),
                (char *)subsys->subnqn);
6395 }
6396}
6397
6398static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6399{
6400 NvmeIdCtrl *id = &n->id_ctrl;
6401 uint8_t *pci_conf = pci_dev->config;
6402 uint64_t cap = ldq_le_p(&n->bar.cap);
6403
6404 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6405 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6406 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6407 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6408 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6409
6410 id->cntlid = cpu_to_le16(n->cntlid);
6411
6412 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6413
6414 id->rab = 6;
6415
6416 if (n->params.use_intel_id) {
6417 id->ieee[0] = 0xb3;
6418 id->ieee[1] = 0x02;
6419 id->ieee[2] = 0x00;
6420 } else {
6421 id->ieee[0] = 0x00;
6422 id->ieee[1] = 0x54;
6423 id->ieee[2] = 0x52;
6424 }
6425
6426 id->mdts = n->params.mdts;
6427 id->ver = cpu_to_le32(NVME_SPEC_VER);
6428 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6429 id->cntrltype = 0x1;
6430
    /*
     * Because the controller always completes the Abort command immediately,
     * there can never be more than one concurrently executing Abort command,
     * so this value is never used for anything. Report the spec-recommended
     * minimum of 4 concurrent Abort commands (ACL is a 0's based value).
     */
6442 id->acl = 3;
6443 id->aerl = n->params.aerl;
6444 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6445 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6446
    /* recommended default value (~70 degrees Celsius) */
6448 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6449 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6450
6451 id->sqes = (0x6 << 4) | 0x6;
6452 id->cqes = (0x4 << 4) | 0x4;
6453 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
6454 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6455 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6456 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6457
    /*
     * NOTE: If this device ever supports a command set that does NOT use
     * 0x0 as a Flush-equivalent operation, support for the broadcast NSID
     * in Flush should probably be removed.
     *
     * See comment in nvme_io_cmd.
     */
6465 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6466
6467 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6468 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6469 NVME_CTRL_SGLS_BITBUCKET);
6470
6471 nvme_init_subnqn(n);
6472
6473 id->psd[0].mp = cpu_to_le16(0x9c4);
6474 id->psd[0].enlat = cpu_to_le32(0x10);
6475 id->psd[0].exlat = cpu_to_le32(0x4);
6476
6477 if (n->subsys) {
6478 id->cmic |= NVME_CMIC_MULTI_CTRL;
6479 }
6480
6481 NVME_CAP_SET_MQES(cap, 0x7ff);
6482 NVME_CAP_SET_CQR(cap, 1);
6483 NVME_CAP_SET_TO(cap, 0xf);
6484 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
6485 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
6486 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
6487 NVME_CAP_SET_MPSMAX(cap, 4);
6488 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
6489 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
6490 stq_le_p(&n->bar.cap, cap);
6491
6492 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
6493 n->bar.intmc = n->bar.intms = 0;
6494}
6495
6496static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6497{
6498 int cntlid;
6499
6500 if (!n->subsys) {
6501 return 0;
6502 }
6503
6504 cntlid = nvme_subsys_register_ctrl(n, errp);
6505 if (cntlid < 0) {
6506 return -1;
6507 }
6508
6509 n->cntlid = cntlid;
6510
6511 return 0;
6512}
6513
6514void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6515{
6516 uint32_t nsid = ns->params.nsid;
6517 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6518
6519 n->namespaces[nsid] = ns;
6520 ns->attached++;
6521
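    /* the new namespace may lower the controller-wide DMRSL */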
6522 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6523 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6524}
6525
6526static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6527{
6528 NvmeCtrl *n = NVME(pci_dev);
6529 NvmeNamespace *ns;
6530 Error *local_err = NULL;
6531
6532 nvme_check_constraints(n, &local_err);
6533 if (local_err) {
6534 error_propagate(errp, local_err);
6535 return;
6536 }
6537
6538 qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6539 &pci_dev->qdev, n->parent_obj.qdev.id);
6540
6541 nvme_init_state(n);
6542 if (nvme_init_pci(n, pci_dev, errp)) {
6543 return;
6544 }
6545
    if (nvme_init_subsys(n, errp)) {
        return;
    }
6550 nvme_init_ctrl(n, pci_dev);
6551
    /* set up a namespace if the controller 'drive' property was given */
6553 if (n->namespace.blkconf.blk) {
6554 ns = &n->namespace;
6555 ns->params.nsid = 1;
6556
6557 if (nvme_ns_setup(ns, errp)) {
6558 return;
6559 }
6560
6561 nvme_attach_ns(n, ns);
6562 }
6563}
6564
6565static void nvme_exit(PCIDevice *pci_dev)
6566{
6567 NvmeCtrl *n = NVME(pci_dev);
6568 NvmeNamespace *ns;
6569 int i;
6570
6571 nvme_ctrl_reset(n);
6572
6573 if (n->subsys) {
6574 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6575 ns = nvme_ns(n, i);
6576 if (ns) {
6577 ns->attached--;
6578 }
6579 }
6580
6581 nvme_subsys_unregister_ctrl(n->subsys, n);
6582 }
6583
6584 g_free(n->cq);
6585 g_free(n->sq);
6586 g_free(n->aer_reqs);
6587
6588 if (n->params.cmb_size_mb) {
6589 g_free(n->cmb.buf);
6590 }
6591
6592 if (n->pmr.dev) {
6593 host_memory_backend_set_mapped(n->pmr.dev, false);
6594 }
6595 msix_uninit(pci_dev, &n->bar0, &n->bar0);
6596 memory_region_del_subregion(&n->bar0, &n->iomem);
6597}
6598
6599static Property nvme_props[] = {
6600 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6601 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6602 HostMemoryBackend *),
6603 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6604 NvmeSubsystem *),
6605 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6606 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6607 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6608 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6609 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6610 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6611 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6612 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6613 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6614 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6615 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6616 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6617 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
6618 params.auto_transition_zones, true),
6619 DEFINE_PROP_END_OF_LIST(),
6620};
6621
6622static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6623 void *opaque, Error **errp)
6624{
6625 NvmeCtrl *n = NVME(obj);
6626 uint8_t value = n->smart_critical_warning;
6627
6628 visit_type_uint8(v, name, &value, errp);
6629}
6630
6631static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6632 void *opaque, Error **errp)
6633{
6634 NvmeCtrl *n = NVME(obj);
6635 uint8_t value, old_value, cap = 0, index, event;
6636
6637 if (!visit_type_uint8(v, name, &value, errp)) {
6638 return;
6639 }
6640
6641 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6642 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6643 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
6644 cap |= NVME_SMART_PMR_UNRELIABLE;
6645 }
6646
6647 if ((value & cap) != value) {
6648 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6649 value & ~cap);
6650 return;
6651 }
6652
6653 old_value = n->smart_critical_warning;
6654 n->smart_critical_warning = value;
6655
    /* only inject new bits of smart critical warning */
6657 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6658 event = 1 << index;
        if (value & ~old_value & event) {
            nvme_smart_event(n, event);
        }
6661 }
6662}
6663
6664static const VMStateDescription nvme_vmstate = {
6665 .name = "nvme",
6666 .unmigratable = 1,
6667};
6668
6669static void nvme_class_init(ObjectClass *oc, void *data)
6670{
6671 DeviceClass *dc = DEVICE_CLASS(oc);
6672 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6673
6674 pc->realize = nvme_realize;
6675 pc->exit = nvme_exit;
6676 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6677 pc->revision = 2;
6678
6679 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6680 dc->desc = "Non-Volatile Memory Express";
6681 device_class_set_props(dc, nvme_props);
6682 dc->vmsd = &nvme_vmstate;
6683}
6684
6685static void nvme_instance_init(Object *obj)
6686{
6687 NvmeCtrl *n = NVME(obj);
6688
6689 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6690 "bootindex", "/namespace@1,0",
6691 DEVICE(obj));
6692
6693 object_property_add(obj, "smart_critical_warning", "uint8",
6694 nvme_get_smart_warning,
6695 nvme_set_smart_warning, NULL, NULL);
6696}
6697
6698static const TypeInfo nvme_info = {
6699 .name = TYPE_NVME,
6700 .parent = TYPE_PCI_DEVICE,
6701 .instance_size = sizeof(NvmeCtrl),
6702 .instance_init = nvme_instance_init,
6703 .class_init = nvme_class_init,
6704 .interfaces = (InterfaceInfo[]) {
6705 { INTERFACE_PCIE_DEVICE },
6706 { }
6707 },
6708};
6709
6710static const TypeInfo nvme_bus_info = {
6711 .name = TYPE_NVME_BUS,
6712 .parent = TYPE_BUS,
6713 .instance_size = sizeof(NvmeBus),
6714};
6715
6716static void nvme_register_types(void)
6717{
6718 type_register_static(&nvme_info);
6719 type_register_static(&nvme_bus_info);
6720}
6721
6722type_init(nvme_register_types)
6723