/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

#include "qemu/osdep.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include "hw/block/block.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "hw/qdev-properties.h"
#include "migration/vmstate.h"
#include "sysemu/sysemu.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/hostmem.h"
#include "sysemu/block-backend.h"
#include "exec/memory.h"
#include "qemu/log.h"
#include "qemu/module.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "nvme.h"
#include "nvme-ns.h"
#include "nvme-dif.h"

#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1

#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)

static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION]              = true,
    [NVME_POWER_MANAGEMENT]         = true,
    [NVME_TEMPERATURE_THRESHOLD]    = true,
    [NVME_ERROR_RECOVERY]           = true,
    [NVME_VOLATILE_WRITE_CACHE]     = true,
    [NVME_NUMBER_OF_QUEUES]         = true,
    [NVME_INTERRUPT_COALESCING]     = true,
    [NVME_INTERRUPT_VECTOR_CONF]    = true,
    [NVME_WRITE_ATOMICITY]          = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
    [NVME_TIMESTAMP]                = true,
};

static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
};

static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};

static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
};

static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
};

static void nvme_process_sq(void *opaque);

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);
}
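
/*
 * Move a zone onto the QTAILQ that tracks its new state. A zone currently on
 * one of the exp_open/imp_open/closed/full lists is unlinked first; zones
 * entering the Full or Read Only states keep their zone attributes, all
 * other target states clear them (za = 0).
 */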
static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
{
    if (QTAILQ_IN_USE(zone, entry)) {
        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_CLOSED:
            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_FULL:
            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
        default:
            ;
        }
    }

    nvme_set_zone_state(zone, state);

    switch (state) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_CLOSED:
        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_FULL:
        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
    case NVME_ZONE_STATE_READ_ONLY:
        break;
    default:
        zone->d.za = 0;
    }
}

/*
 * Check if we can open a zone without exceeding open/active limits.
 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
 */
static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
{
    if (ns->params.max_active_zones != 0 &&
        ns->nr_active_zones + act > ns->params.max_active_zones) {
        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    }
    if (ns->params.max_open_zones != 0 &&
        ns->nr_open_zones + opn > ns->params.max_open_zones) {
        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    if (!n->cmb.cmse) {
        return false;
    }

    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    hi = lo + int128_get64(n->cmb.mem.size);

    return addr >= lo && addr < hi;
}

static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    return &n->cmb.buf[addr - base];
}

static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi;

    if (!n->pmr.cmse) {
        return false;
    }

    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);

    return addr >= n->pmr.cba && addr < hi;
}

static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
{
    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}

static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
        return 0;
    }

    return pci_dma_read(&n->parent_obj, addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(&n->parent_obj, addr, buf, size);
}

static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid && (nsid == NVME_NSID_BROADCAST || nsid <= n->num_namespaces);
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}
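
/*
 * Update the pin-based (INTx) interrupt. This is a no-op when MSI-X is in
 * use; otherwise the line is asserted while any per-queue interrupt status
 * bit that is not masked via INTMS remains set.
 */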
static void nvme_irq_check(NvmeCtrl *n)
{
    if (msix_enabled(&(n->parent_obj))) {
        return;
    }
    if (~n->bar.intms & n->irq_status) {
        pci_irq_assert(&n->parent_obj);
    } else {
        pci_irq_deassert(&n->parent_obj);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            trace_pci_nvme_irq_msix(cq->vector);
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            trace_pci_nvme_irq_pin();
            assert(cq->vector < 32);
            n->irq_status |= 1 << cq->vector;
            nvme_irq_check(n);
        }
    } else {
        trace_pci_nvme_irq_masked();
    }
}

static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            return;
        } else {
            assert(cq->vector < 32);
            if (!n->cq_pending) {
                n->irq_status &= ~(1 << cq->vector);
            }
            nvme_irq_check(n);
        }
    }
}

static void nvme_req_clear(NvmeRequest *req)
{
    req->ns = NULL;
    req->opaque = NULL;
    req->aiocb = NULL;
    memset(&req->cqe, 0x0, sizeof(req->cqe));
    req->status = NVME_SUCCESS;
}

static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
    if (dma) {
        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
        sg->flags = NVME_SG_DMA;
    } else {
        qemu_iovec_init(&sg->iov, 0);
    }

    sg->flags |= NVME_SG_ALLOC;
}

static inline void nvme_sg_unmap(NvmeSg *sg)
{
    if (!(sg->flags & NVME_SG_ALLOC)) {
        return;
    }

    if (sg->flags & NVME_SG_DMA) {
        qemu_sglist_destroy(&sg->qsg);
    } else {
        qemu_iovec_destroy(&sg->iov);
    }

    memset(sg, 0x0, sizeof(*sg));
}

/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
 */
static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
                          NvmeSg *mdata)
{
    NvmeSg *dst = data;
    size_t size = nvme_lsize(ns);
    size_t msize = nvme_msize(ns);
    uint32_t trans_len, count = size;
    uint64_t offset = 0;
    bool dma = sg->flags & NVME_SG_DMA;
    size_t sge_len;
    size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
    int sg_idx = 0;

    assert(sg->flags & NVME_SG_ALLOC);

    while (sg_len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        trans_len = MIN(sg_len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dst) {
            if (dma) {
                qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
                                trans_len);
            } else {
                qemu_iovec_add(&dst->iov,
                               sg->iov.iov[sg_idx].iov_base + offset,
                               trans_len);
            }
        }

        sg_len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            dst = (dst == data) ? mdata : data;
            count = (dst == data) ? size : msize;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
        }
    }
}

static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr_cmb(addr, len);

    if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
                                  size_t len)
{
    if (!len) {
        return NVME_SUCCESS;
    }

    if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
        return NVME_DATA_TRAS_ERROR;
    }

    qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);

    return NVME_SUCCESS;
}

static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
{
    bool cmb = false, pmr = false;

    if (!len) {
        return NVME_SUCCESS;
    }

    trace_pci_nvme_map_addr(addr, len);

    if (nvme_addr_is_cmb(n, addr)) {
        cmb = true;
    } else if (nvme_addr_is_pmr(n, addr)) {
        pmr = true;
    }

    if (cmb || pmr) {
        if (sg->flags & NVME_SG_DMA) {
            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
        }

        if (cmb) {
            return nvme_map_addr_cmb(n, &sg->iov, addr, len);
        } else {
            return nvme_map_addr_pmr(n, &sg->iov, addr, len);
        }
    }

    if (!(sg->flags & NVME_SG_DMA)) {
        return NVME_INVALID_USE_OF_CMB | NVME_DNR;
    }

    qemu_sglist_add(&sg->qsg, addr, len);

    return NVME_SUCCESS;
}

static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
{
    return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
}
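
/*
 * Map a PRP1/PRP2 pair into `sg`. Depending on the transfer length, PRP2 is
 * either a second data page or the address of a PRP list; list entries must
 * be page aligned, and the last entry of a full list may chain to a further
 * list page.
 */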
static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
                             uint64_t prp2, uint32_t len)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;
    uint16_t status;
    int ret;

    trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));

    status = nvme_map_addr(n, sg, prp1, trans_len);
    if (status) {
        goto unmap;
    }

    len -= trans_len;
    if (len) {
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            /*
             * The first PRP list entry, pointed to by PRP2, may contain an
             * offset. Hence, we need to calculate the number of entries in
             * the first page.
             */
            nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
            if (ret) {
                trace_pci_nvme_err_addr_read(prp2);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == nents - 1 && len > n->page_size) {
                    if (unlikely(prp_ent & (n->page_size - 1))) {
                        trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                        status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    nents = MIN(nents, n->max_prp_ents);
                    prp_trans = nents * sizeof(uint64_t);
                    ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
                                         prp_trans);
                    if (ret) {
                        trace_pci_nvme_err_addr_read(prp_ent);
                        status = NVME_DATA_TRAS_ERROR;
                        goto unmap;
                    }
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (unlikely(prp_ent & (n->page_size - 1))) {
                    trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
                    status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                status = nvme_map_addr(n, sg, prp_ent, trans_len);
                if (status) {
                    goto unmap;
                }

                len -= trans_len;
                i++;
            }
        } else {
            if (unlikely(prp2 & (n->page_size - 1))) {
                trace_pci_nvme_err_invalid_prp2_align(prp2);
                status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
                goto unmap;
            }
            status = nvme_map_addr(n, sg, prp2, len);
            if (status) {
                goto unmap;
            }
        }
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}

/*
 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
 * number of bytes mapped in len.
 */
static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
                                  NvmeSglDescriptor *segment, uint64_t nsgld,
                                  size_t *len, NvmeCmd *cmd)
{
    dma_addr_t addr, trans_len;
    uint32_t dlen;
    uint16_t status;

    for (int i = 0; i < nsgld; i++) {
        uint8_t type = NVME_SGL_TYPE(segment[i].type);

        switch (type) {
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            if (cmd->opcode == NVME_CMD_WRITE) {
                continue;
            }
            /* fallthrough */
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
            break;
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
        default:
            return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
        }

        dlen = le32_to_cpu(segment[i].len);

        if (!dlen) {
            continue;
        }

        if (*len == 0) {
            /*
             * All data has been mapped, but the SGL contains additional
             * segments and/or descriptors. The controller might accept
             * ignoring the rest of the SGL.
             */
            uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
                break;
            }

            trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        trans_len = MIN(*len, dlen);

        if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
            goto next;
        }

        addr = le64_to_cpu(segment[i].addr);

        if (UINT64_MAX - addr < dlen) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        status = nvme_map_addr(n, sg, addr, trans_len);
        if (status) {
            return status;
        }

next:
        *len -= trans_len;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
                             size_t len, NvmeCmd *cmd)
{
    /*
     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
     * dynamically allocating a potentially huge SGL. The spec allows the SGL
     * to be larger (as in number of bytes required to describe the SGL
     * descriptors and segment chain) than the command transfer size, so it is
     * not bounded by MDTS.
     */
    const int SEG_CHUNK_SIZE = 256;

    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
    uint64_t nsgld;
    uint32_t seg_len;
    uint16_t status;
    hwaddr addr;
    int ret;

    sgld = &sgl;
    addr = le64_to_cpu(sgl.addr);

    trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));

    /*
     * If the entire transfer can be described with a single data block it can
     * be mapped directly.
     */
    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
        status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
        if (status) {
            goto unmap;
        }

        goto out;
    }

    for (;;) {
        switch (NVME_SGL_TYPE(sgld->type)) {
        case NVME_SGL_DESCR_TYPE_SEGMENT:
        case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
            break;
        default:
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        seg_len = le32_to_cpu(sgld->len);

        /* check the length of the (Last) Segment descriptor */
        if ((!seg_len || seg_len & 0xf) &&
            (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
        }

        if (UINT64_MAX - addr < seg_len) {
            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        }

        nsgld = seg_len / sizeof(NvmeSglDescriptor);

        while (nsgld > SEG_CHUNK_SIZE) {
            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
                trace_pci_nvme_err_addr_read(addr);
                status = NVME_DATA_TRAS_ERROR;
                goto unmap;
            }

            status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
                                       &len, cmd);
            if (status) {
                goto unmap;
            }

            nsgld -= SEG_CHUNK_SIZE;
            addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
        }

        ret = nvme_addr_read(n, addr, segment, nsgld *
                             sizeof(NvmeSglDescriptor));
        if (ret) {
            trace_pci_nvme_err_addr_read(addr);
            status = NVME_DATA_TRAS_ERROR;
            goto unmap;
        }

        last_sgld = &segment[nsgld - 1];

        /*
         * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
         * then we are done.
         */
        switch (NVME_SGL_TYPE(last_sgld->type)) {
        case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
        case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
            status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
            if (status) {
                goto unmap;
            }

            goto out;

        default:
            break;
        }

        /*
         * If the last descriptor was not a Data Block or Bit Bucket, then the
         * current segment must not be a Last Segment.
         */
        if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
            status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
            goto unmap;
        }

        sgld = last_sgld;
        addr = le64_to_cpu(sgld->addr);

        /*
         * Do not map the last descriptor; it will be a Segment or Last Segment
         * descriptor and is handled by the next iteration.
         */
        status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
        if (status) {
            goto unmap;
        }
    }

out:
    /* if there is any residual left in len, the SGL was too short */
    if (len) {
        status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
        goto unmap;
    }

    return NVME_SUCCESS;

unmap:
    nvme_sg_unmap(sg);
    return status;
}
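
/*
 * Map the command data pointer (DPTR) into `sg`, dispatching on the PSDT
 * field: PRP1/PRP2 or one of the two SGL modes.
 */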
uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                       NvmeCmd *cmd)
{
    uint64_t prp1, prp2;

    switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
    case NVME_PSDT_PRP:
        prp1 = le64_to_cpu(cmd->dptr.prp1);
        prp2 = le64_to_cpu(cmd->dptr.prp2);

        return nvme_map_prp(n, sg, prp1, prp2, len);
    case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
    case NVME_PSDT_SGL_MPTR_SGL:
        return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
    default:
        return NVME_INVALID_FIELD;
    }
}

static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
                              NvmeCmd *cmd)
{
    int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
    hwaddr mptr = le64_to_cpu(cmd->mptr);
    uint16_t status;

    if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
        NvmeSglDescriptor sgl;

        if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
            return NVME_DATA_TRAS_ERROR;
        }

        status = nvme_map_sgl(n, sg, sgl, len, cmd);
        if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
            status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
        }

        return status;
    }

    nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
    status = nvme_map_addr(n, sg, mptr, len);
    if (status) {
        nvme_sg_unmap(sg);
    }

    return status;
}

static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint16_t ctrl = le16_to_cpu(rw->control);
    size_t len = nvme_l2b(ns, nlb);
    uint16_t status;

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
        (ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
        goto out;
    }

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_m2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, &req->sg, NULL);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

out:
    return nvme_map_dptr(n, &req->sg, len, &req->cmd);
}

static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    size_t len = nvme_m2b(ns, nlb);
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        NvmeSg sg;

        len += nvme_l2b(ns, nlb);

        status = nvme_map_dptr(n, &sg, len, &req->cmd);
        if (status) {
            return status;
        }

        nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
        nvme_sg_split(&sg, ns, NULL, &req->sg);
        nvme_sg_unmap(&sg);

        return NVME_SUCCESS;
    }

    return nvme_map_mptr(n, &req->sg, len, &req->cmd);
}
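
/*
 * Transfer `len` bytes between `ptr` and an interleaved (extended LBA)
 * mapping: `bytes` are copied at a time, then `skip_bytes` are skipped,
 * starting at `offset` into the scatter/gather list. This is how the data or
 * metadata portion is picked out of a single extended-LBA buffer.
 */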
static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
                                    uint32_t len, uint32_t bytes,
                                    int32_t skip_bytes, int64_t offset,
                                    NvmeTxDirection dir)
{
    hwaddr addr;
    uint32_t trans_len, count = bytes;
    bool dma = sg->flags & NVME_SG_DMA;
    int64_t sge_len;
    int sg_idx = 0;
    int ret;

    assert(sg->flags & NVME_SG_ALLOC);

    while (len) {
        sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;

        if (sge_len - offset < 0) {
            offset -= sge_len;
            sg_idx++;
            continue;
        }

        if (sge_len == offset) {
            offset = 0;
            sg_idx++;
            continue;
        }

        trans_len = MIN(len, count);
        trans_len = MIN(trans_len, sge_len - offset);

        if (dma) {
            addr = sg->qsg.sg[sg_idx].base + offset;
        } else {
            addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
        }

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            ret = nvme_addr_read(n, addr, ptr, trans_len);
        } else {
            ret = nvme_addr_write(n, addr, ptr, trans_len);
        }

        if (ret) {
            return NVME_DATA_TRAS_ERROR;
        }

        ptr += trans_len;
        len -= trans_len;
        count -= trans_len;
        offset += trans_len;

        if (count == 0) {
            count = bytes;
            offset += skip_bytes;
        }
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
                        NvmeTxDirection dir)
{
    assert(sg->flags & NVME_SG_ALLOC);

    if (sg->flags & NVME_SG_DMA) {
        uint64_t residual;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            residual = dma_buf_write(ptr, len, &sg->qsg);
        } else {
            residual = dma_buf_read(ptr, len, &sg->qsg);
        }

        if (unlikely(residual)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    } else {
        size_t bytes;

        if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
            bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
        } else {
            bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
        }

        if (unlikely(bytes != len)) {
            trace_pci_nvme_err_invalid_dma();
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
}

static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                                NvmeRequest *req)
{
    uint16_t status;

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
}

uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                          NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint16_t ctrl = le16_to_cpu(rw->control);

    if (nvme_ns_ext(ns) &&
        !(ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
        size_t lsize = nvme_lsize(ns);
        size_t msize = nvme_msize(ns);

        return nvme_tx_interleaved(n, &req->sg, ptr, len, lsize, msize, 0,
                                   dir);
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
                           NvmeTxDirection dir, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        size_t lsize = nvme_lsize(ns);
        size_t msize = nvme_msize(ns);

        return nvme_tx_interleaved(n, &req->sg, ptr, len, msize, lsize, lsize,
                                   dir);
    }

    nvme_sg_unmap(&req->sg);

    status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    return nvme_tx(n, &req->sg, ptr, len, dir);
}

static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
                                 BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                  cb, req);
    } else {
        req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
    }
}

static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
                                  BlockCompletionFunc *cb, NvmeRequest *req)
{
    assert(req->sg.flags & NVME_SG_ALLOC);

    if (req->sg.flags & NVME_SG_DMA) {
        req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
                                   cb, req);
    } else {
        req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
    }
}
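
/*
 * Drain completed requests from the completion queue's req_list into the
 * guest-visible CQ ring. A failed CQE write marks the controller failed
 * (CSTS.CFS); the interrupt is (re)asserted when entries were posted.
 */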
static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;
    bool pending = cq->head != cq->tail;
    int ret;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
                            sizeof(req->cqe));
        if (ret) {
            trace_pci_nvme_err_addr_write(addr);
            trace_pci_nvme_err_cfs();
            n->bar.csts = NVME_CSTS_FAILED;
            break;
        }
        QTAILQ_REMOVE(&cq->req_list, req, entry);
        nvme_inc_cq_tail(cq);
        nvme_sg_unmap(&req->sg);
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    if (cq->tail != cq->head) {
        if (cq->irq_enabled && !pending) {
            n->cq_pending++;
        }

        nvme_irq_assert(n, cq);
    }
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
                                          req->status);

    if (req->status) {
        trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
                                      req->status, req->cmd.opcode);
    }

    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
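
/*
 * Match queued asynchronous events against outstanding AER commands. Events
 * of a type that is currently masked (a CQE for that type has been posted
 * but not yet cleared by the host) remain queued.
 */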
static void nvme_process_aers(void *opaque)
{
    NvmeCtrl *n = opaque;
    NvmeAsyncEvent *event, *next;

    trace_pci_nvme_process_aers(n->aer_queued);

    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
        NvmeRequest *req;
        NvmeAerResult *result;

        /* can't post cqe if there is nothing to complete */
        if (!n->outstanding_aers) {
            trace_pci_nvme_no_outstanding_aers();
            break;
        }

        /* ignore if masked (cqe posted, but event not cleared) */
        if (n->aer_mask & (1 << event->result.event_type)) {
            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
            continue;
        }

        QTAILQ_REMOVE(&n->aer_queue, event, entry);
        n->aer_queued--;

        n->aer_mask |= 1 << event->result.event_type;
        n->outstanding_aers--;

        req = n->aer_reqs[n->outstanding_aers];

        result = (NvmeAerResult *) &req->cqe.result;
        result->event_type = event->result.event_type;
        result->event_info = event->result.event_info;
        result->log_page = event->result.log_page;
        g_free(event);

        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
                                    result->log_page);

        nvme_enqueue_req_completion(&n->admin_cq, req);
    }
}

static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
                               uint8_t event_info, uint8_t log_page)
{
    NvmeAsyncEvent *event;

    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);

    if (n->aer_queued == n->params.aer_max_queued) {
        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
        return;
    }

    event = g_new(NvmeAsyncEvent, 1);
    event->result = (NvmeAerResult) {
        .event_type = event_type,
        .event_info = event_info,
        .log_page = log_page,
    };

    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
    n->aer_queued++;

    nvme_process_aers(n);
}

static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
{
    uint8_t aer_info;

    /* do not post the event if it is disabled in the async event config */
    if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
        return;
    }

    switch (event) {
    case NVME_SMART_SPARE:
        aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
        break;
    case NVME_SMART_TEMPERATURE:
        aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
        break;
    case NVME_SMART_RELIABILITY:
    case NVME_SMART_MEDIA_READ_ONLY:
    case NVME_SMART_FAILED_VOLATILE_MEDIA:
    case NVME_SMART_PMR_UNRELIABLE:
        aer_info = NVME_AER_INFO_SMART_RELIABILITY;
        break;
    default:
        return;
    }

    nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
}

static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
{
    n->aer_mask &= ~(1 << event_type);
    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }
}

static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
{
    uint8_t mdts = n->params.mdts;

    if (mdts && len > n->page_size << mdts) {
        trace_pci_nvme_err_mdts(len);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}

static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
                                         uint32_t nlb)
{
    uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);

    if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
        return NVME_LBA_RANGE | NVME_DNR;
    }

    return NVME_SUCCESS;
}
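
/*
 * Check if the given LBA range contains any blocks that are deallocated
 * (reported as zeroed/unallocated by the block layer) and return NVME_DULB
 * if so; used to signal the Deallocated or Unwritten Logical Block error
 * when the host has enabled it.
 */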
static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
                                 uint32_t nlb)
{
    BlockDriverState *bs = blk_bs(ns->blkconf.blk);

    int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    bool zeroed;
    int ret;

    Error *local_err = NULL;

    /*
     * `pnum` holds the number of bytes after offset that share the same
     * allocation status as the byte at offset. If `pnum` is different from
     * `bytes`, we should check the allocation status of the next range and
     * repeat until `pnum` equals `bytes`.
     */
    do {
        bytes -= pnum;

        ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
        if (ret < 0) {
            error_setg_errno(&local_err, -ret, "unable to get block status");
            error_report_err(local_err);

            return NVME_INTERNAL_DEV_ERROR;
        }

        zeroed = !!(ret & BDRV_BLOCK_ZERO);

        trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);

        if (zeroed) {
            return NVME_DULB;
        }

        offset += pnum;
    } while (pnum != bytes);

    return NVME_SUCCESS;
}

static void nvme_aio_err(NvmeRequest *req, int ret)
{
    uint16_t status = NVME_SUCCESS;
    Error *local_err = NULL;

    switch (req->cmd.opcode) {
    case NVME_CMD_READ:
        status = NVME_UNRECOVERED_READ;
        break;
    case NVME_CMD_FLUSH:
    case NVME_CMD_WRITE:
    case NVME_CMD_WRITE_ZEROES:
    case NVME_CMD_ZONE_APPEND:
        status = NVME_WRITE_FAULT;
        break;
    default:
        status = NVME_INTERNAL_DEV_ERROR;
        break;
    }

    trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);

    error_setg_errno(&local_err, -ret, "aio failed");
    error_report_err(local_err);

    /*
     * Set the command status code to the first encountered error but allow a
     * subsequent Internal Device Error to trump it.
     */
    if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
        return;
    }

    req->status = status;
}

static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
{
    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
                                    slba / ns->zone_size;
}

static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
{
    uint32_t zone_idx = nvme_zone_idx(ns, slba);

    assert(zone_idx < ns->num_zones);
    return &ns->zone_array[zone_idx];
}

static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
{
    uint64_t zslba = zone->d.zslba;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_FULL:
        trace_pci_nvme_err_zone_is_full(zslba);
        return NVME_ZONE_FULL;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zslba);
        return NVME_ZONE_OFFLINE;
    case NVME_ZONE_STATE_READ_ONLY:
        trace_pci_nvme_err_zone_is_read_only(zslba);
        return NVME_ZONE_READ_ONLY;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
                                      uint64_t slba, uint32_t nlb)
{
    uint64_t zcap = nvme_zone_wr_boundary(zone);
    uint16_t status;

    status = nvme_check_zone_state_for_write(zone);
    if (status) {
        return status;
    }

    if (unlikely(slba != zone->w_ptr)) {
        trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
        return NVME_ZONE_INVALID_WRITE;
    }

    if (unlikely((slba + nlb) > zcap)) {
        trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
        return NVME_ZONE_BOUNDARY_ERROR;
    }

    return NVME_SUCCESS;
}

static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_FULL:
    case NVME_ZONE_STATE_CLOSED:
    case NVME_ZONE_STATE_READ_ONLY:
        return NVME_SUCCESS;
    case NVME_ZONE_STATE_OFFLINE:
        trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
        return NVME_ZONE_OFFLINE;
    default:
        assert(false);
    }

    return NVME_INTERNAL_DEV_ERROR;
}

static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
                                     uint32_t nlb)
{
    NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
    uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
    uint64_t end = slba + nlb;
    uint16_t status;

    status = nvme_check_zone_state_for_read(zone);
    if (status) {
        ;
    } else if (unlikely(end > bndry)) {
        if (!ns->params.cross_zone_read) {
            status = NVME_ZONE_BOUNDARY_ERROR;
        } else {
            /*
             * Read across zone boundary - check that all subsequent
             * zones that are being read have an appropriate state.
             */
            do {
                zone++;
                status = nvme_check_zone_state_for_read(zone);
                if (status) {
                    break;
                }
            } while (end > nvme_zone_rd_boundary(ns, zone));
        }
    }

    return status;
}

static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_FULL:
        return NVME_SUCCESS;

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_EMPTY:
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
{
    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
{
    NvmeZone *zone;

    if (ns->params.max_open_zones &&
        ns->nr_open_zones == ns->params.max_open_zones) {
        zone = QTAILQ_FIRST(&ns->imp_open_zones);
        if (zone) {
            /*
             * Automatically close this implicitly open zone.
             */
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            nvme_zrm_close(ns, zone);
        }
    }
}
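
/*
 * Zone Resource Management open. Transition a zone to the implicitly or
 * explicitly open state, activating it first when it is empty and closing
 * another implicitly open zone when the open resource limit would otherwise
 * be exceeded.
 */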
static uint16_t __nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone,
                                bool implicit)
{
    int act = 0;
    uint16_t status;

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EMPTY:
        act = 1;

        /* fallthrough */

    case NVME_ZONE_STATE_CLOSED:
        nvme_zrm_auto_transition_zone(ns);
        status = nvme_aor_check(ns, act, 1);
        if (status) {
            return status;
        }

        if (act) {
            nvme_aor_inc_active(ns);
        }

        nvme_aor_inc_open(ns);

        if (implicit) {
            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
            return NVME_SUCCESS;
        }

        /* fallthrough */

    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        if (implicit) {
            return NVME_SUCCESS;
        }

        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);

        /* fallthrough */

    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        return NVME_SUCCESS;

    default:
        return NVME_ZONE_INVAL_TRANSITION;
    }
}

static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone)
{
    return __nvme_zrm_open(ns, zone, true);
}

static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone)
{
    return __nvme_zrm_open(ns, zone, false);
}

static void __nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
                                   uint32_t nlb)
{
    zone->d.wp += nlb;

    if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
        nvme_zrm_finish(ns, zone);
    }
}

static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeZone *zone;
    uint64_t slba;
    uint32_t nlb;

    slba = le64_to_cpu(rw->slba);
    nlb = le16_to_cpu(rw->nlb) + 1;
    zone = nvme_get_zone_by_slba(ns, slba);

    __nvme_advance_zone_wp(ns, zone, nlb);
}

static inline bool nvme_is_write(NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;

    return rw->opcode == NVME_CMD_WRITE ||
           rw->opcode == NVME_CMD_ZONE_APPEND ||
           rw->opcode == NVME_CMD_WRITE_ZEROES;
}

static void nvme_misc_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

void nvme_rw_complete_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, acct);
    }

    if (ns->params.zoned && nvme_is_write(req)) {
        nvme_finalize_zoned_write(ns, req);
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeNamespace *ns = req->ns;

    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    if (nvme_msize(ns)) {
        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
        uint64_t slba = le64_to_cpu(rw->slba);
        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
        uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);

        if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
            size_t mlen = nvme_m2b(ns, nlb);

            req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
                                               BDRV_REQ_MAY_UNMAP,
                                               nvme_rw_complete_cb, req);
            return;
        }

        if (nvme_ns_ext(ns) || req->cmd.mptr) {
            uint16_t status;

            nvme_sg_unmap(&req->sg);
            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
            if (status) {
                ret = -EFAULT;
                goto out;
            }

            if (req->cmd.opcode == NVME_CMD_READ) {
                return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
            }

            return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
        }
    }

out:
    nvme_rw_complete_cb(req, ret);
}
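
/*
 * Format NVM completion path: each namespace is formatted by zeroing out its
 * data (and, when present, metadata) with separate write zeroes AIOs.
 * `count` tracks the operations outstanding for one namespace, while
 * req->opaque counts the namespaces still being formatted; the request
 * completes when both reach zero.
 */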
struct nvme_aio_format_ctx {
    NvmeRequest *req;
    NvmeNamespace *ns;

    /* number of outstanding write zeroes for this namespace */
    int *count;
};

static void nvme_aio_format_cb(void *opaque, int ret)
{
    struct nvme_aio_format_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = ctx->ns;
    uintptr_t *num_formats = (uintptr_t *)&req->opaque;
    int *count = ctx->count;

    g_free(ctx);

    if (ret) {
        nvme_aio_err(req, ret);
    }

    if (--(*count)) {
        return;
    }

    g_free(count);
    ns->status = 0x0;

    if (--(*num_formats)) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

struct nvme_aio_flush_ctx {
    NvmeRequest *req;
    NvmeNamespace *ns;
    BlockAcctCookie acct;
};

static void nvme_aio_flush_cb(void *opaque, int ret)
{
    struct nvme_aio_flush_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;

    BlockBackend *blk = ctx->ns->blkconf.blk;
    BlockAcctCookie *acct = &ctx->acct;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));

    if (!ret) {
        block_acct_done(stats, acct);
    } else {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
    }

    (*num_flushes)--;
    g_free(ctx);

    if (*num_flushes) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    BlockAcctCookie *acct = &req->acct;
    BlockAcctStats *stats = blk_get_stats(blk);
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint16_t ctrl = le16_to_cpu(rw->control);
    uint16_t apptag = le16_to_cpu(rw->apptag);
    uint16_t appmask = le16_to_cpu(rw->appmask);
    uint32_t reftag = le32_to_cpu(rw->reftag);
    uint16_t status;

    trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
                             appmask, reftag);

    if (ret) {
        block_acct_failed(stats, acct);
        nvme_aio_err(req, ret);
        goto out;
    }

    block_acct_done(stats, acct);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
                                       ctx->mdata.iov.size, slba);
        if (status) {
            req->status = status;
            goto out;
        }

        req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
                                     ctx->mdata.bounce, ctx->mdata.iov.size,
                                     ctrl, slba, apptag, appmask, reftag);
    }

out:
    qemu_iovec_destroy(&ctx->data.iov);
    g_free(ctx->data.bounce);

    qemu_iovec_destroy(&ctx->mdata.iov);
    g_free(ctx->mdata.bounce);

    g_free(ctx);

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_verify_mdata_in_cb(void *opaque, int ret)
{
    NvmeBounceContext *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t mlen = nvme_m2b(ns, nlb);
    uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
    BlockBackend *blk = ns->blkconf.blk;

    trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        goto out;
    }

    ctx->mdata.bounce = g_malloc(mlen);

    qemu_iovec_reset(&ctx->mdata.iov);
    qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);

    req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
                                nvme_verify_cb, ctx);
    return;

out:
    nvme_verify_cb(ctx, ret);
}

static void nvme_aio_discard_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    uintptr_t *discards = (uintptr_t *)&req->opaque;

    trace_pci_nvme_aio_discard_cb(nvme_cid(req));

    if (ret) {
        nvme_aio_err(req, ret);
    }

    (*discards)--;

    if (*discards) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

struct nvme_zone_reset_ctx {
    NvmeRequest *req;
    NvmeZone *zone;
};

static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret)
{
    struct nvme_zone_reset_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeZone *zone = ctx->zone;
    uintptr_t *resets = (uintptr_t *)&req->opaque;

    if (ret) {
        nvme_aio_err(req, ret);
        goto out;
    }

    switch (nvme_get_zone_state(zone)) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        nvme_aor_dec_open(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_CLOSED:
        nvme_aor_dec_active(ns);
        /* fallthrough */
    case NVME_ZONE_STATE_FULL:
        zone->w_ptr = zone->d.zslba;
        zone->d.wp = zone->w_ptr;
        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
        /* fallthrough */
    default:
        break;
    }

out:
    g_free(ctx);

    (*resets)--;

    if (*resets) {
        return;
    }

    nvme_enqueue_req_completion(nvme_cq(req), req);
}

static void nvme_aio_zone_reset_cb(void *opaque, int ret)
{
    struct nvme_zone_reset_ctx *ctx = opaque;
    NvmeRequest *req = ctx->req;
    NvmeNamespace *ns = req->ns;
    NvmeZone *zone = ctx->zone;

    trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);

    if (ret) {
        goto out;
    }

    if (nvme_msize(ns)) {
        int64_t offset = ns->mdata_offset + nvme_m2b(ns, zone->d.zslba);

        blk_aio_pwrite_zeroes(ns->blkconf.blk, offset,
                              nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
                              nvme_aio_zone_reset_complete_cb, ctx);
        return;
    }

out:
    nvme_aio_zone_reset_complete_cb(opaque, ret);
}
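
/*
 * Copy command bounce machinery: all source ranges are first read into
 * `bounce` (data) and `mbounce` (metadata). `copies` counts the outstanding
 * read AIOs; the write to the destination LBA is kicked off from
 * nvme_copy_in_complete() once the last read finishes.
 */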
2136struct nvme_copy_ctx {
2137 int copies;
2138 uint8_t *bounce;
2139 uint8_t *mbounce;
2140 uint32_t nlb;
2141 NvmeCopySourceRange *ranges;
2142};
2143
2144struct nvme_copy_in_ctx {
2145 NvmeRequest *req;
2146 QEMUIOVector iov;
2147 NvmeCopySourceRange *range;
2148};
2149
2150static void nvme_copy_complete_cb(void *opaque, int ret)
2151{
2152 NvmeRequest *req = opaque;
2153 NvmeNamespace *ns = req->ns;
2154 struct nvme_copy_ctx *ctx = req->opaque;
2155
2156 if (ret) {
2157 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
2158 nvme_aio_err(req, ret);
2159 goto out;
2160 }
2161
2162 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
2163
2164out:
2165 if (ns->params.zoned) {
2166 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2167 uint64_t sdlba = le64_to_cpu(copy->sdlba);
2168 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
2169
2170 __nvme_advance_zone_wp(ns, zone, ctx->nlb);
2171 }
2172
2173 g_free(ctx->bounce);
2174 g_free(ctx->mbounce);
2175 g_free(ctx);
2176
2177 nvme_enqueue_req_completion(nvme_cq(req), req);
2178}
2179
2180static void nvme_copy_cb(void *opaque, int ret)
2181{
2182 NvmeRequest *req = opaque;
2183 NvmeNamespace *ns = req->ns;
2184 struct nvme_copy_ctx *ctx = req->opaque;
2185
2186 trace_pci_nvme_copy_cb(nvme_cid(req));
2187
2188 if (ret) {
2189 goto out;
2190 }
2191
2192 if (nvme_msize(ns)) {
2193 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2194 uint64_t sdlba = le64_to_cpu(copy->sdlba);
2195 int64_t offset = ns->mdata_offset + nvme_m2b(ns, sdlba);
2196
2197 qemu_iovec_reset(&req->sg.iov);
2198 qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb));
2199
2200 req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0,
2201 nvme_copy_complete_cb, req);
2202 return;
2203 }
2204
2205out:
2206 nvme_copy_complete_cb(opaque, ret);
2207}
2208
2209static void nvme_copy_in_complete(NvmeRequest *req)
2210{
2211 NvmeNamespace *ns = req->ns;
2212 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2213 struct nvme_copy_ctx *ctx = req->opaque;
2214 uint64_t sdlba = le64_to_cpu(copy->sdlba);
2215 uint16_t status;
2216
2217 trace_pci_nvme_copy_in_complete(nvme_cid(req));
2218
2219 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
2220
2221 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2222 uint16_t prinfor = (copy->control[0] >> 4) & 0xf;
2223 uint16_t prinfow = (copy->control[2] >> 2) & 0xf;
2224 uint16_t nr = copy->nr + 1;
2225 NvmeCopySourceRange *range;
2226 uint64_t slba;
2227 uint32_t nlb;
2228 uint16_t apptag, appmask;
2229 uint32_t reftag;
2230 uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce;
2231 size_t len, mlen;
2232 int i;
2233
2234
2235
2236
2237
2238 prinfor = prinfor << 10;
2239 prinfow = prinfow << 10;
2240
2241 for (i = 0; i < nr; i++) {
2242 range = &ctx->ranges[i];
2243 slba = le64_to_cpu(range->slba);
2244 nlb = le16_to_cpu(range->nlb) + 1;
2245 len = nvme_l2b(ns, nlb);
2246 mlen = nvme_m2b(ns, nlb);
2247 apptag = le16_to_cpu(range->apptag);
2248 appmask = le16_to_cpu(range->appmask);
2249 reftag = le32_to_cpu(range->reftag);
2250
2251 status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba,
2252 apptag, appmask, reftag);
2253 if (status) {
2254 goto invalid;
2255 }
2256
2257 buf += len;
2258 mbuf += mlen;
2259 }
2260
2261 apptag = le16_to_cpu(copy->apptag);
2262 appmask = le16_to_cpu(copy->appmask);
2263 reftag = le32_to_cpu(copy->reftag);
2264
2265 if (prinfow & NVME_RW_PRINFO_PRACT) {
2266 size_t len = nvme_l2b(ns, ctx->nlb);
2267 size_t mlen = nvme_m2b(ns, ctx->nlb);
2268
2269 status = nvme_check_prinfo(ns, prinfow, sdlba, reftag);
2270 if (status) {
2271 goto invalid;
2272 }
2273
2274 nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce,
2275 mlen, apptag, reftag);
2276 } else {
2277 status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen,
2278 prinfow, sdlba, apptag, appmask, reftag);
2279 if (status) {
2280 goto invalid;
2281 }
2282 }
2283 }
2284
2285 status = nvme_check_bounds(ns, sdlba, ctx->nlb);
2286 if (status) {
2287 trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
2288 goto invalid;
2289 }
2290
2291 if (ns->params.zoned) {
2292 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
2293
2294 status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb);
2295 if (status) {
2296 goto invalid;
2297 }
2298
2299 status = nvme_zrm_auto(ns, zone);
2300 if (status) {
2301 goto invalid;
2302 }
2303
2304 zone->w_ptr += ctx->nlb;
2305 }
2306
2307 qemu_iovec_init(&req->sg.iov, 1);
2308 qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
2309
2310 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
2311 BLOCK_ACCT_WRITE);
2312
2313 req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
2314 &req->sg.iov, 0, nvme_copy_cb, req);
2315
2316 return;
2317
2318invalid:
2319 req->status = status;
2320
2321 g_free(ctx->bounce);
2322 g_free(ctx);
2323
2324 nvme_enqueue_req_completion(nvme_cq(req), req);
2325}
2326
2327static void nvme_aio_copy_in_cb(void *opaque, int ret)
2328{
2329 struct nvme_copy_in_ctx *in_ctx = opaque;
2330 NvmeRequest *req = in_ctx->req;
2331 NvmeNamespace *ns = req->ns;
2332 struct nvme_copy_ctx *ctx = req->opaque;
2333
2334 qemu_iovec_destroy(&in_ctx->iov);
2335 g_free(in_ctx);
2336
2337 trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
2338
2339 if (ret) {
2340 nvme_aio_err(req, ret);
2341 }
2342
2343 ctx->copies--;
2344
2345 if (ctx->copies) {
2346 return;
2347 }
2348
2349 if (req->status) {
2350 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
2351
2352 g_free(ctx->bounce);
2353 g_free(ctx->mbounce);
2354 g_free(ctx);
2355
2356 nvme_enqueue_req_completion(nvme_cq(req), req);
2357
2358 return;
2359 }
2360
2361 nvme_copy_in_complete(req);
2362}
2363
2364struct nvme_compare_ctx {
2365 struct {
2366 QEMUIOVector iov;
2367 uint8_t *bounce;
2368 } data;
2369
2370 struct {
2371 QEMUIOVector iov;
2372 uint8_t *bounce;
2373 } mdata;
2374};
2375
2376static void nvme_compare_mdata_cb(void *opaque, int ret)
2377{
2378 NvmeRequest *req = opaque;
2379 NvmeNamespace *ns = req->ns;
2380 NvmeCtrl *n = nvme_ctrl(req);
2381 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2382 uint16_t ctrl = le16_to_cpu(rw->control);
2383 uint16_t apptag = le16_to_cpu(rw->apptag);
2384 uint16_t appmask = le16_to_cpu(rw->appmask);
2385 uint32_t reftag = le32_to_cpu(rw->reftag);
2386 struct nvme_compare_ctx *ctx = req->opaque;
2387 g_autofree uint8_t *buf = NULL;
2388 uint16_t status = NVME_SUCCESS;
2389
2390 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2391
2392 buf = g_malloc(ctx->mdata.iov.size);
2393
2394 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2395 NVME_TX_DIRECTION_TO_DEVICE, req);
2396 if (status) {
2397 req->status = status;
2398 goto out;
2399 }
2400
2401 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2402 uint64_t slba = le64_to_cpu(rw->slba);
2403 uint8_t *bufp;
2404 uint8_t *mbufp = ctx->mdata.bounce;
2405 uint8_t *end = mbufp + ctx->mdata.iov.size;
2406 size_t msize = nvme_msize(ns);
2407 int16_t pil = 0;
2408
2409 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2410 ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
2411 slba, apptag, appmask, reftag);
2412 if (status) {
2413 req->status = status;
2414 goto out;
2415 }
2416
        /*
         * When formatted with protection information, do not compare the DIF
         * tuple.
         */
2421 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2422 pil = nvme_msize(ns) - sizeof(NvmeDifTuple);
2423 }
2424
2425 for (bufp = buf; mbufp < end; bufp += msize, mbufp += msize) {
2426 if (memcmp(bufp + pil, mbufp + pil, msize - pil)) {
2427 req->status = NVME_CMP_FAILURE;
2428 goto out;
2429 }
2430 }
2431
2432 goto out;
2433 }
2434
2435 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2436 req->status = NVME_CMP_FAILURE;
2437 goto out;
2438 }
2439
2440out:
2441 qemu_iovec_destroy(&ctx->data.iov);
2442 g_free(ctx->data.bounce);
2443
2444 qemu_iovec_destroy(&ctx->mdata.iov);
2445 g_free(ctx->mdata.bounce);
2446
2447 g_free(ctx);
2448
2449 nvme_enqueue_req_completion(nvme_cq(req), req);
2450}
2451
2452static void nvme_compare_data_cb(void *opaque, int ret)
2453{
2454 NvmeRequest *req = opaque;
2455 NvmeCtrl *n = nvme_ctrl(req);
2456 NvmeNamespace *ns = req->ns;
2457 BlockBackend *blk = ns->blkconf.blk;
2458 BlockAcctCookie *acct = &req->acct;
2459 BlockAcctStats *stats = blk_get_stats(blk);
2460
2461 struct nvme_compare_ctx *ctx = req->opaque;
2462 g_autofree uint8_t *buf = NULL;
2463 uint16_t status;
2464
2465 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2466
2467 if (ret) {
2468 block_acct_failed(stats, acct);
2469 nvme_aio_err(req, ret);
2470 goto out;
2471 }
2472
2473 buf = g_malloc(ctx->data.iov.size);
2474
2475 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2476 NVME_TX_DIRECTION_TO_DEVICE, req);
2477 if (status) {
2478 req->status = status;
2479 goto out;
2480 }
2481
2482 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2483 req->status = NVME_CMP_FAILURE;
2484 goto out;
2485 }
2486
2487 if (nvme_msize(ns)) {
2488 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2489 uint64_t slba = le64_to_cpu(rw->slba);
2490 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2491 size_t mlen = nvme_m2b(ns, nlb);
2492 uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
2493
2494 ctx->mdata.bounce = g_malloc(mlen);
2495
2496 qemu_iovec_init(&ctx->mdata.iov, 1);
2497 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2498
2499 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2500 nvme_compare_mdata_cb, req);
2501 return;
2502 }
2503
2504 block_acct_done(stats, acct);
2505
2506out:
2507 qemu_iovec_destroy(&ctx->data.iov);
2508 g_free(ctx->data.bounce);
2509 g_free(ctx);
2510
2511 nvme_enqueue_req_completion(nvme_cq(req), req);
2512}
2513
2514static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2515{
2516 NvmeNamespace *ns = req->ns;
2517 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2518
2519 uint32_t attr = le32_to_cpu(dsm->attributes);
2520 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2521
2522 uint16_t status = NVME_SUCCESS;
2523
2524 trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
2525
2526 if (attr & NVME_DSMGMT_AD) {
2527 int64_t offset;
2528 size_t len;
2529 NvmeDsmRange range[nr];
2530 uintptr_t *discards = (uintptr_t *)&req->opaque;
2531
2532 status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
2533 if (status) {
2534 return status;
2535 }
2536
        /*
         * AIO callbacks may be invoked immediately, so initialize the number
         * of outstanding discards to 1 to make sure the callback does not
         * complete the request before all discards have been issued.
         */
2542 *discards = 1;
2543
2544 for (int i = 0; i < nr; i++) {
2545 uint64_t slba = le64_to_cpu(range[i].slba);
2546 uint32_t nlb = le32_to_cpu(range[i].nlb);
2547
2548 if (nvme_check_bounds(ns, slba, nlb)) {
2549 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2550 ns->id_ns.nsze);
2551 continue;
2552 }
2553
2554 trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
2555 nlb);
2556
2557 if (nlb > n->dmrsl) {
2558 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2559 }
2560
2561 offset = nvme_l2b(ns, slba);
2562 len = nvme_l2b(ns, nlb);
2563
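            /*
             * A single deallocation range may exceed the block layer request
             * limit, so issue the discard in BDRV_REQUEST_MAX_BYTES-sized
             * chunks.
             */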
2564 while (len) {
2565 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
2566
2567 (*discards)++;
2568
2569 blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
2570 nvme_aio_discard_cb, req);
2571
2572 offset += bytes;
2573 len -= bytes;
2574 }
2575 }
2576
        /* account for the 1-initialization */
2578 (*discards)--;
2579
2580 if (*discards) {
2581 status = NVME_NO_COMPLETE;
2582 } else {
2583 status = req->status;
2584 }
2585 }
2586
2587 return status;
2588}
2589
2590static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2591{
2592 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2593 NvmeNamespace *ns = req->ns;
2594 BlockBackend *blk = ns->blkconf.blk;
2595 uint64_t slba = le64_to_cpu(rw->slba);
2596 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2597 size_t len = nvme_l2b(ns, nlb);
2598 int64_t offset = nvme_l2b(ns, slba);
2599 uint16_t ctrl = le16_to_cpu(rw->control);
2600 uint32_t reftag = le32_to_cpu(rw->reftag);
2601 NvmeBounceContext *ctx = NULL;
2602 uint16_t status;
2603
2604 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2605
2606 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2607 status = nvme_check_prinfo(ns, ctrl, slba, reftag);
2608 if (status) {
2609 return status;
2610 }
2611
2612 if (ctrl & NVME_RW_PRINFO_PRACT) {
2613 return NVME_INVALID_PROT_INFO | NVME_DNR;
2614 }
2615 }
2616
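    /* the size of a Verify operation is limited to (page size) << vsl bytes */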
2617 if (len > n->page_size << n->params.vsl) {
2618 return NVME_INVALID_FIELD | NVME_DNR;
2619 }
2620
2621 status = nvme_check_bounds(ns, slba, nlb);
2622 if (status) {
2623 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2624 return status;
2625 }
2626
2627 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2628 status = nvme_check_dulbe(ns, slba, nlb);
2629 if (status) {
2630 return status;
2631 }
2632 }
2633
2634 ctx = g_new0(NvmeBounceContext, 1);
2635 ctx->req = req;
2636
2637 ctx->data.bounce = g_malloc(len);
2638
2639 qemu_iovec_init(&ctx->data.iov, 1);
2640 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2641
2642 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2643 BLOCK_ACCT_READ);
2644
2645 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2646 nvme_verify_mdata_in_cb, ctx);
2647 return NVME_NO_COMPLETE;
2648}
2649
2650static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2651{
2652 NvmeNamespace *ns = req->ns;
2653 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2654
2655 uint16_t nr = copy->nr + 1;
2656 uint8_t format = copy->control[0] & 0xf;
2657
    /*
     * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the
     * NVME_RW_PRINFO constants.
     */
2662 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10;
2663 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10;
2664
2665 uint32_t nlb = 0;
2666 uint8_t *bounce = NULL, *bouncep = NULL;
2667 uint8_t *mbounce = NULL, *mbouncep = NULL;
2668 struct nvme_copy_ctx *ctx;
2669 uint16_t status;
2670 int i;
2671
2672 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2673
2674 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2675 ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) {
2676 return NVME_INVALID_FIELD | NVME_DNR;
2677 }
2678
2679 if (!(n->id_ctrl.ocfs & (1 << format))) {
2680 trace_pci_nvme_err_copy_invalid_format(format);
2681 return NVME_INVALID_FIELD | NVME_DNR;
2682 }
2683
2684 if (nr > ns->id_ns.msrc + 1) {
2685 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2686 }
2687
2688 ctx = g_new(struct nvme_copy_ctx, 1);
2689 ctx->ranges = g_new(NvmeCopySourceRange, nr);
2690
2691 status = nvme_h2c(n, (uint8_t *)ctx->ranges,
2692 nr * sizeof(NvmeCopySourceRange), req);
2693 if (status) {
2694 goto out;
2695 }
2696
2697 for (i = 0; i < nr; i++) {
2698 uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
2699 uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
2700
2701 if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2702 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2703 goto out;
2704 }
2705
2706 status = nvme_check_bounds(ns, slba, _nlb);
2707 if (status) {
2708 trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze);
2709 goto out;
2710 }
2711
2712 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2713 status = nvme_check_dulbe(ns, slba, _nlb);
2714 if (status) {
2715 goto out;
2716 }
2717 }
2718
2719 if (ns->params.zoned) {
2720 status = nvme_check_zone_read(ns, slba, _nlb);
2721 if (status) {
2722 goto out;
2723 }
2724 }
2725
2726 nlb += _nlb;
2727 }
2728
2729 if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
2730 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2731 goto out;
2732 }
2733
2734 bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
2735 if (nvme_msize(ns)) {
2736 mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb));
2737 }
2738
2739 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
2740 BLOCK_ACCT_READ);
2741
2742 ctx->bounce = bounce;
2743 ctx->mbounce = mbounce;
2744 ctx->nlb = nlb;
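    /* 1-initialize; see the comment in nvme_dsm */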
2745 ctx->copies = 1;
2746
2747 req->opaque = ctx;
2748
2749 for (i = 0; i < nr; i++) {
2750 uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
2751 uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
2752
2753 size_t len = nvme_l2b(ns, nlb);
2754 int64_t offset = nvme_l2b(ns, slba);
2755
2756 trace_pci_nvme_copy_source_range(slba, nlb);
2757
2758 struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1);
2759 in_ctx->req = req;
2760
2761 qemu_iovec_init(&in_ctx->iov, 1);
2762 qemu_iovec_add(&in_ctx->iov, bouncep, len);
2763
2764 ctx->copies++;
2765
2766 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
2767 nvme_aio_copy_in_cb, in_ctx);
2768
2769 bouncep += len;
2770
2771 if (nvme_msize(ns)) {
2772 len = nvme_m2b(ns, nlb);
2773 offset = ns->mdata_offset + nvme_m2b(ns, slba);
2774
2775 in_ctx = g_new(struct nvme_copy_in_ctx, 1);
2776 in_ctx->req = req;
2777
2778 qemu_iovec_init(&in_ctx->iov, 1);
2779 qemu_iovec_add(&in_ctx->iov, mbouncep, len);
2780
2781 ctx->copies++;
2782
2783 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
2784 nvme_aio_copy_in_cb, in_ctx);
2785
2786 mbouncep += len;
2787 }
2788 }
2789
    /* account for the 1-initialization */
2791 ctx->copies--;
2792
2793 if (!ctx->copies) {
2794 nvme_copy_in_complete(req);
2795 }
2796
2797 return NVME_NO_COMPLETE;
2798
2799out:
2800 g_free(ctx->ranges);
2801 g_free(ctx);
2802
2803 return status;
2804}
2805
2806static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2807{
2808 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2809 NvmeNamespace *ns = req->ns;
2810 BlockBackend *blk = ns->blkconf.blk;
2811 uint64_t slba = le64_to_cpu(rw->slba);
2812 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2813 uint16_t ctrl = le16_to_cpu(rw->control);
2814 size_t data_len = nvme_l2b(ns, nlb);
2815 size_t len = data_len;
2816 int64_t offset = nvme_l2b(ns, slba);
2817 struct nvme_compare_ctx *ctx = NULL;
2818 uint16_t status;
2819
2820 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2821
2822 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) {
2823 return NVME_INVALID_PROT_INFO | NVME_DNR;
2824 }
2825
2826 if (nvme_ns_ext(ns)) {
2827 len += nvme_m2b(ns, nlb);
2828 }
2829
2830 status = nvme_check_mdts(n, len);
2831 if (status) {
2832 return status;
2833 }
2834
2835 status = nvme_check_bounds(ns, slba, nlb);
2836 if (status) {
2837 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2838 return status;
2839 }
2840
2841 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2842 status = nvme_check_dulbe(ns, slba, nlb);
2843 if (status) {
2844 return status;
2845 }
2846 }
2847
2848 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2849 if (status) {
2850 return status;
2851 }
2852
2853 ctx = g_new(struct nvme_compare_ctx, 1);
2854 ctx->data.bounce = g_malloc(data_len);
2855
2856 req->opaque = ctx;
2857
2858 qemu_iovec_init(&ctx->data.iov, 1);
2859 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2860
2861 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2862 BLOCK_ACCT_READ);
2863 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
2864 nvme_compare_data_cb, req);
2865
2866 return NVME_NO_COMPLETE;
2867}
2868
2869static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
2870{
2871 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
2872 uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
2873 uint16_t status;
2874 struct nvme_aio_flush_ctx *ctx;
2875 NvmeNamespace *ns;
2876
2877 trace_pci_nvme_flush(nvme_cid(req), nsid);
2878
2879 if (nsid != NVME_NSID_BROADCAST) {
2880 req->ns = nvme_ns(n, nsid);
2881 if (unlikely(!req->ns)) {
2882 return NVME_INVALID_FIELD | NVME_DNR;
2883 }
2884
2885 block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
2886 BLOCK_ACCT_FLUSH);
2887 req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
2888 return NVME_NO_COMPLETE;
2889 }
2890
    /* 1-initialize; see the comment in nvme_dsm */
2892 *num_flushes = 1;
2893
2894 for (int i = 1; i <= n->num_namespaces; i++) {
2895 ns = nvme_ns(n, i);
2896 if (!ns) {
2897 continue;
2898 }
2899
2900 ctx = g_new(struct nvme_aio_flush_ctx, 1);
2901 ctx->req = req;
2902 ctx->ns = ns;
2903
2904 (*num_flushes)++;
2905
2906 block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
2907 BLOCK_ACCT_FLUSH);
2908 blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
2909 }
2910
    /* account for the 1-initialization */
2912 (*num_flushes)--;
2913
2914 if (*num_flushes) {
2915 status = NVME_NO_COMPLETE;
2916 } else {
2917 status = req->status;
2918 }
2919
2920 return status;
2921}
2922
2923static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
2924{
2925 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2926 NvmeNamespace *ns = req->ns;
2927 uint64_t slba = le64_to_cpu(rw->slba);
2928 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2929 uint16_t ctrl = le16_to_cpu(rw->control);
2930 uint64_t data_size = nvme_l2b(ns, nlb);
2931 uint64_t mapped_size = data_size;
2932 uint64_t data_offset;
2933 BlockBackend *blk = ns->blkconf.blk;
2934 uint16_t status;
2935
2936 if (nvme_ns_ext(ns)) {
2937 mapped_size += nvme_m2b(ns, nlb);
2938
2939 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2940 bool pract = ctrl & NVME_RW_PRINFO_PRACT;
2941
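            /*
             * With PRACT set and 8 bytes of metadata (i.e. the metadata is
             * the protection information tuple only), the PI is stripped by
             * the controller and is not part of the host transfer.
             */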
2942 if (pract && nvme_msize(ns) == 8) {
2943 mapped_size = data_size;
2944 }
2945 }
2946 }
2947
2948 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
2949
2950 status = nvme_check_mdts(n, mapped_size);
2951 if (status) {
2952 goto invalid;
2953 }
2954
2955 status = nvme_check_bounds(ns, slba, nlb);
2956 if (status) {
2957 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2958 goto invalid;
2959 }
2960
2961 if (ns->params.zoned) {
2962 status = nvme_check_zone_read(ns, slba, nlb);
2963 if (status) {
2964 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
2965 goto invalid;
2966 }
2967 }
2968
2969 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2970 status = nvme_check_dulbe(ns, slba, nlb);
2971 if (status) {
2972 goto invalid;
2973 }
2974 }
2975
2976 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2977 return nvme_dif_rw(n, req);
2978 }
2979
2980 status = nvme_map_data(n, nlb, req);
2981 if (status) {
2982 goto invalid;
2983 }
2984
2985 data_offset = nvme_l2b(ns, slba);
2986
2987 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
2988 BLOCK_ACCT_READ);
2989 nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
2990 return NVME_NO_COMPLETE;
2991
2992invalid:
2993 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
2994 return status | NVME_DNR;
2995}
2996
2997static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
2998 bool wrz)
2999{
3000 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3001 NvmeNamespace *ns = req->ns;
3002 uint64_t slba = le64_to_cpu(rw->slba);
3003 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3004 uint16_t ctrl = le16_to_cpu(rw->control);
3005 uint64_t data_size = nvme_l2b(ns, nlb);
3006 uint64_t mapped_size = data_size;
3007 uint64_t data_offset;
3008 NvmeZone *zone;
3009 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3010 BlockBackend *blk = ns->blkconf.blk;
3011 uint16_t status;
3012
3013 if (nvme_ns_ext(ns)) {
3014 mapped_size += nvme_m2b(ns, nlb);
3015
3016 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3017 bool pract = ctrl & NVME_RW_PRINFO_PRACT;
3018
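            /*
             * With PRACT set and 8 bytes of metadata, the protection
             * information is generated by the controller rather than
             * transferred from the host.
             */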
3019 if (pract && nvme_msize(ns) == 8) {
3020 mapped_size -= nvme_m2b(ns, nlb);
3021 }
3022 }
3023 }
3024
3025 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3026 nvme_nsid(ns), nlb, mapped_size, slba);
3027
3028 if (!wrz) {
3029 status = nvme_check_mdts(n, mapped_size);
3030 if (status) {
3031 goto invalid;
3032 }
3033 }
3034
3035 status = nvme_check_bounds(ns, slba, nlb);
3036 if (status) {
3037 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
3038 goto invalid;
3039 }
3040
3041 if (ns->params.zoned) {
3042 zone = nvme_get_zone_by_slba(ns, slba);
3043
3044 if (append) {
3045 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3046
3047 if (unlikely(slba != zone->d.zslba)) {
3048 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3049 status = NVME_INVALID_FIELD;
3050 goto invalid;
3051 }
3052
3053 if (n->params.zasl &&
3054 data_size > (uint64_t)n->page_size << n->params.zasl) {
3055 trace_pci_nvme_err_zasl(data_size);
3056 return NVME_INVALID_FIELD | NVME_DNR;
3057 }
3058
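            /*
             * Zone Append writes at the current write pointer; the effective
             * starting LBA is returned to the host in the completion entry.
             */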
3059 slba = zone->w_ptr;
3060 rw->slba = cpu_to_le64(slba);
3061 res->slba = cpu_to_le64(slba);
3062
3063 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3064 case NVME_ID_NS_DPS_TYPE_1:
3065 if (!piremap) {
3066 return NVME_INVALID_PROT_INFO | NVME_DNR;
3067 }
3068
                /* fallthrough */
3070
3071 case NVME_ID_NS_DPS_TYPE_2:
3072 if (piremap) {
3073 uint32_t reftag = le32_to_cpu(rw->reftag);
3074 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3075 }
3076
3077 break;
3078
3079 case NVME_ID_NS_DPS_TYPE_3:
3080 if (piremap) {
3081 return NVME_INVALID_PROT_INFO | NVME_DNR;
3082 }
3083
3084 break;
3085 }
3086 }
3087
3088 status = nvme_check_zone_write(ns, zone, slba, nlb);
3089 if (status) {
3090 goto invalid;
3091 }
3092
3093 status = nvme_zrm_auto(ns, zone);
3094 if (status) {
3095 goto invalid;
3096 }
3097
3098 zone->w_ptr += nlb;
3099 }
3100
3101 data_offset = nvme_l2b(ns, slba);
3102
3103 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3104 return nvme_dif_rw(n, req);
3105 }
3106
3107 if (!wrz) {
3108 status = nvme_map_data(n, nlb, req);
3109 if (status) {
3110 goto invalid;
3111 }
3112
3113 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3114 BLOCK_ACCT_WRITE);
3115 nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3116 } else {
3117 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3118 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3119 req);
3120 }
3121
3122 return NVME_NO_COMPLETE;
3123
3124invalid:
3125 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3126 return status | NVME_DNR;
3127}
3128
3129static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3130{
3131 return nvme_do_write(n, req, false, false);
3132}
3133
3134static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3135{
3136 return nvme_do_write(n, req, false, true);
3137}
3138
3139static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3140{
3141 return nvme_do_write(n, req, true, false);
3142}
3143
3144static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3145 uint64_t *slba, uint32_t *zone_idx)
3146{
3147 uint32_t dw10 = le32_to_cpu(c->cdw10);
3148 uint32_t dw11 = le32_to_cpu(c->cdw11);
3149
3150 if (!ns->params.zoned) {
3151 trace_pci_nvme_err_invalid_opc(c->opcode);
3152 return NVME_INVALID_OPCODE | NVME_DNR;
3153 }
3154
3155 *slba = ((uint64_t)dw11) << 32 | dw10;
3156 if (unlikely(*slba >= ns->id_ns.nsze)) {
3157 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3158 *slba = 0;
3159 return NVME_LBA_RANGE | NVME_DNR;
3160 }
3161
3162 *zone_idx = nvme_zone_idx(ns, *slba);
3163 assert(*zone_idx < ns->num_zones);
3164
3165 return NVME_SUCCESS;
3166}
3167
3168typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3169 NvmeRequest *);
3170
3171enum NvmeZoneProcessingMask {
3172 NVME_PROC_CURRENT_ZONE = 0,
3173 NVME_PROC_OPENED_ZONES = 1 << 0,
3174 NVME_PROC_CLOSED_ZONES = 1 << 1,
3175 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3176 NVME_PROC_FULL_ZONES = 1 << 3,
3177};
3178
3179static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3180 NvmeZoneState state, NvmeRequest *req)
3181{
3182 return nvme_zrm_open(ns, zone);
3183}
3184
3185static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3186 NvmeZoneState state, NvmeRequest *req)
3187{
3188 return nvme_zrm_close(ns, zone);
3189}
3190
3191static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3192 NvmeZoneState state, NvmeRequest *req)
3193{
3194 return nvme_zrm_finish(ns, zone);
3195}
3196
3197static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
3198 NvmeZoneState state, NvmeRequest *req)
3199{
3200 uintptr_t *resets = (uintptr_t *)&req->opaque;
3201 struct nvme_zone_reset_ctx *ctx;
3202
3203 switch (state) {
3204 case NVME_ZONE_STATE_EMPTY:
3205 return NVME_SUCCESS;
3206 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3207 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3208 case NVME_ZONE_STATE_CLOSED:
3209 case NVME_ZONE_STATE_FULL:
3210 break;
3211 default:
3212 return NVME_ZONE_INVAL_TRANSITION;
3213 }
3214
    /*
     * The zone reset AIO callback needs to know the zone in order to
     * transition it on completion, so pass it along in a dedicated context.
     */
3219 ctx = g_new(struct nvme_zone_reset_ctx, 1);
3220 ctx->req = req;
3221 ctx->zone = zone;
3222
3223 (*resets)++;
3224
3225 blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
3226 nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
3227 nvme_aio_zone_reset_cb, ctx);
3228
3229 return NVME_NO_COMPLETE;
3230}
3231
3232static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3233 NvmeZoneState state, NvmeRequest *req)
3234{
3235 switch (state) {
3236 case NVME_ZONE_STATE_READ_ONLY:
3237 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
        /* fallthrough */
3239 case NVME_ZONE_STATE_OFFLINE:
3240 return NVME_SUCCESS;
3241 default:
3242 return NVME_ZONE_INVAL_TRANSITION;
3243 }
3244}
3245
3246static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3247{
3248 uint16_t status;
3249 uint8_t state = nvme_get_zone_state(zone);
3250
3251 if (state == NVME_ZONE_STATE_EMPTY) {
3252 status = nvme_aor_check(ns, 1, 0);
3253 if (status) {
3254 return status;
3255 }
3256 nvme_aor_inc_active(ns);
3257 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3258 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3259 return NVME_SUCCESS;
3260 }
3261
3262 return NVME_ZONE_INVAL_TRANSITION;
3263}
3264
3265static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3266 enum NvmeZoneProcessingMask proc_mask,
3267 op_handler_t op_hndlr, NvmeRequest *req)
3268{
3269 uint16_t status = NVME_SUCCESS;
3270 NvmeZoneState zs = nvme_get_zone_state(zone);
3271 bool proc_zone;
3272
3273 switch (zs) {
3274 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3275 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3276 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3277 break;
3278 case NVME_ZONE_STATE_CLOSED:
3279 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3280 break;
3281 case NVME_ZONE_STATE_READ_ONLY:
3282 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3283 break;
3284 case NVME_ZONE_STATE_FULL:
3285 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3286 break;
3287 default:
3288 proc_zone = false;
3289 }
3290
3291 if (proc_zone) {
3292 status = op_hndlr(ns, zone, zs, req);
3293 }
3294
3295 return status;
3296}
3297
3298static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3299 enum NvmeZoneProcessingMask proc_mask,
3300 op_handler_t op_hndlr, NvmeRequest *req)
3301{
3302 NvmeZone *next;
3303 uint16_t status = NVME_SUCCESS;
3304 int i;
3305
3306 if (!proc_mask) {
3307 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3308 } else {
3309 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3310 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3311 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3312 req);
3313 if (status && status != NVME_NO_COMPLETE) {
3314 goto out;
3315 }
3316 }
3317 }
3318 if (proc_mask & NVME_PROC_OPENED_ZONES) {
3319 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3320 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3321 req);
3322 if (status && status != NVME_NO_COMPLETE) {
3323 goto out;
3324 }
3325 }
3326
3327 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3328 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3329 req);
3330 if (status && status != NVME_NO_COMPLETE) {
3331 goto out;
3332 }
3333 }
3334 }
3335 if (proc_mask & NVME_PROC_FULL_ZONES) {
3336 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3337 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3338 req);
3339 if (status && status != NVME_NO_COMPLETE) {
3340 goto out;
3341 }
3342 }
3343 }
3344
3345 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3346 for (i = 0; i < ns->num_zones; i++, zone++) {
3347 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3348 req);
3349 if (status && status != NVME_NO_COMPLETE) {
3350 goto out;
3351 }
3352 }
3353 }
3354 }
3355
3356out:
3357 return status;
3358}
3359
3360static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3361{
3362 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3363 NvmeNamespace *ns = req->ns;
3364 NvmeZone *zone;
3365 uintptr_t *resets;
3366 uint8_t *zd_ext;
3367 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3368 uint64_t slba = 0;
3369 uint32_t zone_idx = 0;
3370 uint16_t status;
3371 uint8_t action;
3372 bool all;
3373 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3374
3375 action = dw13 & 0xff;
3376 all = dw13 & 0x100;
3377
3378 req->status = NVME_SUCCESS;
3379
3380 if (!all) {
3381 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3382 if (status) {
3383 return status;
3384 }
3385 }
3386
3387 zone = &ns->zone_array[zone_idx];
3388 if (slba != zone->d.zslba) {
3389 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3390 return NVME_INVALID_FIELD | NVME_DNR;
3391 }
3392
3393 switch (action) {
3394
3395 case NVME_ZONE_ACTION_OPEN:
3396 if (all) {
3397 proc_mask = NVME_PROC_CLOSED_ZONES;
3398 }
3399 trace_pci_nvme_open_zone(slba, zone_idx, all);
3400 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3401 break;
3402
3403 case NVME_ZONE_ACTION_CLOSE:
3404 if (all) {
3405 proc_mask = NVME_PROC_OPENED_ZONES;
3406 }
3407 trace_pci_nvme_close_zone(slba, zone_idx, all);
3408 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3409 break;
3410
3411 case NVME_ZONE_ACTION_FINISH:
3412 if (all) {
3413 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3414 }
3415 trace_pci_nvme_finish_zone(slba, zone_idx, all);
3416 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3417 break;
3418
3419 case NVME_ZONE_ACTION_RESET:
3420 resets = (uintptr_t *)&req->opaque;
3421
3422 if (all) {
3423 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
3424 NVME_PROC_FULL_ZONES;
3425 }
3426 trace_pci_nvme_reset_zone(slba, zone_idx, all);
3427
3428 *resets = 1;
3429
3430 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
3431
3432 (*resets)--;
3433
3434 return *resets ? NVME_NO_COMPLETE : req->status;
3435
3436 case NVME_ZONE_ACTION_OFFLINE:
3437 if (all) {
3438 proc_mask = NVME_PROC_READ_ONLY_ZONES;
3439 }
3440 trace_pci_nvme_offline_zone(slba, zone_idx, all);
3441 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3442 break;
3443
3444 case NVME_ZONE_ACTION_SET_ZD_EXT:
3445 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3446 if (all || !ns->params.zd_extension_size) {
3447 return NVME_INVALID_FIELD | NVME_DNR;
3448 }
3449 zd_ext = nvme_get_zd_extension(ns, zone_idx);
3450 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3451 if (status) {
3452 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3453 return status;
3454 }
3455
3456 status = nvme_set_zd_ext(ns, zone);
3457 if (status == NVME_SUCCESS) {
3458 trace_pci_nvme_zd_extension_set(zone_idx);
3459 return status;
3460 }
3461 break;
3462
3463 default:
3464 trace_pci_nvme_err_invalid_mgmt_action(action);
3465 status = NVME_INVALID_FIELD;
3466 }
3467
3468 if (status == NVME_ZONE_INVAL_TRANSITION) {
3469 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3470 zone->d.za);
3471 }
3472 if (status) {
3473 status |= NVME_DNR;
3474 }
3475
3476 return status;
3477}
3478
3479static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3480{
3481 NvmeZoneState zs = nvme_get_zone_state(zl);
3482
3483 switch (zafs) {
3484 case NVME_ZONE_REPORT_ALL:
3485 return true;
3486 case NVME_ZONE_REPORT_EMPTY:
3487 return zs == NVME_ZONE_STATE_EMPTY;
3488 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3489 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3490 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3491 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3492 case NVME_ZONE_REPORT_CLOSED:
3493 return zs == NVME_ZONE_STATE_CLOSED;
3494 case NVME_ZONE_REPORT_FULL:
3495 return zs == NVME_ZONE_STATE_FULL;
3496 case NVME_ZONE_REPORT_READ_ONLY:
3497 return zs == NVME_ZONE_STATE_READ_ONLY;
3498 case NVME_ZONE_REPORT_OFFLINE:
3499 return zs == NVME_ZONE_STATE_OFFLINE;
3500 default:
3501 return false;
3502 }
3503}
3504
3505static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3506{
3507 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3508 NvmeNamespace *ns = req->ns;
3509
3510 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
3511 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3512 uint32_t zone_idx, zra, zrasf, partial;
3513 uint64_t max_zones, nr_zones = 0;
3514 uint16_t status;
3515 uint64_t slba;
3516 NvmeZoneDescr *z;
3517 NvmeZone *zone;
3518 NvmeZoneReportHeader *header;
3519 void *buf, *buf_p;
3520 size_t zone_entry_sz;
3521 int i;
3522
3523 req->status = NVME_SUCCESS;
3524
3525 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3526 if (status) {
3527 return status;
3528 }
3529
3530 zra = dw13 & 0xff;
3531 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3532 return NVME_INVALID_FIELD | NVME_DNR;
3533 }
3534 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3535 return NVME_INVALID_FIELD | NVME_DNR;
3536 }
3537
3538 zrasf = (dw13 >> 8) & 0xff;
3539 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3540 return NVME_INVALID_FIELD | NVME_DNR;
3541 }
3542
3543 if (data_size < sizeof(NvmeZoneReportHeader)) {
3544 return NVME_INVALID_FIELD | NVME_DNR;
3545 }
3546
3547 status = nvme_check_mdts(n, data_size);
3548 if (status) {
3549 return status;
3550 }
3551
3552 partial = (dw13 >> 16) & 0x01;
3553
3554 zone_entry_sz = sizeof(NvmeZoneDescr);
3555 if (zra == NVME_ZONE_REPORT_EXTENDED) {
3556 zone_entry_sz += ns->params.zd_extension_size;
3557 }
3558
3559 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3560 buf = g_malloc0(data_size);
3561
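    /*
     * First pass: count the zones matching the filter for the report header.
     * With the partial flag set, the count is capped at the number of
     * descriptors that fit in the buffer.
     */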
3562 zone = &ns->zone_array[zone_idx];
3563 for (i = zone_idx; i < ns->num_zones; i++) {
3564 if (partial && nr_zones >= max_zones) {
3565 break;
3566 }
3567 if (nvme_zone_matches_filter(zrasf, zone++)) {
3568 nr_zones++;
3569 }
3570 }
3571 header = (NvmeZoneReportHeader *)buf;
3572 header->nr_zones = cpu_to_le64(nr_zones);
3573
3574 buf_p = buf + sizeof(NvmeZoneReportHeader);
3575 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3576 zone = &ns->zone_array[zone_idx];
3577 if (nvme_zone_matches_filter(zrasf, zone)) {
3578 z = (NvmeZoneDescr *)buf_p;
3579 buf_p += sizeof(NvmeZoneDescr);
3580
3581 z->zt = zone->d.zt;
3582 z->zs = zone->d.zs;
3583 z->zcap = cpu_to_le64(zone->d.zcap);
3584 z->zslba = cpu_to_le64(zone->d.zslba);
3585 z->za = zone->d.za;
3586
3587 if (nvme_wp_is_valid(zone)) {
3588 z->wp = cpu_to_le64(zone->d.wp);
3589 } else {
3590 z->wp = cpu_to_le64(~0ULL);
3591 }
3592
3593 if (zra == NVME_ZONE_REPORT_EXTENDED) {
3594 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3595 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3596 ns->params.zd_extension_size);
3597 }
3598 buf_p += ns->params.zd_extension_size;
3599 }
3600
3601 max_zones--;
3602 }
3603 }
3604
3605 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3606
3607 g_free(buf);
3608
3609 return status;
3610}
3611
3612static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
3613{
3614 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3615 uint16_t status;
3616
3617 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
3618 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
3619
3620 if (!nvme_nsid_valid(n, nsid)) {
3621 return NVME_INVALID_NSID | NVME_DNR;
3622 }
3623
    /*
     * In the base NVM command set, Flush may apply to all namespaces
     * (indicated by the NSID being set to the broadcast value 0xffffffff).
     * When multiple command sets are in use (TP 4056, Namespace Types), a
     * broadcast NSID cannot be associated with a single I/O command set, so
     * the opcode cannot be validated against a per-namespace effects table.
     *
     * Since this device flushes all namespaces when the broadcast NSID is
     * used, handle Flush here, before the namespace (and thereby the command
     * set) is resolved.
     */
3643 if (req->cmd.opcode == NVME_CMD_FLUSH) {
3644 return nvme_flush(n, req);
3645 }
3646
3647 req->ns = nvme_ns(n, nsid);
3648 if (unlikely(!req->ns)) {
3649 return NVME_INVALID_FIELD | NVME_DNR;
3650 }
3651
3652 if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
3653 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
3654 return NVME_INVALID_OPCODE | NVME_DNR;
3655 }
3656
3657 status = nvme_ns_status(req->ns);
3658 if (unlikely(status)) {
3659 return status;
3660 }
3661
3662 switch (req->cmd.opcode) {
3663 case NVME_CMD_WRITE_ZEROES:
3664 return nvme_write_zeroes(n, req);
3665 case NVME_CMD_ZONE_APPEND:
3666 return nvme_zone_append(n, req);
3667 case NVME_CMD_WRITE:
3668 return nvme_write(n, req);
3669 case NVME_CMD_READ:
3670 return nvme_read(n, req);
3671 case NVME_CMD_COMPARE:
3672 return nvme_compare(n, req);
3673 case NVME_CMD_DSM:
3674 return nvme_dsm(n, req);
3675 case NVME_CMD_VERIFY:
3676 return nvme_verify(n, req);
3677 case NVME_CMD_COPY:
3678 return nvme_copy(n, req);
3679 case NVME_CMD_ZONE_MGMT_SEND:
3680 return nvme_zone_mgmt_send(n, req);
3681 case NVME_CMD_ZONE_MGMT_RECV:
3682 return nvme_zone_mgmt_recv(n, req);
3683 default:
3684 assert(false);
3685 }
3686
3687 return NVME_INVALID_OPCODE | NVME_DNR;
3688}
3689
3690static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
3691{
3692 n->sq[sq->sqid] = NULL;
3693 timer_free(sq->timer);
3694 g_free(sq->io_req);
3695 if (sq->sqid) {
3696 g_free(sq);
3697 }
3698}
3699
3700static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
3701{
3702 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3703 NvmeRequest *r, *next;
3704 NvmeSQueue *sq;
3705 NvmeCQueue *cq;
3706 uint16_t qid = le16_to_cpu(c->qid);
3707 uint32_t nsid;
3708
3709 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
3710 trace_pci_nvme_err_invalid_del_sq(qid);
3711 return NVME_INVALID_QID | NVME_DNR;
3712 }
3713
3714 trace_pci_nvme_del_sq(qid);
3715
3716 sq = n->sq[qid];
3717 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
3718 r = QTAILQ_FIRST(&sq->out_req_list);
3719 if (r->aiocb) {
3720 blk_aio_cancel(r->aiocb);
3721 }
3722 }
3723
    /*
     * Drain all namespaces if there are still outstanding requests that we
     * could not cancel explicitly.
     */
3728 if (!QTAILQ_EMPTY(&sq->out_req_list)) {
3729 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
3730 NvmeNamespace *ns = nvme_ns(n, nsid);
3731 if (ns) {
3732 nvme_ns_drain(ns);
3733 }
3734 }
3735 }
3736
3737 assert(QTAILQ_EMPTY(&sq->out_req_list));
3738
3739 if (!nvme_check_cqid(n, sq->cqid)) {
3740 cq = n->cq[sq->cqid];
3741 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
3742
3743 nvme_post_cqes(cq);
3744 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
3745 if (r->sq == sq) {
3746 QTAILQ_REMOVE(&cq->req_list, r, entry);
3747 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
3748 }
3749 }
3750 }
3751
3752 nvme_free_sq(sq, n);
3753 return NVME_SUCCESS;
3754}
3755
3756static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
3757 uint16_t sqid, uint16_t cqid, uint16_t size)
3758{
3759 int i;
3760 NvmeCQueue *cq;
3761
3762 sq->ctrl = n;
3763 sq->dma_addr = dma_addr;
3764 sq->sqid = sqid;
3765 sq->size = size;
3766 sq->cqid = cqid;
3767 sq->head = sq->tail = 0;
3768 sq->io_req = g_new0(NvmeRequest, sq->size);
3769
3770 QTAILQ_INIT(&sq->req_list);
3771 QTAILQ_INIT(&sq->out_req_list);
3772 for (i = 0; i < sq->size; i++) {
3773 sq->io_req[i].sq = sq;
3774 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
3775 }
3776 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
3777
3778 assert(n->cq[cqid]);
3779 cq = n->cq[cqid];
3780 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
3781 n->sq[sqid] = sq;
3782}
3783
3784static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
3785{
3786 NvmeSQueue *sq;
3787 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
3788
3789 uint16_t cqid = le16_to_cpu(c->cqid);
3790 uint16_t sqid = le16_to_cpu(c->sqid);
3791 uint16_t qsize = le16_to_cpu(c->qsize);
3792 uint16_t qflags = le16_to_cpu(c->sq_flags);
3793 uint64_t prp1 = le64_to_cpu(c->prp1);
3794
3795 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
3796
3797 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
3798 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
3799 return NVME_INVALID_CQID | NVME_DNR;
3800 }
3801 if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
3802 n->sq[sqid] != NULL)) {
3803 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
3804 return NVME_INVALID_QID | NVME_DNR;
3805 }
3806 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
3807 trace_pci_nvme_err_invalid_create_sq_size(qsize);
3808 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
3809 }
3810 if (unlikely(prp1 & (n->page_size - 1))) {
3811 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
3812 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
3813 }
3814 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
3815 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
3816 return NVME_INVALID_FIELD | NVME_DNR;
3817 }
3818 sq = g_malloc0(sizeof(*sq));
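    /* QSIZE is a 0's based value, hence the + 1 */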
3819 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
3820 return NVME_SUCCESS;
3821}
3822
3823struct nvme_stats {
3824 uint64_t units_read;
3825 uint64_t units_written;
3826 uint64_t read_commands;
3827 uint64_t write_commands;
3828};
3829
3830static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
3831{
3832 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
3833
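    /* convert the accumulated byte counts into 512 byte units */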
3834 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
3835 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
3836 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
3837 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
3838}
3839
3840static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3841 uint64_t off, NvmeRequest *req)
3842{
3843 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3844 struct nvme_stats stats = { 0 };
3845 NvmeSmartLog smart = { 0 };
3846 uint32_t trans_len;
3847 NvmeNamespace *ns;
3848 time_t current_ms;
3849
3850 if (off >= sizeof(smart)) {
3851 return NVME_INVALID_FIELD | NVME_DNR;
3852 }
3853
3854 if (nsid != 0xffffffff) {
3855 ns = nvme_ns(n, nsid);
3856 if (!ns) {
3857 return NVME_INVALID_NSID | NVME_DNR;
3858 }
3859 nvme_set_blk_stats(ns, &stats);
3860 } else {
3861 int i;
3862
3863 for (i = 1; i <= n->num_namespaces; i++) {
3864 ns = nvme_ns(n, i);
3865 if (!ns) {
3866 continue;
3867 }
3868 nvme_set_blk_stats(ns, &stats);
3869 }
3870 }
3871
3872 trans_len = MIN(sizeof(smart) - off, buf_len);
3873 smart.critical_warning = n->smart_critical_warning;
3874
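    /*
     * The SMART log reports data units in thousands of 512 byte units,
     * rounded up.
     */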
3875 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
3876 1000));
3877 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
3878 1000));
3879 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
3880 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
3881
3882 smart.temperature = cpu_to_le16(n->temperature);
3883
3884 if ((n->temperature >= n->features.temp_thresh_hi) ||
3885 (n->temperature <= n->features.temp_thresh_low)) {
3886 smart.critical_warning |= NVME_SMART_TEMPERATURE;
3887 }
3888
3889 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
3890 smart.power_on_hours[0] =
3891 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
3892
3893 if (!rae) {
3894 nvme_clear_events(n, NVME_AER_TYPE_SMART);
3895 }
3896
3897 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
3898}
3899
3900static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
3901 NvmeRequest *req)
3902{
3903 uint32_t trans_len;
3904 NvmeFwSlotInfoLog fw_log = {
3905 .afi = 0x1,
3906 };
3907
3908 if (off >= sizeof(fw_log)) {
3909 return NVME_INVALID_FIELD | NVME_DNR;
3910 }
3911
3912 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
3913 trans_len = MIN(sizeof(fw_log) - off, buf_len);
3914
3915 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
3916}
3917
3918static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3919 uint64_t off, NvmeRequest *req)
3920{
3921 uint32_t trans_len;
3922 NvmeErrorLog errlog;
3923
3924 if (off >= sizeof(errlog)) {
3925 return NVME_INVALID_FIELD | NVME_DNR;
3926 }
3927
3928 if (!rae) {
3929 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
3930 }
3931
3932 memset(&errlog, 0x0, sizeof(errlog));
3933 trans_len = MIN(sizeof(errlog) - off, buf_len);
3934
3935 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
3936}
3937
3938static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3939 uint64_t off, NvmeRequest *req)
3940{
3941 uint32_t nslist[1024];
3942 uint32_t trans_len;
3943 int i = 0;
3944 uint32_t nsid;
3945
3946 memset(nslist, 0x0, sizeof(nslist));
3947 trans_len = MIN(sizeof(nslist) - off, buf_len);
3948
3949 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
3950 NVME_CHANGED_NSID_SIZE) {
        /*
         * If more than 1024 namespaces have changed, the first entry in the
         * log page is set to 0xffffffff and the remainder of the list is
         * zeroed.
         */
3955 if (i == ARRAY_SIZE(nslist)) {
3956 memset(nslist, 0x0, sizeof(nslist));
3957 nslist[0] = 0xffffffff;
3958 break;
3959 }
3960
3961 nslist[i++] = nsid;
3962 clear_bit(nsid, n->changed_nsids);
3963 }
3964
    /*
     * If the list was truncated above, clear the entire bitmap of changed
     * namespaces so that stale entries are not reported again later.
     */
3969 if (nslist[0] == 0xffffffff) {
3970 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
3971 }
3972
3973 if (!rae) {
3974 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
3975 }
3976
3977 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
3978}
3979
3980static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
3981 uint64_t off, NvmeRequest *req)
3982{
3983 NvmeEffectsLog log = {};
3984 const uint32_t *src_iocs = NULL;
3985 uint32_t trans_len;
3986
3987 if (off >= sizeof(log)) {
3988 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
3989 return NVME_INVALID_FIELD | NVME_DNR;
3990 }
3991
3992 switch (NVME_CC_CSS(n->bar.cc)) {
3993 case NVME_CC_CSS_NVM:
3994 src_iocs = nvme_cse_iocs_nvm;
        /* fall through */
3996 case NVME_CC_CSS_ADMIN_ONLY:
3997 break;
3998 case NVME_CC_CSS_CSI:
3999 switch (csi) {
4000 case NVME_CSI_NVM:
4001 src_iocs = nvme_cse_iocs_nvm;
4002 break;
4003 case NVME_CSI_ZONED:
4004 src_iocs = nvme_cse_iocs_zoned;
4005 break;
4006 }
4007 }
4008
4009 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4010
4011 if (src_iocs) {
4012 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4013 }
4014
4015 trans_len = MIN(sizeof(log) - off, buf_len);
4016
4017 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4018}
4019
4020static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4021{
4022 NvmeCmd *cmd = &req->cmd;
4023
4024 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4025 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4026 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4027 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4028 uint8_t lid = dw10 & 0xff;
4029 uint8_t lsp = (dw10 >> 8) & 0xf;
4030 uint8_t rae = (dw10 >> 15) & 0x1;
4031 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
4032 uint32_t numdl, numdu;
4033 uint64_t off, lpol, lpou;
4034 size_t len;
4035 uint16_t status;
4036
4037 numdl = (dw10 >> 16);
4038 numdu = (dw11 & 0xffff);
4039 lpol = dw12;
4040 lpou = dw13;
4041
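    /*
     * NUMD is a 0's based dword count split across NUMDL (CDW10) and NUMDU
     * (CDW11); LPOL and LPOU together form the 64-bit byte offset into the
     * log page.
     */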
4042 len = (((numdu << 16) | numdl) + 1) << 2;
4043 off = (lpou << 32ULL) | lpol;
4044
4045 if (off & 0x3) {
4046 return NVME_INVALID_FIELD | NVME_DNR;
4047 }
4048
4049 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4050
4051 status = nvme_check_mdts(n, len);
4052 if (status) {
4053 return status;
4054 }
4055
4056 switch (lid) {
4057 case NVME_LOG_ERROR_INFO:
4058 return nvme_error_info(n, rae, len, off, req);
4059 case NVME_LOG_SMART_INFO:
4060 return nvme_smart_info(n, rae, len, off, req);
4061 case NVME_LOG_FW_SLOT_INFO:
4062 return nvme_fw_log_info(n, len, off, req);
4063 case NVME_LOG_CHANGED_NSLIST:
4064 return nvme_changed_nslist(n, rae, len, off, req);
4065 case NVME_LOG_CMD_EFFECTS:
4066 return nvme_cmd_effects(n, csi, len, off, req);
4067 default:
4068 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4069 return NVME_INVALID_FIELD | NVME_DNR;
4070 }
4071}
4072
4073static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4074{
4075 n->cq[cq->cqid] = NULL;
4076 timer_free(cq->timer);
4077 if (msix_enabled(&n->parent_obj)) {
4078 msix_vector_unuse(&n->parent_obj, cq->vector);
4079 }
4080 if (cq->cqid) {
4081 g_free(cq);
4082 }
4083}
4084
4085static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4086{
4087 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4088 NvmeCQueue *cq;
4089 uint16_t qid = le16_to_cpu(c->qid);
4090
4091 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4092 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4093 return NVME_INVALID_CQID | NVME_DNR;
4094 }
4095
4096 cq = n->cq[qid];
4097 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4098 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4099 return NVME_INVALID_QUEUE_DEL;
4100 }
4101
4102 if (cq->irq_enabled && cq->tail != cq->head) {
4103 n->cq_pending--;
4104 }
4105
4106 nvme_irq_deassert(n, cq);
4107 trace_pci_nvme_del_cq(qid);
4108 nvme_free_cq(cq, n);
4109 return NVME_SUCCESS;
4110}
4111
4112static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4113 uint16_t cqid, uint16_t vector, uint16_t size,
4114 uint16_t irq_enabled)
4115{
4116 int ret;
4117
4118 if (msix_enabled(&n->parent_obj)) {
4119 ret = msix_vector_use(&n->parent_obj, vector);
4120 assert(ret == 0);
4121 }
4122 cq->ctrl = n;
4123 cq->cqid = cqid;
4124 cq->size = size;
4125 cq->dma_addr = dma_addr;
4126 cq->phase = 1;
4127 cq->irq_enabled = irq_enabled;
4128 cq->vector = vector;
4129 cq->head = cq->tail = 0;
4130 QTAILQ_INIT(&cq->req_list);
4131 QTAILQ_INIT(&cq->sq_list);
4132 n->cq[cqid] = cq;
4133 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4134}
4135
4136static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4137{
4138 NvmeCQueue *cq;
4139 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4140 uint16_t cqid = le16_to_cpu(c->cqid);
4141 uint16_t vector = le16_to_cpu(c->irq_vector);
4142 uint16_t qsize = le16_to_cpu(c->qsize);
4143 uint16_t qflags = le16_to_cpu(c->cq_flags);
4144 uint64_t prp1 = le64_to_cpu(c->prp1);
4145
4146 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4147 NVME_CQ_FLAGS_IEN(qflags) != 0);
4148
4149 if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4150 n->cq[cqid] != NULL)) {
4151 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4152 return NVME_INVALID_QID | NVME_DNR;
4153 }
4154 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
4155 trace_pci_nvme_err_invalid_create_cq_size(qsize);
4156 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4157 }
4158 if (unlikely(prp1 & (n->page_size - 1))) {
4159 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4160 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4161 }
4162 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4163 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4164 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4165 }
4166 if (unlikely(vector >= n->params.msix_qsize)) {
4167 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4168 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4169 }
4170 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4171 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4172 return NVME_INVALID_FIELD | NVME_DNR;
4173 }
4174
4175 cq = g_malloc0(sizeof(*cq));
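    /* QSIZE is a 0's based value, hence the + 1 */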
4176 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4177 NVME_CQ_FLAGS_IEN(qflags));
4178
    /*
     * It is only required to set qs_created when creating a completion queue;
     * creating a submission queue without a matching completion queue will
     * fail.
     */
4184 n->qs_created = true;
4185 return NVME_SUCCESS;
4186}
4187
4188static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4189{
4190 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4191
4192 return nvme_c2h(n, id, sizeof(id), req);
4193}
4194
4195static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
4196{
4197 switch (ns->csi) {
4198 case NVME_CSI_NVM:
4199 case NVME_CSI_ZONED:
4200 return true;
4201 }
4202 return false;
4203}
4204
4205static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4206{
4207 trace_pci_nvme_identify_ctrl();
4208
4209 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4210}
4211
4212static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4213{
4214 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4215 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4216 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4217
4218 trace_pci_nvme_identify_ctrl_csi(c->csi);
4219
4220 switch (c->csi) {
4221 case NVME_CSI_NVM:
4222 id_nvm->vsl = n->params.vsl;
4223 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4224 break;
4225
4226 case NVME_CSI_ZONED:
4227 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4228 break;
4229
4230 default:
4231 return NVME_INVALID_FIELD | NVME_DNR;
4232 }
4233
4234 return nvme_c2h(n, id, sizeof(id), req);
4235}
4236
4237static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4238{
4239 NvmeNamespace *ns;
4240 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4241 uint32_t nsid = le32_to_cpu(c->nsid);
4242
4243 trace_pci_nvme_identify_ns(nsid);
4244
4245 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4246 return NVME_INVALID_NSID | NVME_DNR;
4247 }
4248
4249 ns = nvme_ns(n, nsid);
4250 if (unlikely(!ns)) {
4251 if (!active) {
4252 ns = nvme_subsys_ns(n->subsys, nsid);
4253 if (!ns) {
4254 return nvme_rpt_empty_id_struct(n, req);
4255 }
4256 } else {
4257 return nvme_rpt_empty_id_struct(n, req);
4258 }
4259 }
4260
4261 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
4262 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4263 }
4264
4265 return NVME_INVALID_CMD_SET | NVME_DNR;
4266}
4267
4268static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
4269{
4270 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4271 uint16_t min_id = le16_to_cpu(c->ctrlid);
4272 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4273 uint16_t *ids = &list[1];
4274 NvmeNamespace *ns;
4275 NvmeCtrl *ctrl;
4276 int cntlid, nr_ids = 0;
4277
4278 trace_pci_nvme_identify_ns_attached_list(min_id);
4279
4280 if (c->nsid == NVME_NSID_BROADCAST) {
4281 return NVME_INVALID_FIELD | NVME_DNR;
4282 }
4283
4284 ns = nvme_subsys_ns(n->subsys, c->nsid);
4285 if (!ns) {
4286 return NVME_INVALID_FIELD | NVME_DNR;
4287 }
4288
4289 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4290 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4291 if (!ctrl) {
4292 continue;
4293 }
4294
4295 if (!nvme_ns(ctrl, c->nsid)) {
4296 continue;
4297 }
4298
4299 ids[nr_ids++] = cntlid;
4300 }
4301
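    /* the first entry of the list holds the number of identifiers returned */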
4302 list[0] = nr_ids;
4303
4304 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4305}
4306
4307static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4308 bool active)
4309{
4310 NvmeNamespace *ns;
4311 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4312 uint32_t nsid = le32_to_cpu(c->nsid);
4313
4314 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4315
4316 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4317 return NVME_INVALID_NSID | NVME_DNR;
4318 }
4319
4320 ns = nvme_ns(n, nsid);
4321 if (unlikely(!ns)) {
4322 if (!active) {
4323 ns = nvme_subsys_ns(n->subsys, nsid);
4324 if (!ns) {
4325 return nvme_rpt_empty_id_struct(n, req);
4326 }
4327 } else {
4328 return nvme_rpt_empty_id_struct(n, req);
4329 }
4330 }
4331
4332 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
4333 return nvme_rpt_empty_id_struct(n, req);
4334 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4335 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4336 req);
4337 }
4338
4339 return NVME_INVALID_FIELD | NVME_DNR;
4340}
4341
4342static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4343 bool active)
4344{
4345 NvmeNamespace *ns;
4346 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4347 uint32_t min_nsid = le32_to_cpu(c->nsid);
4348 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4349 static const int data_len = sizeof(list);
4350 uint32_t *list_ptr = (uint32_t *)list;
4351 int i, j = 0;
4352
4353 trace_pci_nvme_identify_nslist(min_nsid);
4354
    /*
     * Both 0xffffffff (NVME_NSID_BROADCAST) and 0xfffffffe are invalid values
     * for the minimum namespace identifier: the Active Namespace ID List must
     * return namespaces with identifiers strictly greater than it, and no
     * namespace with an identifier that large can exist.
     */
4361 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4362 return NVME_INVALID_NSID | NVME_DNR;
4363 }
4364
4365 for (i = 1; i <= n->num_namespaces; i++) {
4366 ns = nvme_ns(n, i);
4367 if (!ns) {
4368 if (!active) {
4369 ns = nvme_subsys_ns(n->subsys, i);
4370 if (!ns) {
4371 continue;
4372 }
4373 } else {
4374 continue;
4375 }
4376 }
4377 if (ns->params.nsid <= min_nsid) {
4378 continue;
4379 }
4380 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4381 if (j == data_len / sizeof(uint32_t)) {
4382 break;
4383 }
4384 }
4385
4386 return nvme_c2h(n, list, data_len, req);
4387}
4388
4389static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4390 bool active)
4391{
4392 NvmeNamespace *ns;
4393 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4394 uint32_t min_nsid = le32_to_cpu(c->nsid);
4395 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4396 static const int data_len = sizeof(list);
4397 uint32_t *list_ptr = (uint32_t *)list;
4398 int i, j = 0;
4399
4400 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4401
    /*
     * Same restriction on the minimum namespace identifier as in
     * nvme_identify_nslist().
     */
4405 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4406 return NVME_INVALID_NSID | NVME_DNR;
4407 }
4408
4409 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4410 return NVME_INVALID_FIELD | NVME_DNR;
4411 }
4412
4413 for (i = 1; i <= n->num_namespaces; i++) {
4414 ns = nvme_ns(n, i);
4415 if (!ns) {
4416 if (!active) {
4417 ns = nvme_subsys_ns(n->subsys, i);
4418 if (!ns) {
4419 continue;
4420 }
4421 } else {
4422 continue;
4423 }
4424 }
4425 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4426 continue;
4427 }
4428 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4429 if (j == data_len / sizeof(uint32_t)) {
4430 break;
4431 }
4432 }
4433
4434 return nvme_c2h(n, list, data_len, req);
4435}
4436
4437static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4438{
4439 NvmeNamespace *ns;
4440 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4441 uint32_t nsid = le32_to_cpu(c->nsid);
4442 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4443
4444 struct data {
4445 struct {
4446 NvmeIdNsDescr hdr;
4447 uint8_t v[NVME_NIDL_UUID];
4448 } uuid;
4449 struct {
4450 NvmeIdNsDescr hdr;
4451 uint8_t v;
4452 } csi;
4453 };
4454
4455 struct data *ns_descrs = (struct data *)list;
4456
4457 trace_pci_nvme_identify_ns_descr_list(nsid);
4458
4459 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4460 return NVME_INVALID_NSID | NVME_DNR;
4461 }
4462
4463 ns = nvme_ns(n, nsid);
4464 if (unlikely(!ns)) {
4465 return NVME_INVALID_FIELD | NVME_DNR;
4466 }
4467
    /*
     * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
     * structure, a Namespace UUID is provided in the Namespace Identification
     * Descriptor list.
     */
4473 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
4474 ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
4475 memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4476
4477 ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI;
4478 ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI;
4479 ns_descrs->csi.v = ns->csi;
4480
4481 return nvme_c2h(n, list, sizeof(list), req);
4482}
4483
4484static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4485{
4486 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4487 static const int data_len = sizeof(list);
4488
4489 trace_pci_nvme_identify_cmd_set();
4490
4491 NVME_SET_CSI(*list, NVME_CSI_NVM);
4492 NVME_SET_CSI(*list, NVME_CSI_ZONED);
4493
4494 return nvme_c2h(n, list, data_len, req);
4495}
4496
4497static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4498{
4499 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4500
4501 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4502 c->csi);
4503
4504 switch (c->cns) {
4505 case NVME_ID_CNS_NS:
4506 return nvme_identify_ns(n, req, true);
4507 case NVME_ID_CNS_NS_PRESENT:
4508 return nvme_identify_ns(n, req, false);
4509 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4510 return nvme_identify_ns_attached_list(n, req);
4511 case NVME_ID_CNS_CS_NS:
4512 return nvme_identify_ns_csi(n, req, true);
4513 case NVME_ID_CNS_CS_NS_PRESENT:
4514 return nvme_identify_ns_csi(n, req, false);
4515 case NVME_ID_CNS_CTRL:
4516 return nvme_identify_ctrl(n, req);
4517 case NVME_ID_CNS_CS_CTRL:
4518 return nvme_identify_ctrl_csi(n, req);
4519 case NVME_ID_CNS_NS_ACTIVE_LIST:
4520 return nvme_identify_nslist(n, req, true);
4521 case NVME_ID_CNS_NS_PRESENT_LIST:
4522 return nvme_identify_nslist(n, req, false);
4523 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4524 return nvme_identify_nslist_csi(n, req, true);
4525 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4526 return nvme_identify_nslist_csi(n, req, false);
4527 case NVME_ID_CNS_NS_DESCR_LIST:
4528 return nvme_identify_ns_descr_list(n, req);
4529 case NVME_ID_CNS_IO_COMMAND_SET:
4530 return nvme_identify_cmd_set(n, req);
4531 default:
4532 trace_pci_nvme_err_invalid_identify_cns(c->cns);
4533 return NVME_INVALID_FIELD | NVME_DNR;
4534 }
4535}
4536
4537static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4538{
4539 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4540
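/*
 * Commands are completed before they can ever be targeted by an Abort, so
 * there is never anything in flight to abort; CDW0 bit 0 is set to report
 * that the specified command was not aborted.
 */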
4541 req->cqe.result = 1;
4542 if (nvme_check_sqid(n, sqid)) {
4543 return NVME_INVALID_FIELD | NVME_DNR;
4544 }
4545
4546 return NVME_SUCCESS;
4547}
4548
4549static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4550{
4551 trace_pci_nvme_setfeat_timestamp(ts);
4552
4553 n->host_timestamp = le64_to_cpu(ts);
4554 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4555}
4556
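/*
 * The reported timestamp is the host-supplied base value plus the
 * milliseconds of virtual-clock time elapsed since it was last set.
 */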
4557static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4558{
4559 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4560 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4561
4562 union nvme_timestamp {
4563 struct {
4564 uint64_t timestamp:48;
4565 uint64_t sync:1;
4566 uint64_t origin:3;
4567 uint64_t rsvd1:12;
4568 };
4569 uint64_t all;
4570 };
4571
4572 union nvme_timestamp ts;
4573 ts.all = 0;
4574 ts.timestamp = n->host_timestamp + elapsed_time;
4575
4576 /* If the host timestamp is non-zero, set the timestamp origin */
4577 ts.origin = n->host_timestamp ? 0x01 : 0x00;
4578
4579 trace_pci_nvme_getfeat_timestamp(ts.all);
4580
4581 return cpu_to_le64(ts.all);
4582}
4583
4584static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4585{
4586 uint64_t timestamp = nvme_get_timestamp(n);
4587
4588 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4589}
4590
4591static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4592{
4593 NvmeCmd *cmd = &req->cmd;
4594 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4595 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4596 uint32_t nsid = le32_to_cpu(cmd->nsid);
4597 uint32_t result;
4598 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4599 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4600 uint16_t iv;
4601 NvmeNamespace *ns;
4602 int i;
4603
4604 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4605 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4606 };
4607
4608 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4609
4610 if (!nvme_feature_support[fid]) {
4611 return NVME_INVALID_FIELD | NVME_DNR;
4612 }
4613
4614 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4615 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4616 /*
4617  * The Reservation Notification Mask and Reservation Persistence
4618  * features require a status code of Invalid Field in Command when
4619  * NSID is FFFFFFFFh. Since the device does not support those
4620  * features, we can always return Invalid Namespace or Format, as we
4621  * are not an associated controller.
4622  */
4623 return NVME_INVALID_NSID | NVME_DNR;
4624 }
4625
4626 if (!nvme_ns(n, nsid)) {
4627 return NVME_INVALID_FIELD | NVME_DNR;
4628 }
4629 }
4630
4631 switch (sel) {
4632 case NVME_GETFEAT_SELECT_CURRENT:
4633 break;
4634 case NVME_GETFEAT_SELECT_SAVED:
4635 /* fall through */
4636 case NVME_GETFEAT_SELECT_DEFAULT:
4637 goto defaults;
4638 case NVME_GETFEAT_SELECT_CAP:
4639 result = nvme_feature_cap[fid];
4640 goto out;
4641 }
4642
4643 switch (fid) {
4644 case NVME_TEMPERATURE_THRESHOLD:
4645 result = 0;
4646
4647 /*
4648  * The controller only implements the Composite Temperature sensor, so
4649  * return 0 for all other sensors.
4650  */
4651 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4652 goto out;
4653 }
4654
4655 switch (NVME_TEMP_THSEL(dw11)) {
4656 case NVME_TEMP_THSEL_OVER:
4657 result = n->features.temp_thresh_hi;
4658 goto out;
4659 case NVME_TEMP_THSEL_UNDER:
4660 result = n->features.temp_thresh_low;
4661 goto out;
4662 }
4663
4664 return NVME_INVALID_FIELD | NVME_DNR;
4665 case NVME_ERROR_RECOVERY:
4666 if (!nvme_nsid_valid(n, nsid)) {
4667 return NVME_INVALID_NSID | NVME_DNR;
4668 }
4669
4670 ns = nvme_ns(n, nsid);
4671 if (unlikely(!ns)) {
4672 return NVME_INVALID_FIELD | NVME_DNR;
4673 }
4674
4675 result = ns->features.err_rec;
4676 goto out;
4677 case NVME_VOLATILE_WRITE_CACHE:
4678 result = 0;
4679 for (i = 1; i <= n->num_namespaces; i++) {
4680 ns = nvme_ns(n, i);
4681 if (!ns) {
4682 continue;
4683 }
4684
4685 result = blk_enable_write_cache(ns->blkconf.blk);
4686 if (result) {
4687 break;
4688 }
4689 }
4690 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4691 goto out;
4692 case NVME_ASYNCHRONOUS_EVENT_CONF:
4693 result = n->features.async_config;
4694 goto out;
4695 case NVME_TIMESTAMP:
4696 return nvme_get_feature_timestamp(n, req);
4697 default:
4698 break;
4699 }
4700
4701defaults:
4702 switch (fid) {
4703 case NVME_TEMPERATURE_THRESHOLD:
4704 result = 0;
4705
4706 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4707 break;
4708 }
4709
4710 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4711 result = NVME_TEMPERATURE_WARNING;
4712 }
4713
4714 break;
4715 case NVME_NUMBER_OF_QUEUES:
4716 result = (n->params.max_ioqpairs - 1) |
4717 ((n->params.max_ioqpairs - 1) << 16);
4718 trace_pci_nvme_getfeat_numq(result);
4719 break;
4720 case NVME_INTERRUPT_VECTOR_CONF:
4721 iv = dw11 & 0xffff;
4722 if (iv >= n->params.max_ioqpairs + 1) {
4723 return NVME_INVALID_FIELD | NVME_DNR;
4724 }
4725
4726 result = iv;
4727 if (iv == n->admin_cq.vector) {
4728 result |= NVME_INTVC_NOCOALESCING;
4729 }
4730 break;
4731 case NVME_COMMAND_SET_PROFILE:
4732 result = 0;
4733 break;
4734 default:
4735 result = nvme_feature_default[fid];
4736 break;
4737 }
4738
4739out:
4740 req->cqe.result = cpu_to_le32(result);
4741 return NVME_SUCCESS;
4742}
4743
4744static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4745{
4746 uint16_t ret;
4747 uint64_t timestamp;
4748
4749 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4750 if (ret) {
4751 return ret;
4752 }
4753
4754 nvme_set_timestamp(n, timestamp);
4755
4756 return NVME_SUCCESS;
4757}
4758
4759static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4760{
4761 NvmeNamespace *ns = NULL;
4762
4763 NvmeCmd *cmd = &req->cmd;
4764 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4765 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4766 uint32_t nsid = le32_to_cpu(cmd->nsid);
4767 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4768 uint8_t save = NVME_SETFEAT_SAVE(dw10);
4769 int i;
4770
4771 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
4772
4773 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
4774 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
4775 }
4776
4777 if (!nvme_feature_support[fid]) {
4778 return NVME_INVALID_FIELD | NVME_DNR;
4779 }
4780
4781 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4782 if (nsid != NVME_NSID_BROADCAST) {
4783 if (!nvme_nsid_valid(n, nsid)) {
4784 return NVME_INVALID_NSID | NVME_DNR;
4785 }
4786
4787 ns = nvme_ns(n, nsid);
4788 if (unlikely(!ns)) {
4789 return NVME_INVALID_FIELD | NVME_DNR;
4790 }
4791 }
4792 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
4793 if (!nvme_nsid_valid(n, nsid)) {
4794 return NVME_INVALID_NSID | NVME_DNR;
4795 }
4796
4797 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
4798 }
4799
4800 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
4801 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
4802 }
4803
4804 switch (fid) {
4805 case NVME_TEMPERATURE_THRESHOLD:
4806 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4807 break;
4808 }
4809
4810 switch (NVME_TEMP_THSEL(dw11)) {
4811 case NVME_TEMP_THSEL_OVER:
4812 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
4813 break;
4814 case NVME_TEMP_THSEL_UNDER:
4815 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
4816 break;
4817 default:
4818 return NVME_INVALID_FIELD | NVME_DNR;
4819 }
4820
4821 if ((n->temperature >= n->features.temp_thresh_hi) ||
4822 (n->temperature <= n->features.temp_thresh_low)) {
4823 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
4824 }
4825
4826 break;
4827 case NVME_ERROR_RECOVERY:
4828 if (nsid == NVME_NSID_BROADCAST) {
4829 for (i = 1; i <= n->num_namespaces; i++) {
4830 ns = nvme_ns(n, i);
4831
4832 if (!ns) {
4833 continue;
4834 }
4835
4836 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
4837 ns->features.err_rec = dw11;
4838 }
4839 }
4840
4841 break;
4842 }
4843
4844 assert(ns);
4845 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
4846 ns->features.err_rec = dw11;
4847 }
4848 break;
4849 case NVME_VOLATILE_WRITE_CACHE:
4850 for (i = 1; i <= n->num_namespaces; i++) {
4851 ns = nvme_ns(n, i);
4852 if (!ns) {
4853 continue;
4854 }
4855
4856 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
4857 blk_flush(ns->blkconf.blk);
4858 }
4859
4860 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
4861 }
4862
4863 break;
4864
4865 case NVME_NUMBER_OF_QUEUES:
4866 if (n->qs_created) {
4867 return NVME_CMD_SEQ_ERROR | NVME_DNR;
4868 }
4869
4870 /*
4871  * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for NCQR
4872  * and NSQR.
4873  */
4874 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
4875 return NVME_INVALID_FIELD | NVME_DNR;
4876 }
4877
4878 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
4879 ((dw11 >> 16) & 0xffff) + 1,
4880 n->params.max_ioqpairs,
4881 n->params.max_ioqpairs);
4882 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
4883 ((n->params.max_ioqpairs - 1) << 16));
4884 break;
4885 case NVME_ASYNCHRONOUS_EVENT_CONF:
4886 n->features.async_config = dw11;
4887 break;
4888 case NVME_TIMESTAMP:
4889 return nvme_set_feature_timestamp(n, req);
4890 case NVME_COMMAND_SET_PROFILE:
4891 if (dw11 & 0x1ff) {
4892 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
4893 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
4894 }
4895 break;
4896 default:
4897 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
4898 }
4899 return NVME_SUCCESS;
4900}
4901
4902static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
4903{
4904 trace_pci_nvme_aer(nvme_cid(req));
4905
4906 if (n->outstanding_aers > n->params.aerl) {
4907 trace_pci_nvme_aer_aerl_exceeded();
4908 return NVME_AER_LIMIT_EXCEEDED;
4909 }
4910
4911 n->aer_reqs[n->outstanding_aers] = req;
4912 n->outstanding_aers++;
4913
4914 if (!QTAILQ_EMPTY(&n->aer_queue)) {
4915 nvme_process_aers(n);
4916 }
4917
4918 return NVME_NO_COMPLETE;
4919}
4920
4921static void nvme_update_dmrsl(NvmeCtrl *n)
4922{
4923 int nsid;
4924
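/* recompute DMRSL as the smallest per-namespace limit that remains */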
4925 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
4926 NvmeNamespace *ns = nvme_ns(n, nsid);
4927 if (!ns) {
4928 continue;
4929 }
4930
4931 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
4932 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
4933 }
4934}
4935
4936static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns);
4937static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
4938{
4939 NvmeNamespace *ns;
4940 NvmeCtrl *ctrl;
4941 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4942 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4943 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
4944 bool attach = !(dw10 & 0xf);
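/* SEL (CDW10[3:0]): 0x0 attaches the namespace, 0x1 detaches it */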
4945 uint16_t *nr_ids = &list[0];
4946 uint16_t *ids = &list[1];
4947 uint16_t ret;
4948 int i;
4949
4950 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
4951
4952 if (!nvme_nsid_valid(n, nsid)) {
4953 return NVME_INVALID_NSID | NVME_DNR;
4954 }
4955
4956 ns = nvme_subsys_ns(n->subsys, nsid);
4957 if (!ns) {
4958 return NVME_INVALID_FIELD | NVME_DNR;
4959 }
4960
4961 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
4962 if (ret) {
4963 return ret;
4964 }
4965
4966 if (!*nr_ids) {
4967 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4968 }
4969
4970 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
4971 for (i = 0; i < *nr_ids; i++) {
4972 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
4973 if (!ctrl) {
4974 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4975 }
4976
4977 if (attach) {
4978 if (nvme_ns(ctrl, nsid)) {
4979 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
4980 }
4981
4982 if (ns->attached && !ns->params.shared) {
4983 return NVME_NS_PRIVATE | NVME_DNR;
4984 }
4985
4986 nvme_attach_ns(ctrl, ns);
4987 __nvme_select_ns_iocs(ctrl, ns);
4988 } else {
4989 if (!nvme_ns(ctrl, nsid)) {
4990 return NVME_NS_NOT_ATTACHED | NVME_DNR;
4991 }
4992
4993 ctrl->namespaces[nsid - 1] = NULL;
4994 ns->attached--;
4995
4996 nvme_update_dmrsl(ctrl);
4997 }
4998
4999 /*
5000  * Add namespace id to the changed namespace id list for event clearing
5001  * via Get Log Page command.
5002  */
5003 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5004 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5005 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5006 NVME_LOG_CHANGED_NSLIST);
5007 }
5008 }
5009
5010 return NVME_SUCCESS;
5011}
5012
5013static uint16_t nvme_format_ns(NvmeCtrl *n, NvmeNamespace *ns, uint8_t lbaf,
5014 uint8_t mset, uint8_t pi, uint8_t pil,
5015 NvmeRequest *req)
5016{
5017 int64_t len, offset;
5018 struct nvme_aio_format_ctx *ctx;
5019 BlockBackend *blk = ns->blkconf.blk;
5020 uint16_t ms;
5021 uintptr_t *num_formats = (uintptr_t *)&req->opaque;
5022 int *count;
5023
5024 if (ns->params.zoned) {
5025 return NVME_INVALID_FORMAT | NVME_DNR;
5026 }
5027
5028 trace_pci_nvme_format_ns(nvme_cid(req), nvme_nsid(ns), lbaf, mset, pi, pil);
5029
5030 if (lbaf > ns->id_ns.nlbaf) {
5031 return NVME_INVALID_FORMAT | NVME_DNR;
5032 }
5033
5034 ms = ns->id_ns.lbaf[lbaf].ms;
5035
5036 if (pi && (ms < sizeof(NvmeDifTuple))) {
5037 return NVME_INVALID_FORMAT | NVME_DNR;
5038 }
5039
5040 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5041 return NVME_INVALID_FIELD | NVME_DNR;
5042 }
5043
5044 nvme_ns_drain(ns);
5045 nvme_ns_shutdown(ns);
5046 nvme_ns_cleanup(ns);
5047
5048 ns->id_ns.dps = (pil << 3) | pi;
5049 ns->id_ns.flbas = lbaf | (mset << 4);
5050
5051 nvme_ns_init_format(ns);
5052
5053 ns->status = NVME_FORMAT_IN_PROGRESS;
5054
5055 len = ns->size;
5056 offset = 0;
5057
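/*
 * The counter is 1-initialized so the format cannot be considered complete
 * (and the counter freed) while the loop below is still submitting writes.
 */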
5058 count = g_new(int, 1);
5059 *count = 1;
5060
5061 (*num_formats)++;
5062
5063 while (len) {
5064 ctx = g_new(struct nvme_aio_format_ctx, 1);
5065 ctx->req = req;
5066 ctx->ns = ns;
5067 ctx->count = count;
5068
5069 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
5070
5071 (*count)++;
5072
5073 blk_aio_pwrite_zeroes(blk, offset, bytes, BDRV_REQ_MAY_UNMAP,
5074 nvme_aio_format_cb, ctx);
5075
5076 offset += bytes;
5077 len -= bytes;
5078
5079 }
5080
5081 if (--(*count)) {
5082 return NVME_NO_COMPLETE;
5083 }
5084
5085 g_free(count);
5086 ns->status = 0x0;
5087 (*num_formats)--;
5088
5089 return NVME_SUCCESS;
5090}
5091
5092static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5093{
5094 NvmeNamespace *ns;
5095 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5096 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5097 uint8_t lbaf = dw10 & 0xf;
5098 uint8_t mset = (dw10 >> 4) & 0x1;
5099 uint8_t pi = (dw10 >> 5) & 0x7;
5100 uint8_t pil = (dw10 >> 8) & 0x1;
5101 uintptr_t *num_formats = (uintptr_t *)&req->opaque;
5102 uint16_t status;
5103 int i;
5104
5105 trace_pci_nvme_format(nvme_cid(req), nsid, lbaf, mset, pi, pil);
5106
5107 /* 1-initialize; see the comment in nvme_dsm */
5108 *num_formats = 1;
5109
5110 if (nsid != NVME_NSID_BROADCAST) {
5111 if (!nvme_nsid_valid(n, nsid)) {
5112 return NVME_INVALID_NSID | NVME_DNR;
5113 }
5114
5115 ns = nvme_ns(n, nsid);
5116 if (!ns) {
5117 return NVME_INVALID_FIELD | NVME_DNR;
5118 }
5119
5120 status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
5121 if (status && status != NVME_NO_COMPLETE) {
5122 req->status = status;
5123 }
5124 } else {
5125 for (i = 1; i <= n->num_namespaces; i++) {
5126 ns = nvme_ns(n, i);
5127 if (!ns) {
5128 continue;
5129 }
5130
5131 status = nvme_format_ns(n, ns, lbaf, mset, pi, pil, req);
5132 if (status && status != NVME_NO_COMPLETE) {
5133 req->status = status;
5134 break;
5135 }
5136 }
5137 }
5138
5139 /* account for the 1-initialization */
5140 if (--(*num_formats)) {
5141 return NVME_NO_COMPLETE;
5142 }
5143
5144 return req->status;
5145}
5146
5147static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5148{
5149 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5150 nvme_adm_opc_str(req->cmd.opcode));
5151
5152 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5153 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5154 return NVME_INVALID_OPCODE | NVME_DNR;
5155 }
5156
5157 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5158 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5159 return NVME_INVALID_FIELD | NVME_DNR;
5160 }
5161
5162 switch (req->cmd.opcode) {
5163 case NVME_ADM_CMD_DELETE_SQ:
5164 return nvme_del_sq(n, req);
5165 case NVME_ADM_CMD_CREATE_SQ:
5166 return nvme_create_sq(n, req);
5167 case NVME_ADM_CMD_GET_LOG_PAGE:
5168 return nvme_get_log(n, req);
5169 case NVME_ADM_CMD_DELETE_CQ:
5170 return nvme_del_cq(n, req);
5171 case NVME_ADM_CMD_CREATE_CQ:
5172 return nvme_create_cq(n, req);
5173 case NVME_ADM_CMD_IDENTIFY:
5174 return nvme_identify(n, req);
5175 case NVME_ADM_CMD_ABORT:
5176 return nvme_abort(n, req);
5177 case NVME_ADM_CMD_SET_FEATURES:
5178 return nvme_set_feature(n, req);
5179 case NVME_ADM_CMD_GET_FEATURES:
5180 return nvme_get_feature(n, req);
5181 case NVME_ADM_CMD_ASYNC_EV_REQ:
5182 return nvme_aer(n, req);
5183 case NVME_ADM_CMD_NS_ATTACHMENT:
5184 return nvme_ns_attachment(n, req);
5185 case NVME_ADM_CMD_FORMAT_NVM:
5186 return nvme_format(n, req);
5187 default:
5188 assert(false);
5189 }
5190
5191 return NVME_INVALID_OPCODE | NVME_DNR;
5192}
5193
5194static void nvme_process_sq(void *opaque)
5195{
5196 NvmeSQueue *sq = opaque;
5197 NvmeCtrl *n = sq->ctrl;
5198 NvmeCQueue *cq = n->cq[sq->cqid];
5199
5200 uint16_t status;
5201 hwaddr addr;
5202 NvmeCmd cmd;
5203 NvmeRequest *req;
5204
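/*
 * Fetch and execute commands until the submission queue is empty or no
 * free request slots are left on this queue.
 */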
5205 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5206 addr = sq->dma_addr + sq->head * n->sqe_size;
5207 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5208 trace_pci_nvme_err_addr_read(addr);
5209 trace_pci_nvme_err_cfs();
5210 n->bar.csts = NVME_CSTS_FAILED;
5211 break;
5212 }
5213 nvme_inc_sq_head(sq);
5214
5215 req = QTAILQ_FIRST(&sq->req_list);
5216 QTAILQ_REMOVE(&sq->req_list, req, entry);
5217 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5218 nvme_req_clear(req);
5219 req->cqe.cid = cmd.cid;
5220 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5221
5222 status = sq->sqid ? nvme_io_cmd(n, req) :
5223 nvme_admin_cmd(n, req);
5224 if (status != NVME_NO_COMPLETE) {
5225 req->status = status;
5226 nvme_enqueue_req_completion(cq, req);
5227 }
5228 }
5229}
5230
5231static void nvme_ctrl_reset(NvmeCtrl *n)
5232{
5233 NvmeNamespace *ns;
5234 int i;
5235
5236 for (i = 1; i <= n->num_namespaces; i++) {
5237 ns = nvme_ns(n, i);
5238 if (!ns) {
5239 continue;
5240 }
5241
5242 nvme_ns_drain(ns);
5243 }
5244
5245 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5246 if (n->sq[i] != NULL) {
5247 nvme_free_sq(n->sq[i], n);
5248 }
5249 }
5250 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5251 if (n->cq[i] != NULL) {
5252 nvme_free_cq(n->cq[i], n);
5253 }
5254 }
5255
5256 while (!QTAILQ_EMPTY(&n->aer_queue)) {
5257 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5258 QTAILQ_REMOVE(&n->aer_queue, event, entry);
5259 g_free(event);
5260 }
5261
5262 n->aer_queued = 0;
5263 n->outstanding_aers = 0;
5264 n->qs_created = false;
5265
5266 n->bar.cc = 0;
5267}
5268
5269static void nvme_ctrl_shutdown(NvmeCtrl *n)
5270{
5271 NvmeNamespace *ns;
5272 int i;
5273
5274 if (n->pmr.dev) {
5275 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5276 }
5277
5278 for (i = 1; i <= n->num_namespaces; i++) {
5279 ns = nvme_ns(n, i);
5280 if (!ns) {
5281 continue;
5282 }
5283
5284 nvme_ns_shutdown(ns);
5285 }
5286}
5287
5288static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns)
5289{
5290 ns->iocs = nvme_cse_iocs_none;
5291 switch (ns->csi) {
5292 case NVME_CSI_NVM:
5293 if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
5294 ns->iocs = nvme_cse_iocs_nvm;
5295 }
5296 break;
5297 case NVME_CSI_ZONED:
5298 if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
5299 ns->iocs = nvme_cse_iocs_zoned;
5300 } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
5301 ns->iocs = nvme_cse_iocs_nvm;
5302 }
5303 break;
5304 }
5305}
5306
5307static void nvme_select_ns_iocs(NvmeCtrl *n)
5308{
5309 NvmeNamespace *ns;
5310 int i;
5311
5312 for (i = 1; i <= n->num_namespaces; i++) {
5313 ns = nvme_ns(n, i);
5314 if (!ns) {
5315 continue;
5316 }
5317
5318 __nvme_select_ns_iocs(n, ns);
5319 }
5320}
5321
5322static int nvme_start_ctrl(NvmeCtrl *n)
5323{
5324 uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
5325 uint32_t page_size = 1 << page_bits;
5326
5327 if (unlikely(n->cq[0])) {
5328 trace_pci_nvme_err_startfail_cq();
5329 return -1;
5330 }
5331 if (unlikely(n->sq[0])) {
5332 trace_pci_nvme_err_startfail_sq();
5333 return -1;
5334 }
5335 if (unlikely(!n->bar.asq)) {
5336 trace_pci_nvme_err_startfail_nbarasq();
5337 return -1;
5338 }
5339 if (unlikely(!n->bar.acq)) {
5340 trace_pci_nvme_err_startfail_nbaracq();
5341 return -1;
5342 }
5343 if (unlikely(n->bar.asq & (page_size - 1))) {
5344 trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq);
5345 return -1;
5346 }
5347 if (unlikely(n->bar.acq & (page_size - 1))) {
5348 trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq);
5349 return -1;
5350 }
5351 if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) {
5352 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc));
5353 return -1;
5354 }
5355 if (unlikely(NVME_CC_MPS(n->bar.cc) <
5356 NVME_CAP_MPSMIN(n->bar.cap))) {
5357 trace_pci_nvme_err_startfail_page_too_small(
5358 NVME_CC_MPS(n->bar.cc),
5359 NVME_CAP_MPSMIN(n->bar.cap));
5360 return -1;
5361 }
5362 if (unlikely(NVME_CC_MPS(n->bar.cc) >
5363 NVME_CAP_MPSMAX(n->bar.cap))) {
5364 trace_pci_nvme_err_startfail_page_too_large(
5365 NVME_CC_MPS(n->bar.cc),
5366 NVME_CAP_MPSMAX(n->bar.cap));
5367 return -1;
5368 }
5369 if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
5370 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5371 trace_pci_nvme_err_startfail_cqent_too_small(
5372 NVME_CC_IOCQES(n->bar.cc),
5373 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes));
5374 return -1;
5375 }
5376 if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
5377 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5378 trace_pci_nvme_err_startfail_cqent_too_large(
5379 NVME_CC_IOCQES(n->bar.cc),
5380 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes));
5381 return -1;
5382 }
5383 if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
5384 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5385 trace_pci_nvme_err_startfail_sqent_too_small(
5386 NVME_CC_IOSQES(n->bar.cc),
5387 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes));
5388 return -1;
5389 }
5390 if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
5391 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5392 trace_pci_nvme_err_startfail_sqent_too_large(
5393 NVME_CC_IOSQES(n->bar.cc),
5394 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes));
5395 return -1;
5396 }
5397 if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
5398 trace_pci_nvme_err_startfail_asqent_sz_zero();
5399 return -1;
5400 }
5401 if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
5402 trace_pci_nvme_err_startfail_acqent_sz_zero();
5403 return -1;
5404 }
5405
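/* cache the page size and the (log2-encoded) queue entry sizes from CC */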
5406 n->page_bits = page_bits;
5407 n->page_size = page_size;
5408 n->max_prp_ents = n->page_size / sizeof(uint64_t);
5409 n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
5410 n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
5411 nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
5412 NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
5413 nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
5414 NVME_AQA_ASQS(n->bar.aqa) + 1);
5415
5416 nvme_set_timestamp(n, 0ULL);
5417
5418 QTAILQ_INIT(&n->aer_queue);
5419
5420 nvme_select_ns_iocs(n);
5421
5422 return 0;
5423}
5424
5425static void nvme_cmb_enable_regs(NvmeCtrl *n)
5426{
5427 NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
5428 NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
5429 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
5430
5431 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
5432 NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
5433 NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
5434 NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
5435 NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
5436 NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MiB size units */
5437 NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
5438}
5439
5440static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5441 unsigned size)
5442{
5443 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5444 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5445 "MMIO write not 32-bit aligned,"
5446 " offset=0x%"PRIx64"", offset);
5447 /* should be ignored, fall through for now */
5448 }
5449
5450 if (unlikely(size < sizeof(uint32_t))) {
5451 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5452 "MMIO write smaller than 32-bits,"
5453 " offset=0x%"PRIx64", size=%u",
5454 offset, size);
5455 /* should be ignored, fall through for now */
5456 }
5457
5458 switch (offset) {
5459 case 0xc:   /* INTMS */
5460 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5461 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5462 "undefined access to interrupt mask set"
5463 " when MSI-X is enabled");
5464 /* should be ignored, fall through for now */
5465 }
5466 n->bar.intms |= data & 0xffffffff;
5467 n->bar.intmc = n->bar.intms;
5468 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc);
5469 nvme_irq_check(n);
5470 break;
5471 case 0x10:  /* INTMC */
5472 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5473 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5474 "undefined access to interrupt mask clr"
5475 " when MSI-X is enabled");
5476 /* should be ignored, fall through for now */
5477 }
5478 n->bar.intms &= ~(data & 0xffffffff);
5479 n->bar.intmc = n->bar.intms;
5480 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc);
5481 nvme_irq_check(n);
5482 break;
5483 case 0x14:  /* CC */
5484 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5485
5486 if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
5487 !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
5488 {
5489 n->bar.cc = data;
5490 }
5491
5492 if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
5493 n->bar.cc = data;
5494 if (unlikely(nvme_start_ctrl(n))) {
5495 trace_pci_nvme_err_startfail();
5496 n->bar.csts = NVME_CSTS_FAILED;
5497 } else {
5498 trace_pci_nvme_mmio_start_success();
5499 n->bar.csts = NVME_CSTS_READY;
5500 }
5501 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
5502 trace_pci_nvme_mmio_stopped();
5503 nvme_ctrl_reset(n);
5504 n->bar.csts &= ~NVME_CSTS_READY;
5505 }
5506 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
5507 trace_pci_nvme_mmio_shutdown_set();
5508 nvme_ctrl_shutdown(n);
5509 n->bar.cc = data;
5510 n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
5511 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
5512 trace_pci_nvme_mmio_shutdown_cleared();
5513 n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
5514 n->bar.cc = data;
5515 }
5516 break;
5517 case 0x1c:  /* CSTS */
5518 if (data & (1 << 4)) {
5519 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5520 "attempted to W1C CSTS.NSSRO"
5521 " but CAP.NSSRS is zero (not supported)");
5522 } else if (data != 0) {
5523 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5524 "attempted to set a read only bit"
5525 " of controller status");
5526 }
5527 break;
5528 case 0x20:  /* NSSR */
5529 if (data == 0x4e564d65) {
5530 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5531 } else {
5532 /* The spec says that writes of other values have no effect */
5533 return;
5534 }
5535 break;
5536 case 0x24:
5537 n->bar.aqa = data & 0xffffffff;
5538 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5539 break;
5540 case 0x28:
5541 n->bar.asq = size == 8 ? data :
5542 (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff);
5543 trace_pci_nvme_mmio_asqaddr(data);
5544 break;
5545 case 0x2c:
5546 n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
5547 trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
5548 break;
5549 case 0x30:
5550 trace_pci_nvme_mmio_acqaddr(data);
5551 n->bar.acq = size == 8 ? data :
5552 (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff);
5553 break;
5554 case 0x34:
5555 n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
5556 trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
5557 break;
5558 case 0x38:
5559 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5560 "invalid write to reserved CMBLOC"
5561 " when CMBSZ is zero, ignored");
5562 return;
5563 case 0x3C:
5564 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5565 "invalid write to read only CMBSZ, ignored");
5566 return;
5567 case 0x50:  /* CMBMSC */
5568 if (!NVME_CAP_CMBS(n->bar.cap)) {
5569 return;
5570 }
5571
5572 n->bar.cmbmsc = size == 8 ? data :
5573 (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff);
5574 n->cmb.cmse = false;
5575
5576 if (NVME_CMBMSC_CRE(data)) {
5577 nvme_cmb_enable_regs(n);
5578
5579 if (NVME_CMBMSC_CMSE(data)) {
5580 hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT;
5581 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5582 NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1);
5583 return;
5584 }
5585
5586 n->cmb.cba = cba;
5587 n->cmb.cmse = true;
5588 }
5589 } else {
5590 n->bar.cmbsz = 0;
5591 n->bar.cmbloc = 0;
5592 }
5593
5594 return;
5595 case 0x54:  /* CMBMSC (upper 32 bits) */
5596 n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32);
5597 return;
5598
5599 case 0xe00:
5600 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5601 "invalid write to PMRCAP register, ignored");
5602 return;
5603 case 0xe04: /* PMRCTL */
5604 if (!NVME_CAP_PMRS(n->bar.cap)) {
5605 return;
5606 }
5607
5608 n->bar.pmrctl = data;
5609 if (NVME_PMRCTL_EN(data)) {
5610 memory_region_set_enabled(&n->pmr.dev->mr, true);
5611 n->bar.pmrsts = 0;
5612 } else {
5613 memory_region_set_enabled(&n->pmr.dev->mr, false);
5614 NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
5615 n->pmr.cmse = false;
5616 }
5617 return;
5618 case 0xe08:
5619 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5620 "invalid write to PMRSTS register, ignored");
5621 return;
5622 case 0xe0C:
5623 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5624 "invalid write to PMREBS register, ignored");
5625 return;
5626 case 0xe10:
5627 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5628 "invalid write to PMRSWTP register, ignored");
5629 return;
5630 case 0xe14: /* PMRMSC (lower 32 bits) */
5631 if (!NVME_CAP_PMRS(n->bar.cap)) {
5632 return;
5633 }
5634
5635 n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff);
5636 n->pmr.cmse = false;
5637
5638 if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) {
5639 hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT;
5640 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5641 NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1);
5642 return;
5643 }
5644
5645 n->pmr.cmse = true;
5646 n->pmr.cba = cba;
5647 }
5648
5649 return;
5650 case 0xe18: /* PMRMSC (upper 32 bits) */
5651 if (!NVME_CAP_PMRS(n->bar.cap)) {
5652 return;
5653 }
5654
5655 n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32);
5656 return;
5657 default:
5658 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5659 "invalid MMIO write,"
5660 " offset=0x%"PRIx64", data=%"PRIx64"",
5661 offset, data);
5662 break;
5663 }
5664}
5665
5666static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5667{
5668 NvmeCtrl *n = (NvmeCtrl *)opaque;
5669 uint8_t *ptr = (uint8_t *)&n->bar;
5670 uint64_t val = 0;
5671
5672 trace_pci_nvme_mmio_read(addr, size);
5673
5674 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5675 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5676 "MMIO read not 32-bit aligned,"
5677 " offset=0x%"PRIx64"", addr);
5678 /* should RAZ, fall through for now */
5679 } else if (unlikely(size < sizeof(uint32_t))) {
5680 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
5681 "MMIO read smaller than 32-bits,"
5682 " offset=0x%"PRIx64"", addr);
5683 /* should RAZ, fall through for now */
5684 }
5685
5686 if (addr < sizeof(n->bar)) {
5687 /*
5688  * When PMRWBM bit 1 is set, a read of PMRSTS should ensure that
5689  * prior writes to the persistent memory region have reached
5690  * persistent media.
5691  */
5692 if (addr == 0xe08 &&
5693 (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
5694 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5695 }
5696 memcpy(&val, ptr + addr, size);
5697 } else {
5698 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
5699 "MMIO read beyond last register,"
5700 " offset=0x%"PRIx64", returning 0", addr);
5701 }
5702
5703 return val;
5704}
5705
5706static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
5707{
5708 uint32_t qid;
5709
5710 if (unlikely(addr & ((1 << 2) - 1))) {
5711 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
5712 "doorbell write not 32-bit aligned,"
5713 " offset=0x%"PRIx64", ignoring", addr);
5714 return;
5715 }
5716
5717 if (((addr - 0x1000) >> 2) & 1) {
5718 /* Completion queue doorbell write */
5719
5720 uint16_t new_head = val & 0xffff;
5721 int start_sqs;
5722 NvmeCQueue *cq;
5723
5724 qid = (addr - (0x1000 + (1 << 2))) >> 3;
5725 if (unlikely(nvme_check_cqid(n, qid))) {
5726 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
5727 "completion queue doorbell write"
5728 " for nonexistent queue,"
5729 " cqid=%"PRIu32", ignoring", qid);
5730
5731 /*
5732  * NVM Express v1.3d, Section 4.1 states: "If host software writes an
5733  * invalid value to the Submission Queue Tail Doorbell or Completion Queue
5734  * Head Doorbell register and an Asynchronous Event Request command is
5735  * outstanding, then an asynchronous event is posted to the Admin
5736  * Completion Queue with a status code of Invalid Doorbell Write Value."
5737  *
5738  * Also note that the spec includes the "Invalid Doorbell Register" status
5739  * code, but nowhere does it specify how one should be handled.
5740  *
5741  * The posting of the event is unconditional even if the SQ/CQ doesn't
5742  * exist.
5743  */
5744 if (n->outstanding_aers) {
5745 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5746 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
5747 NVME_LOG_ERROR_INFO);
5748 }
5749
5750 return;
5751 }
5752
5753 cq = n->cq[qid];
5754 if (unlikely(new_head >= cq->size)) {
5755 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
5756 "completion queue doorbell write value"
5757 " beyond queue size, cqid=%"PRIu32","
5758 " new_head=%"PRIu16", ignoring",
5759 qid, new_head);
5760
5761 if (n->outstanding_aers) {
5762 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5763 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
5764 NVME_LOG_ERROR_INFO);
5765 }
5766
5767 return;
5768 }
5769
5770 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
5771
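/*
 * If the CQ was full before this head update, the submission queues
 * feeding it were stalled; schedule them (and the CQ) to run again.
 */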
5772 start_sqs = nvme_cq_full(cq) ? 1 : 0;
5773 cq->head = new_head;
5774 if (start_sqs) {
5775 NvmeSQueue *sq;
5776 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
5777 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5778 }
5779 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5780 }
5781
5782 if (cq->tail == cq->head) {
5783 if (cq->irq_enabled) {
5784 n->cq_pending--;
5785 }
5786
5787 nvme_irq_deassert(n, cq);
5788 }
5789 } else {
5790 /* Submission queue doorbell write */
5791
5792 uint16_t new_tail = val & 0xffff;
5793 NvmeSQueue *sq;
5794
5795 qid = (addr - 0x1000) >> 3;
5796 if (unlikely(nvme_check_sqid(n, qid))) {
5797 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
5798 "submission queue doorbell write"
5799 " for nonexistent queue,"
5800 " sqid=%"PRIu32", ignoring", qid);
5801
5802 if (n->outstanding_aers) {
5803 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5804 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
5805 NVME_LOG_ERROR_INFO);
5806 }
5807
5808 return;
5809 }
5810
5811 sq = n->sq[qid];
5812 if (unlikely(new_tail >= sq->size)) {
5813 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
5814 "submission queue doorbell write value"
5815 " beyond queue size, sqid=%"PRIu32","
5816 " new_tail=%"PRIu16", ignoring",
5817 qid, new_tail);
5818
5819 if (n->outstanding_aers) {
5820 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
5821 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
5822 NVME_LOG_ERROR_INFO);
5823 }
5824
5825 return;
5826 }
5827
5828 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
5829
5830 sq->tail = new_tail;
5831 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
5832 }
5833}
5834
5835static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
5836 unsigned size)
5837{
5838 NvmeCtrl *n = (NvmeCtrl *)opaque;
5839
5840 trace_pci_nvme_mmio_write(addr, data, size);
5841
5842 if (addr < sizeof(n->bar)) {
5843 nvme_write_bar(n, addr, data, size);
5844 } else {
5845 nvme_process_db(n, addr, data);
5846 }
5847}
5848
5849static const MemoryRegionOps nvme_mmio_ops = {
5850 .read = nvme_mmio_read,
5851 .write = nvme_mmio_write,
5852 .endianness = DEVICE_LITTLE_ENDIAN,
5853 .impl = {
5854 .min_access_size = 2,
5855 .max_access_size = 8,
5856 },
5857};
5858
5859static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
5860 unsigned size)
5861{
5862 NvmeCtrl *n = (NvmeCtrl *)opaque;
5863 stn_le_p(&n->cmb.buf[addr], size, data);
5864}
5865
5866static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
5867{
5868 NvmeCtrl *n = (NvmeCtrl *)opaque;
5869 return ldn_le_p(&n->cmb.buf[addr], size);
5870}
5871
5872static const MemoryRegionOps nvme_cmb_ops = {
5873 .read = nvme_cmb_read,
5874 .write = nvme_cmb_write,
5875 .endianness = DEVICE_LITTLE_ENDIAN,
5876 .impl = {
5877 .min_access_size = 1,
5878 .max_access_size = 8,
5879 },
5880};
5881
5882static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
5883{
5884 NvmeParams *params = &n->params;
5885
5886 if (params->num_queues) {
5887 warn_report("num_queues is deprecated; please use max_ioqpairs "
5888 "instead");
5889
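/* the deprecated num_queues counted the admin queue pair; max_ioqpairs does not */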
5890 params->max_ioqpairs = params->num_queues - 1;
5891 }
5892
5893 if (n->namespace.blkconf.blk && n->subsys) {
5894 error_setg(errp, "subsystem support is unavailable with legacy "
5895 "namespace ('drive' property)");
5896 return;
5897 }
5898
5899 if (params->max_ioqpairs < 1 ||
5900 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
5901 error_setg(errp, "max_ioqpairs must be between 1 and %d",
5902 NVME_MAX_IOQPAIRS);
5903 return;
5904 }
5905
5906 if (params->msix_qsize < 1 ||
5907 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
5908 error_setg(errp, "msix_qsize must be between 1 and %d",
5909 PCI_MSIX_FLAGS_QSIZE + 1);
5910 return;
5911 }
5912
5913 if (!params->serial) {
5914 error_setg(errp, "serial property not set");
5915 return;
5916 }
5917
5918 if (n->pmr.dev) {
5919 if (host_memory_backend_is_mapped(n->pmr.dev)) {
5920 error_setg(errp, "can't use already busy memdev: %s",
5921 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
5922 return;
5923 }
5924
5925 if (!is_power_of_2(n->pmr.dev->size)) {
5926 error_setg(errp, "pmr backend size needs to be a power of 2");
5927 return;
5928 }
5929
5930 host_memory_backend_set_mapped(n->pmr.dev, true);
5931 }
5932
5933 if (n->params.zasl > n->params.mdts) {
5934 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
5935 "than or equal to mdts (Maximum Data Transfer Size)");
5936 return;
5937 }
5938
5939 if (!n->params.vsl) {
5940 error_setg(errp, "vsl must be non-zero");
5941 return;
5942 }
5943}
5944
5945static void nvme_init_state(NvmeCtrl *n)
5946{
5947 n->num_namespaces = NVME_MAX_NAMESPACES;
5948
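/*
 * The register region covers the fixed bar registers plus one submission
 * and one completion doorbell per queue pair, admin queue included.
 */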
5949 n->reg_size = pow2ceil(sizeof(NvmeBar) +
5950 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
5951 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
5952 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
5953 n->temperature = NVME_TEMPERATURE;
5954 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
5955 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5956 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
5957}
5958
5959static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
5960{
5961 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
5962
5963 n->cmb.buf = g_malloc0(cmb_size);
5964 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
5965 "nvme-cmb", cmb_size);
5966 pci_register_bar(pci_dev, NVME_CMB_BIR,
5967 PCI_BASE_ADDRESS_SPACE_MEMORY |
5968 PCI_BASE_ADDRESS_MEM_TYPE_64 |
5969 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
5970
5971 NVME_CAP_SET_CMBS(n->bar.cap, 1);
5972
5973 if (n->params.legacy_cmb) {
5974 nvme_cmb_enable_regs(n);
5975 n->cmb.cmse = true;
5976 }
5977}
5978
5979static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
5980{
5981 NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1);
5982 NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1);
5983 NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
5984
5985 NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
5986 NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1);
5987
5988 pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
5989 PCI_BASE_ADDRESS_SPACE_MEMORY |
5990 PCI_BASE_ADDRESS_MEM_TYPE_64 |
5991 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
5992
5993 memory_region_set_enabled(&n->pmr.dev->mr, false);
5994}
5995
5996static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
5997{
5998 uint8_t *pci_conf = pci_dev->config;
5999 uint64_t bar_size, msix_table_size, msix_pba_size;
6000 unsigned msix_table_offset, msix_pba_offset;
6001 int ret;
6002
6003 Error *err = NULL;
6004
6005 pci_conf[PCI_INTERRUPT_PIN] = 1;
6006 pci_config_set_prog_interface(pci_conf, 0x2);
6007
6008 if (n->params.use_intel_id) {
6009 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6010 pci_config_set_device_id(pci_conf, 0x5845);
6011 } else {
6012 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6013 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6014 }
6015
6016 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6017 pcie_endpoint_cap_init(pci_dev, 0x80);
6018
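/*
 * BAR0 layout: the NVMe registers, then the MSI-X table, then the MSI-X
 * PBA, each aligned to 4 KiB; the total size is rounded to a power of two.
 */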
6019 bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6020 msix_table_offset = bar_size;
6021 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6022
6023 bar_size += msix_table_size;
6024 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6025 msix_pba_offset = bar_size;
6026 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6027
6028 bar_size += msix_pba_size;
6029 bar_size = pow2ceil(bar_size);
6030
6031 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6032 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6033 n->reg_size);
6034 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6035
6036 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6037 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6038 ret = msix_init(pci_dev, n->params.msix_qsize,
6039 &n->bar0, 0, msix_table_offset,
6040 &n->bar0, 0, msix_pba_offset, 0, &err);
6041 if (ret < 0) {
6042 if (ret == -ENOTSUP) {
6043 warn_report_err(err);
6044 } else {
6045 error_propagate(errp, err);
6046 return ret;
6047 }
6048 }
6049
6050 if (n->params.cmb_size_mb) {
6051 nvme_init_cmb(n, pci_dev);
6052 }
6053
6054 if (n->pmr.dev) {
6055 nvme_init_pmr(n, pci_dev);
6056 }
6057
6058 return 0;
6059}
6060
6061static void nvme_init_subnqn(NvmeCtrl *n)
6062{
6063 NvmeSubsystem *subsys = n->subsys;
6064 NvmeIdCtrl *id = &n->id_ctrl;
6065
6066 if (!subsys) {
6067 snprintf((char *)id->subnqn, sizeof(id->subnqn),
6068 "nqn.2019-08.org.qemu:%s", n->params.serial);
6069 } else {
6070 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6071 }
6072}
6073
6074static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6075{
6076 NvmeIdCtrl *id = &n->id_ctrl;
6077 uint8_t *pci_conf = pci_dev->config;
6078
6079 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6080 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6081 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6082 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6083 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6084
6085 id->cntlid = cpu_to_le16(n->cntlid);
6086
6087 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6088
6089 id->rab = 6;
6090
6091 if (n->params.use_intel_id) {
6092 id->ieee[0] = 0xb3;
6093 id->ieee[1] = 0x02;
6094 id->ieee[2] = 0x00;
6095 } else {
6096 id->ieee[0] = 0x00;
6097 id->ieee[1] = 0x54;
6098 id->ieee[2] = 0x52;
6099 }
6100
6101 id->mdts = n->params.mdts;
6102 id->ver = cpu_to_le32(NVME_SPEC_VER);
6103 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6104 id->cntrltype = 0x1;
6105
6106 /*
6107  * Because the controller always completes the Abort command immediately,
6108  * there can never be more than one concurrently executing Abort command,
6109  * so this value is never used for anything. Note that there can easily be
6110  * many Abort commands in the queues, but they are not considered
6111  * "executing" until processed by nvme_abort.
6112  *
6113  * The specification recommends a value of 3 for Abort Command Limit (four
6114  * concurrently outstanding Abort commands), so let's use that, albeit
6115  * unconditionally.
6116  */
6117 id->acl = 3;
6118 id->aerl = n->params.aerl;
6119 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6120 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6121
6122 /* recommended default value (~70 C) */
6123 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6124 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6125
6126 id->sqes = (0x6 << 4) | 0x6;
6127 id->cqes = (0x4 << 4) | 0x4;
6128 id->nn = cpu_to_le32(n->num_namespaces);
6129 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6130 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6131 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6132
6133 /*
6134  * NOTE: If this device ever supports a command set that does NOT use 0x0
6135  * as a Flush-equivalent operation, support for the broadcast NSID in Flush
6136  * should probably be removed.
6137  *
6138  * See comment in nvme_io_cmd.
6139  */
6140 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6141
6142 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6143 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6144 NVME_CTRL_SGLS_BITBUCKET);
6145
6146 nvme_init_subnqn(n);
6147
6148 id->psd[0].mp = cpu_to_le16(0x9c4);
6149 id->psd[0].enlat = cpu_to_le32(0x10);
6150 id->psd[0].exlat = cpu_to_le32(0x4);
6151
6152 if (n->subsys) {
6153 id->cmic |= NVME_CMIC_MULTI_CTRL;
6154 }
6155
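/* MQES is zero-based, so 0x7ff advertises 2048-entry queues */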
6156 NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
6157 NVME_CAP_SET_CQR(n->bar.cap, 1);
6158 NVME_CAP_SET_TO(n->bar.cap, 0xf);
6159 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM);
6160 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
6161 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
6162 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
6163 NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
6164 NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0);
6165
6166 n->bar.vs = NVME_SPEC_VER;
6167 n->bar.intmc = n->bar.intms = 0;
6168}
6169
6170static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6171{
6172 int cntlid;
6173
6174 if (!n->subsys) {
6175 return 0;
6176 }
6177
6178 cntlid = nvme_subsys_register_ctrl(n, errp);
6179 if (cntlid < 0) {
6180 return -1;
6181 }
6182
6183 n->cntlid = cntlid;
6184
6185 return 0;
6186}
6187
6188void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6189{
6190 uint32_t nsid = ns->params.nsid;
6191 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6192
6193 n->namespaces[nsid - 1] = ns;
6194 ns->attached++;
6195
6196 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6197 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6198}
6199
6200static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6201{
6202 NvmeCtrl *n = NVME(pci_dev);
6203 NvmeNamespace *ns;
6204 Error *local_err = NULL;
6205
6206 nvme_check_constraints(n, &local_err);
6207 if (local_err) {
6208 error_propagate(errp, local_err);
6209 return;
6210 }
6211
6212 qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6213 &pci_dev->qdev, n->parent_obj.qdev.id);
6214
6215 nvme_init_state(n);
6216 if (nvme_init_pci(n, pci_dev, errp)) {
6217 return;
6218 }
6219
6220 if (nvme_init_subsys(n, errp)) {
6221 /* errp has already been set by nvme_init_subsys(); nothing to propagate */
6222 return;
6223 }
6224 nvme_init_ctrl(n, pci_dev);
6225
6226 /* setup a namespace if the controller drive property was given */
6227 if (n->namespace.blkconf.blk) {
6228 ns = &n->namespace;
6229 ns->params.nsid = 1;
6230
6231 if (nvme_ns_setup(n, ns, errp)) {
6232 return;
6233 }
6234
6235 nvme_attach_ns(n, ns);
6236 }
6237}
6238
6239static void nvme_exit(PCIDevice *pci_dev)
6240{
6241 NvmeCtrl *n = NVME(pci_dev);
6242 NvmeNamespace *ns;
6243 int i;
6244
6245 nvme_ctrl_reset(n);
6246
6247 for (i = 1; i <= n->num_namespaces; i++) {
6248 ns = nvme_ns(n, i);
6249 if (!ns) {
6250 continue;
6251 }
6252
6253 nvme_ns_cleanup(ns);
6254 }
6255
6256 g_free(n->cq);
6257 g_free(n->sq);
6258 g_free(n->aer_reqs);
6259
6260 if (n->params.cmb_size_mb) {
6261 g_free(n->cmb.buf);
6262 }
6263
6264 if (n->pmr.dev) {
6265 host_memory_backend_set_mapped(n->pmr.dev, false);
6266 }
6267 msix_uninit(pci_dev, &n->bar0, &n->bar0);
6268 memory_region_del_subregion(&n->bar0, &n->iomem);
6269}
6270
6271static Property nvme_props[] = {
6272 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6273 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6274 HostMemoryBackend *),
6275 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6276 NvmeSubsystem *),
6277 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6278 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6279 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6280 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6281 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6282 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6283 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6284 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6285 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6286 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6287 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6288 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6289 DEFINE_PROP_END_OF_LIST(),
6290};
6291
6292static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6293 void *opaque, Error **errp)
6294{
6295 NvmeCtrl *n = NVME(obj);
6296 uint8_t value = n->smart_critical_warning;
6297
6298 visit_type_uint8(v, name, &value, errp);
6299}
6300
6301static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6302 void *opaque, Error **errp)
6303{
6304 NvmeCtrl *n = NVME(obj);
6305 uint8_t value, old_value, cap = 0, index, event;
6306
6307 if (!visit_type_uint8(v, name, &value, errp)) {
6308 return;
6309 }
6310
6311 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6312 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6313 if (NVME_CAP_PMRS(n->bar.cap)) {
6314 cap |= NVME_SMART_PMR_UNRELIABLE;
6315 }
6316
6317 if ((value & cap) != value) {
6318 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6319 value & ~cap);
6320 return;
6321 }
6322
6323 old_value = n->smart_critical_warning;
6324 n->smart_critical_warning = value;
6325
6326 /* only inject new bits of smart critical warning */
6327 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6328 event = 1 << index;
6329 if (value & ~old_value & event) {
6330 nvme_smart_event(n, event);
}
6331 }
6332}
6333
6334static const VMStateDescription nvme_vmstate = {
6335 .name = "nvme",
6336 .unmigratable = 1,
6337};
6338
6339static void nvme_class_init(ObjectClass *oc, void *data)
6340{
6341 DeviceClass *dc = DEVICE_CLASS(oc);
6342 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6343
6344 pc->realize = nvme_realize;
6345 pc->exit = nvme_exit;
6346 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6347 pc->revision = 2;
6348
6349 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6350 dc->desc = "Non-Volatile Memory Express";
6351 device_class_set_props(dc, nvme_props);
6352 dc->vmsd = &nvme_vmstate;
6353}
6354
6355static void nvme_instance_init(Object *obj)
6356{
6357 NvmeCtrl *n = NVME(obj);
6358
6359 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6360 "bootindex", "/namespace@1,0",
6361 DEVICE(obj));
6362
6363 object_property_add(obj, "smart_critical_warning", "uint8",
6364 nvme_get_smart_warning,
6365 nvme_set_smart_warning, NULL, NULL);
6366}
6367
6368static const TypeInfo nvme_info = {
6369 .name = TYPE_NVME,
6370 .parent = TYPE_PCI_DEVICE,
6371 .instance_size = sizeof(NvmeCtrl),
6372 .instance_init = nvme_instance_init,
6373 .class_init = nvme_class_init,
6374 .interfaces = (InterfaceInfo[]) {
6375 { INTERFACE_PCIE_DEVICE },
6376 { }
6377 },
6378};
6379
6380static const TypeInfo nvme_bus_info = {
6381 .name = TYPE_NVME_BUS,
6382 .parent = TYPE_BUS,
6383 .instance_size = sizeof(NvmeBus),
6384};
6385
6386static void nvme_register_types(void)
6387{
6388 type_register_static(&nvme_info);
6389 type_register_static(&nvme_bus_info);
6390}
6391
6392type_init(nvme_register_types)
6393