/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright 2008 Qumranet, Inc.
 *  Copyright 2008 Red Hat, Inc.
 */
21#include "qemu/osdep.h"
22#include <sys/ioctl.h>
23#ifdef CONFIG_KVM
24#include <linux/kvm.h>
25#endif
26#include <linux/vfio.h>
27
28#include "hw/vfio/vfio-common.h"
29#include "hw/vfio/vfio.h"
30#include "exec/address-spaces.h"
31#include "exec/memory.h"
32#include "exec/ram_addr.h"
33#include "hw/hw.h"
34#include "qemu/error-report.h"
35#include "qemu/main-loop.h"
36#include "qemu/range.h"
37#include "sysemu/kvm.h"
38#include "sysemu/reset.h"
39#include "sysemu/runstate.h"
40#include "trace.h"
41#include "qapi/error.h"
42#include "migration/migration.h"
43#include "sysemu/tpm.h"
44
45VFIOGroupList vfio_group_list =
46 QLIST_HEAD_INITIALIZER(vfio_group_list);
47static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
48 QLIST_HEAD_INITIALIZER(vfio_address_spaces);
49
50#ifdef CONFIG_KVM
/*
 * QEMU creates a single KVM-VFIO pseudo device per VM the first time a group
 * is added (see vfio_kvm_device_add_group()); it persists for the lifetime
 * of the VM.
 */
58static int vfio_kvm_device_fd = -1;
59#endif
60
/*
 * Common VFIO interrupt disable
 */
64void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
65{
66 struct vfio_irq_set irq_set = {
67 .argsz = sizeof(irq_set),
68 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
69 .index = index,
70 .start = 0,
71 .count = 0,
72 };
73
74 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
75}
76
77void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
78{
79 struct vfio_irq_set irq_set = {
80 .argsz = sizeof(irq_set),
81 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
82 .index = index,
83 .start = 0,
84 .count = 1,
85 };
86
87 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
88}
89
90void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
91{
92 struct vfio_irq_set irq_set = {
93 .argsz = sizeof(irq_set),
94 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
95 .index = index,
96 .start = 0,
97 .count = 1,
98 };
99
100 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
101}
102
103static inline const char *action_to_str(int action)
104{
105 switch (action) {
106 case VFIO_IRQ_SET_ACTION_MASK:
107 return "MASK";
108 case VFIO_IRQ_SET_ACTION_UNMASK:
109 return "UNMASK";
110 case VFIO_IRQ_SET_ACTION_TRIGGER:
111 return "TRIGGER";
112 default:
113 return "UNKNOWN ACTION";
114 }
115}
116
117static const char *index_to_str(VFIODevice *vbasedev, int index)
118{
119 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
120 return NULL;
121 }
122
123 switch (index) {
124 case VFIO_PCI_INTX_IRQ_INDEX:
125 return "INTX";
126 case VFIO_PCI_MSI_IRQ_INDEX:
127 return "MSI";
128 case VFIO_PCI_MSIX_IRQ_INDEX:
129 return "MSIX";
130 case VFIO_PCI_ERR_IRQ_INDEX:
131 return "ERR";
132 case VFIO_PCI_REQ_IRQ_INDEX:
133 return "REQ";
134 default:
135 return NULL;
136 }
137}
138
139static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
140{
141 switch (container->iommu_type) {
142 case VFIO_TYPE1v2_IOMMU:
143 case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
147 return ram_block_uncoordinated_discard_disable(state);
148 default:
        /*
         * Other IOMMU backends (e.g., the sPAPR TCE ones) have not been
         * audited for coordinated discarding of RAM, so conservatively
         * disable all discarding of RAM (coordinated and uncoordinated)
         * while they are in use.
         */
158 return ram_block_discard_disable(state);
159 }
160}
161
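/*
 * Set up or, with fd == -1, tear down eventfd signaling for one interrupt
 * (index/subindex) via VFIO_DEVICE_SET_IRQS, building a descriptive error
 * message on failure.
 */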
162int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
163 int action, int fd, Error **errp)
164{
165 struct vfio_irq_set *irq_set;
166 int argsz, ret = 0;
167 const char *name;
168 int32_t *pfd;
169
170 argsz = sizeof(*irq_set) + sizeof(*pfd);
171
172 irq_set = g_malloc0(argsz);
173 irq_set->argsz = argsz;
174 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
175 irq_set->index = index;
176 irq_set->start = subindex;
177 irq_set->count = 1;
178 pfd = (int32_t *)&irq_set->data;
179 *pfd = fd;
180
181 if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
182 ret = -errno;
183 }
184 g_free(irq_set);
185
186 if (!ret) {
187 return 0;
188 }
189
190 error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
191
192 name = index_to_str(vbasedev, index);
193 if (name) {
194 error_prepend(errp, "%s-%d: ", name, subindex);
195 } else {
196 error_prepend(errp, "index %d-%d: ", index, subindex);
197 }
198 error_prepend(errp,
199 "Failed to %s %s eventfd signaling for interrupt ",
200 fd < 0 ? "tear down" : "set up", action_to_str(action));
201 return ret;
202}
203
/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
207void vfio_region_write(void *opaque, hwaddr addr,
208 uint64_t data, unsigned size)
209{
210 VFIORegion *region = opaque;
211 VFIODevice *vbasedev = region->vbasedev;
212 union {
213 uint8_t byte;
214 uint16_t word;
215 uint32_t dword;
216 uint64_t qword;
217 } buf;
218
219 switch (size) {
220 case 1:
221 buf.byte = data;
222 break;
223 case 2:
224 buf.word = cpu_to_le16(data);
225 break;
226 case 4:
227 buf.dword = cpu_to_le32(data);
228 break;
229 case 8:
230 buf.qword = cpu_to_le64(data);
231 break;
232 default:
233 hw_error("vfio: unsupported write size, %u bytes", size);
234 break;
235 }
236
237 if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
238 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
239 ",%d) failed: %m",
240 __func__, vbasedev->name, region->nr,
241 addr, data, size);
242 }
243
244 trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
245
    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * slowing down the interrupt handler.
     */
254 vbasedev->ops->vfio_eoi(vbasedev);
255}
256
257uint64_t vfio_region_read(void *opaque,
258 hwaddr addr, unsigned size)
259{
260 VFIORegion *region = opaque;
261 VFIODevice *vbasedev = region->vbasedev;
262 union {
263 uint8_t byte;
264 uint16_t word;
265 uint32_t dword;
266 uint64_t qword;
267 } buf;
268 uint64_t data = 0;
269
270 if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
271 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
272 __func__, vbasedev->name, region->nr,
273 addr, size);
274 return (uint64_t)-1;
275 }
276 switch (size) {
277 case 1:
278 data = buf.byte;
279 break;
280 case 2:
281 data = le16_to_cpu(buf.word);
282 break;
283 case 4:
284 data = le32_to_cpu(buf.dword);
285 break;
286 case 8:
287 data = le64_to_cpu(buf.qword);
288 break;
289 default:
290 hw_error("vfio: unsupported read size, %u bytes", size);
291 break;
292 }
293
294 trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
295
    /* Same INTx EOI consideration as in vfio_region_write() above */
297 vbasedev->ops->vfio_eoi(vbasedev);
298
299 return data;
300}
301
302const MemoryRegionOps vfio_region_ops = {
303 .read = vfio_region_read,
304 .write = vfio_region_write,
305 .endianness = DEVICE_LITTLE_ENDIAN,
306 .valid = {
307 .min_access_size = 1,
308 .max_access_size = 8,
309 },
310 .impl = {
311 .min_access_size = 1,
312 .max_access_size = 8,
313 },
314};
315
/*
 * Device state interfaces
 */
320bool vfio_mig_active(void)
321{
322 VFIOGroup *group;
323 VFIODevice *vbasedev;
324
325 if (QLIST_EMPTY(&vfio_group_list)) {
326 return false;
327 }
328
329 QLIST_FOREACH(group, &vfio_group_list, next) {
330 QLIST_FOREACH(vbasedev, &group->device_list, next) {
331 if (vbasedev->migration_blocker) {
332 return false;
333 }
334 }
335 }
336 return true;
337}
338
339static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
340{
341 VFIOGroup *group;
342 VFIODevice *vbasedev;
343 MigrationState *ms = migrate_get_current();
344
345 if (!migration_is_setup_or_active(ms->state)) {
346 return false;
347 }
348
349 QLIST_FOREACH(group, &container->group_list, container_next) {
350 QLIST_FOREACH(vbasedev, &group->device_list, next) {
351 VFIOMigration *migration = vbasedev->migration;
352
353 if (!migration) {
354 return false;
355 }
356
357 if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
358 && (migration->device_state & VFIO_DEVICE_STATE_V1_RUNNING)) {
359 return false;
360 }
361 }
362 }
363 return true;
364}
365
366static bool vfio_devices_all_running_and_saving(VFIOContainer *container)
367{
368 VFIOGroup *group;
369 VFIODevice *vbasedev;
370 MigrationState *ms = migrate_get_current();
371
372 if (!migration_is_setup_or_active(ms->state)) {
373 return false;
374 }
375
376 QLIST_FOREACH(group, &container->group_list, container_next) {
377 QLIST_FOREACH(vbasedev, &group->device_list, next) {
378 VFIOMigration *migration = vbasedev->migration;
379
380 if (!migration) {
381 return false;
382 }
383
384 if ((migration->device_state & VFIO_DEVICE_STATE_V1_SAVING) &&
385 (migration->device_state & VFIO_DEVICE_STATE_V1_RUNNING)) {
386 continue;
387 } else {
388 return false;
389 }
390 }
391 }
392 return true;
393}
394
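/*
 * Unmap a DMA range and, in the same ioctl, fetch the dirty bitmap covering
 * it, so no writes are lost between the unmap and the bitmap query.
 */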
395static int vfio_dma_unmap_bitmap(VFIOContainer *container,
396 hwaddr iova, ram_addr_t size,
397 IOMMUTLBEntry *iotlb)
398{
399 struct vfio_iommu_type1_dma_unmap *unmap;
400 struct vfio_bitmap *bitmap;
401 uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
402 int ret;
403
404 unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
405
406 unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
407 unmap->iova = iova;
408 unmap->size = size;
409 unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
410 bitmap = (struct vfio_bitmap *)&unmap->data;
411
    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
     * to qemu_real_host_page_size.
     */
418 bitmap->pgsize = qemu_real_host_page_size();
419 bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
420 BITS_PER_BYTE;
421
422 if (bitmap->size > container->max_dirty_bitmap_size) {
423 error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
424 (uint64_t)bitmap->size);
425 ret = -E2BIG;
426 goto unmap_exit;
427 }
428
429 bitmap->data = g_try_malloc0(bitmap->size);
430 if (!bitmap->data) {
431 ret = -ENOMEM;
432 goto unmap_exit;
433 }
434
435 ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
436 if (!ret) {
437 cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
438 iotlb->translated_addr, pages);
439 } else {
440 error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
441 }
442
443 g_free(bitmap->data);
444unmap_exit:
445 g_free(unmap);
446 return ret;
447}
448
/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
452static int vfio_dma_unmap(VFIOContainer *container,
453 hwaddr iova, ram_addr_t size,
454 IOMMUTLBEntry *iotlb)
455{
456 struct vfio_iommu_type1_dma_unmap unmap = {
457 .argsz = sizeof(unmap),
458 .flags = 0,
459 .iova = iova,
460 .size = size,
461 };
462
463 if (iotlb && container->dirty_pages_supported &&
464 vfio_devices_all_running_and_saving(container)) {
465 return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
466 }
467
468 while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * The type1 backend has an off-by-one bug in older kernels where an
         * overflow in its wrap-around check prevents unmapping the last page
         * of the address space.  Test for that error condition and retry the
         * unmap excluding the last page.  The expectation is that we've never
         * mapped the last page anyway; this unmap request comes via vIOMMU
         * support, which cannot map the last page.
         */
481 if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
482 container->iommu_type == VFIO_TYPE1v2_IOMMU) {
483 trace_vfio_dma_unmap_overflow_workaround();
484 unmap.size -= 1ULL << ctz64(container->pgsizes);
485 continue;
486 }
487 error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
488 return -errno;
489 }
490
491 return 0;
492}
493
494static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
495 ram_addr_t size, void *vaddr, bool readonly)
496{
497 struct vfio_iommu_type1_dma_map map = {
498 .argsz = sizeof(map),
499 .flags = VFIO_DMA_MAP_FLAG_READ,
500 .vaddr = (__u64)(uintptr_t)vaddr,
501 .iova = iova,
502 .size = size,
503 };
504
505 if (!readonly) {
506 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
507 }
508
    /*
     * Try the mapping; if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
514 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
515 (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
516 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
517 return 0;
518 }
519
520 error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
521 return -errno;
522}
523
524static void vfio_host_win_add(VFIOContainer *container,
525 hwaddr min_iova, hwaddr max_iova,
526 uint64_t iova_pgsizes)
527{
528 VFIOHostDMAWindow *hostwin;
529
530 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
531 if (ranges_overlap(hostwin->min_iova,
532 hostwin->max_iova - hostwin->min_iova + 1,
533 min_iova,
534 max_iova - min_iova + 1)) {
535 hw_error("%s: Overlapped IOMMU are not enabled", __func__);
536 }
537 }
538
539 hostwin = g_malloc0(sizeof(*hostwin));
540
541 hostwin->min_iova = min_iova;
542 hostwin->max_iova = max_iova;
543 hostwin->iova_pgsizes = iova_pgsizes;
544 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
545}
546
547static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
548 hwaddr max_iova)
549{
550 VFIOHostDMAWindow *hostwin;
551
552 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
553 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
554 QLIST_REMOVE(hostwin, hostwin_next);
555 g_free(hostwin);
556 return 0;
557 }
558 }
559
560 return -1;
561}
562
563static bool vfio_listener_skipped_section(MemoryRegionSection *section)
564{
565 return (!memory_region_is_ram(section->mr) &&
566 !memory_region_is_iommu(section->mr)) ||
567 memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space.  These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
            */
574 section->offset_within_address_space & (1ULL << 63);
575}
576
577
578static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
579 ram_addr_t *ram_addr, bool *read_only)
580{
581 bool ret, mr_has_discard_manager;
582
583 ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
584 &mr_has_discard_manager);
585 if (ret && mr_has_discard_manager) {
        /*
         * Malicious VMs could map discarded ranges through a vIOMMU, making
         * vfio pin more memory than the admitted RAM size suggests; the pages
         * stay pinned until unmapped from the IOMMU (e.g., on device reset).
         * RLIMIT_MEMLOCK still bounds how much memory can be pinned, which
         * mitigates the problem reasonably well.
         */
598 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
599 " RAM (e.g., virtio-mem) works, however, malicious"
600 " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate"
602 " by setting/adjusting RLIMIT_MEMLOCK.");
603 }
604 return ret;
605}
606
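/* vIOMMU notifier: propagate guest IOMMU map/unmap events to the container. */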
607static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
608{
609 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
610 VFIOContainer *container = giommu->container;
611 hwaddr iova = iotlb->iova + giommu->iommu_offset;
612 void *vaddr;
613 int ret;
614
615 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
616 iova, iova + iotlb->addr_mask);
617
618 if (iotlb->target_as != &address_space_memory) {
619 error_report("Wrong target AS \"%s\", only system memory is allowed",
620 iotlb->target_as->name ? iotlb->target_as->name : "none");
621 return;
622 }
623
624 rcu_read_lock();
625
626 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
627 bool read_only;
628
629 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
630 goto out;
631 }
632
        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the RAM block is
         * resized.
         */
639 ret = vfio_dma_map(container, iova,
640 iotlb->addr_mask + 1, vaddr,
641 read_only);
642 if (ret) {
643 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
644 "0x%"HWADDR_PRIx", %p) = %d (%m)",
645 container, iova,
646 iotlb->addr_mask + 1, vaddr, ret);
647 }
648 } else {
649 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
650 if (ret) {
651 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
652 "0x%"HWADDR_PRIx") = %d (%m)",
653 container, iova,
654 iotlb->addr_mask + 1, ret);
655 }
656 }
657out:
658 rcu_read_unlock();
659}
660
661static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
662 MemoryRegionSection *section)
663{
664 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
665 listener);
666 const hwaddr size = int128_get64(section->size);
667 const hwaddr iova = section->offset_within_address_space;
668 int ret;
669
    /* Unmap with a single call covering the whole section. */
    ret = vfio_dma_unmap(vrdl->container, iova, size, NULL);
672 if (ret) {
673 error_report("%s: vfio_dma_unmap() failed: %s", __func__,
674 strerror(-ret));
675 }
676}
677
678static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
679 MemoryRegionSection *section)
680{
681 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
682 listener);
683 const hwaddr end = section->offset_within_region +
684 int128_get64(section->size);
685 hwaddr start, next, iova;
686 void *vaddr;
687 int ret;
688
    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
693 for (start = section->offset_within_region; start < end; start = next) {
694 next = ROUND_UP(start + 1, vrdl->granularity);
695 next = MIN(next, end);
696
697 iova = start - section->offset_within_region +
698 section->offset_within_address_space;
699 vaddr = memory_region_get_ram_ptr(section->mr) + start;
700
701 ret = vfio_dma_map(vrdl->container, iova, next - start,
702 vaddr, section->readonly);
703 if (ret) {
            /* Something went terribly wrong, unmap everything we mapped. */
705 vfio_ram_discard_notify_discard(rdl, section);
706 return ret;
707 }
708 }
709 return 0;
710}
711
712static void vfio_register_ram_discard_listener(VFIOContainer *container,
713 MemoryRegionSection *section)
714{
715 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
716 VFIORamDiscardListener *vrdl;
717
    /* Ignore some corner cases not relevant in practice. */
719 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
720 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
721 TARGET_PAGE_SIZE));
722 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
723
724 vrdl = g_new0(VFIORamDiscardListener, 1);
725 vrdl->container = container;
726 vrdl->mr = section->mr;
727 vrdl->offset_within_address_space = section->offset_within_address_space;
728 vrdl->size = int128_get64(section->size);
729 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
730 section->mr);
731
732 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
733 g_assert(container->pgsizes &&
734 vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
735
736 ram_discard_listener_init(&vrdl->listener,
737 vfio_ram_discard_notify_populate,
738 vfio_ram_discard_notify_discard, true);
739 ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
740 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
741
    /*
     * Sanity-check against the number of DMA mappings supported by the
     * container: in the worst case, every granularity-sized chunk of every
     * RamDiscardManager-managed section needs its own DMA mapping, and the
     * remaining KVM memslots may need mappings as well.  Warn if we might
     * run out, e.g., suggesting a larger virtio-mem block-size.
     */
755 if (container->dma_max_mappings) {
756 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
757
758#ifdef CONFIG_KVM
759 if (kvm_enabled()) {
760 max_memslots = kvm_get_max_memslots();
761 }
762#endif
763
764 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
765 hwaddr start, end;
766
767 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
768 vrdl->granularity);
769 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
770 vrdl->granularity);
771 vrdl_mappings += (end - start) / vrdl->granularity;
772 vrdl_count++;
773 }
774
775 if (vrdl_mappings + max_memslots - vrdl_count >
776 container->dma_max_mappings) {
777 warn_report("%s: possibly running out of DMA mappings. E.g., try"
                    " increasing the 'block-size' of virtio-mem devices."
779 " Maximum possible DMA mappings: %d, Maximum possible"
780 " memslots: %d", __func__, container->dma_max_mappings,
781 max_memslots);
782 }
783 }
784}
785
786static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
787 MemoryRegionSection *section)
788{
789 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
790 VFIORamDiscardListener *vrdl = NULL;
791
792 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
793 if (vrdl->mr == section->mr &&
794 vrdl->offset_within_address_space ==
795 section->offset_within_address_space) {
796 break;
797 }
798 }
799
800 if (!vrdl) {
801 hw_error("vfio: Trying to unregister missing RAM discard listener");
802 }
803
804 ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
805 QLIST_REMOVE(vrdl, next);
806 g_free(vrdl);
807}
808
809static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
810{
811 MemoryRegion *mr = section->mr;
812
813 if (!TPM_IS_CRB(mr->owner)) {
814 return false;
815 }
816
817
818 trace_vfio_known_safe_misalignment(memory_region_name(mr),
819 section->offset_within_address_space,
820 section->offset_within_region,
821 qemu_real_host_page_size());
822 return true;
823}
824
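/*
 * MemoryListener callback: establish DMA mappings (or IOMMU notifiers /
 * RAM discard listeners) for a newly added memory region section.
 */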
825static void vfio_listener_region_add(MemoryListener *listener,
826 MemoryRegionSection *section)
827{
828 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
829 hwaddr iova, end;
830 Int128 llend, llsize;
831 void *vaddr;
832 int ret;
833 VFIOHostDMAWindow *hostwin;
834 bool hostwin_found;
835 Error *err = NULL;
836
837 if (vfio_listener_skipped_section(section)) {
838 trace_vfio_listener_region_add_skip(
839 section->offset_within_address_space,
840 section->offset_within_address_space +
841 int128_get64(int128_sub(section->size, int128_one())));
842 return;
843 }
844
845 if (unlikely((section->offset_within_address_space &
846 ~qemu_real_host_page_mask()) !=
847 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
848 if (!vfio_known_safe_misalignment(section)) {
849 error_report("%s received unaligned region %s iova=0x%"PRIx64
850 " offset_within_region=0x%"PRIx64
851 " qemu_real_host_page_size=0x%"PRIxPTR,
852 __func__, memory_region_name(section->mr),
853 section->offset_within_address_space,
854 section->offset_within_region,
855 qemu_real_host_page_size());
856 }
857 return;
858 }
859
860 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
861 llend = int128_make64(section->offset_within_address_space);
862 llend = int128_add(llend, section->size);
863 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
864
865 if (int128_ge(int128_make64(iova), llend)) {
866 if (memory_region_is_ram_device(section->mr)) {
867 trace_vfio_listener_region_add_no_dma_map(
868 memory_region_name(section->mr),
869 section->offset_within_address_space,
870 int128_getlo(section->size),
871 qemu_real_host_page_size());
872 }
873 return;
874 }
875 end = int128_get64(int128_sub(llend, int128_one()));
876
877 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
878 hwaddr pgsize = 0;
879
880
881 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
882 if (ranges_overlap(hostwin->min_iova,
883 hostwin->max_iova - hostwin->min_iova + 1,
884 section->offset_within_address_space,
885 int128_get64(section->size))) {
886 error_setg(&err,
887 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
888 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
889 section->offset_within_address_space,
890 section->offset_within_address_space +
891 int128_get64(section->size) - 1,
892 hostwin->min_iova, hostwin->max_iova);
893 goto fail;
894 }
895 }
896
897 ret = vfio_spapr_create_window(container, section, &pgsize);
898 if (ret) {
899 error_setg_errno(&err, -ret, "Failed to create SPAPR window");
900 goto fail;
901 }
902
903 vfio_host_win_add(container, section->offset_within_address_space,
904 section->offset_within_address_space +
905 int128_get64(section->size) - 1, pgsize);
906#ifdef CONFIG_KVM
907 if (kvm_enabled()) {
908 VFIOGroup *group;
909 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
910 struct kvm_vfio_spapr_tce param;
911 struct kvm_device_attr attr = {
912 .group = KVM_DEV_VFIO_GROUP,
913 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
                .addr = (uint64_t)(unsigned long)&param,
915 };
916
            if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                              &param.tablefd)) {
919 QLIST_FOREACH(group, &container->group_list, container_next) {
920 param.groupfd = group->fd;
921 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
922 error_report("vfio: failed to setup fd %d "
923 "for a group with fd %d: %s",
924 param.tablefd, param.groupfd,
925 strerror(errno));
926 return;
927 }
928 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
929 }
930 }
931 }
932#endif
933 }
934
935 hostwin_found = false;
936 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
937 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
938 hostwin_found = true;
939 break;
940 }
941 }
942
943 if (!hostwin_found) {
944 error_setg(&err, "Container %p can't map guest IOVA region"
945 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
946 goto fail;
947 }
948
949 memory_region_ref(section->mr);
950
951 if (memory_region_is_iommu(section->mr)) {
952 VFIOGuestIOMMU *giommu;
953 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
954 int iommu_idx;
955
956 trace_vfio_listener_region_add_iommu(iova, end);
957
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
963 giommu = g_malloc0(sizeof(*giommu));
964 giommu->iommu_mr = iommu_mr;
965 giommu->iommu_offset = section->offset_within_address_space -
966 section->offset_within_region;
967 giommu->container = container;
968 llend = int128_add(int128_make64(section->offset_within_region),
969 section->size);
970 llend = int128_sub(llend, int128_one());
971 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
972 MEMTXATTRS_UNSPECIFIED);
973 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
974 IOMMU_NOTIFIER_IOTLB_EVENTS,
975 section->offset_within_region,
976 int128_get64(llend),
977 iommu_idx);
978
979 ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
980 container->pgsizes,
981 &err);
982 if (ret) {
983 g_free(giommu);
984 goto fail;
985 }
986
987 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
988 &err);
989 if (ret) {
990 g_free(giommu);
991 goto fail;
992 }
993 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
994 memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
995
996 return;
997 }
998
    /*
     * For sections with a RamDiscardManager, map only the currently populated
     * parts.  The RamDiscardListener registered here maps/unmaps parts as they
     * become populated/discarded, so discarded ranges never get pinned.
     */
1006 if (memory_region_has_ram_discard_manager(section->mr)) {
1007 vfio_register_ram_discard_listener(container, section);
1008 return;
1009 }
1010
1011 vaddr = memory_region_get_ram_ptr(section->mr) +
1012 section->offset_within_region +
1013 (iova - section->offset_within_address_space);
1014
1015 trace_vfio_listener_region_add_ram(iova, end, vaddr);
1016
1017 llsize = int128_sub(llend, int128_make64(iova));
1018
1019 if (memory_region_is_ram_device(section->mr)) {
1020 hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1021
1022 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
1023 trace_vfio_listener_region_add_no_dma_map(
1024 memory_region_name(section->mr),
1025 section->offset_within_address_space,
1026 int128_getlo(section->size),
1027 pgmask + 1);
1028 return;
1029 }
1030 }
1031
1032 ret = vfio_dma_map(container, iova, int128_get64(llsize),
1033 vaddr, section->readonly);
1034 if (ret) {
1035 error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
1036 "0x%"HWADDR_PRIx", %p) = %d (%m)",
1037 container, iova, int128_get64(llsize), vaddr, ret);
1038 if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
1040 error_report_err(err);
1041 return;
1042 }
1043 goto fail;
1044 }
1045
1046 return;
1047
1048fail:
1049 if (memory_region_is_ram_device(section->mr)) {
1050 error_report("failed to vfio_dma_map. pci p2p may not work");
1051 return;
1052 }
1053
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail.  Runtime, there's not much we can do other
     * than throw a hardware error.
     */
1058 if (!container->initialized) {
1059 if (!container->error) {
1060 error_propagate_prepend(&container->error, err,
1061 "Region %s: ",
1062 memory_region_name(section->mr));
1063 } else {
1064 error_free(err);
1065 }
1066 } else {
1067 error_report_err(err);
1068 hw_error("vfio: DMA mapping failed, unable to continue");
1069 }
1070}
1071
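/*
 * MemoryListener callback: tear down whatever vfio_listener_region_add()
 * set up for this section.
 */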
1072static void vfio_listener_region_del(MemoryListener *listener,
1073 MemoryRegionSection *section)
1074{
1075 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1076 hwaddr iova, end;
1077 Int128 llend, llsize;
1078 int ret;
1079 bool try_unmap = true;
1080
1081 if (vfio_listener_skipped_section(section)) {
1082 trace_vfio_listener_region_del_skip(
1083 section->offset_within_address_space,
1084 section->offset_within_address_space +
1085 int128_get64(int128_sub(section->size, int128_one())));
1086 return;
1087 }
1088
1089 if (unlikely((section->offset_within_address_space &
1090 ~qemu_real_host_page_mask()) !=
1091 (section->offset_within_region & ~qemu_real_host_page_mask()))) {
1092 if (!vfio_known_safe_misalignment(section)) {
1093 error_report("%s received unaligned region %s iova=0x%"PRIx64
1094 " offset_within_region=0x%"PRIx64
1095 " qemu_real_host_page_size=0x%"PRIxPTR,
1096 __func__, memory_region_name(section->mr),
1097 section->offset_within_address_space,
1098 section->offset_within_region,
1099 qemu_real_host_page_size());
1100 }
1101 return;
1102 }
1103
1104 if (memory_region_is_iommu(section->mr)) {
1105 VFIOGuestIOMMU *giommu;
1106
1107 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1108 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1109 giommu->n.start == section->offset_within_region) {
1110 memory_region_unregister_iommu_notifier(section->mr,
1111 &giommu->n);
1112 QLIST_REMOVE(giommu, giommu_next);
1113 g_free(giommu);
1114 break;
1115 }
1116 }
1117
        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
1125 }
1126
1127 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
1128 llend = int128_make64(section->offset_within_address_space);
1129 llend = int128_add(llend, section->size);
1130 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
1131
1132 if (int128_ge(int128_make64(iova), llend)) {
1133 return;
1134 }
1135 end = int128_get64(int128_sub(llend, int128_one()));
1136
1137 llsize = int128_sub(llend, int128_make64(iova));
1138
1139 trace_vfio_listener_region_del(iova, end);
1140
1141 if (memory_region_is_ram_device(section->mr)) {
1142 hwaddr pgmask;
1143 VFIOHostDMAWindow *hostwin;
1144 bool hostwin_found = false;
1145
1146 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
1147 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
1148 hostwin_found = true;
1149 break;
1150 }
1151 }
1152 assert(hostwin_found);
1153
1154 pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1155 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
1156 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1157 vfio_unregister_ram_discard_listener(container, section);
        /* Unregistering the listener already unmapped all populated parts. */
1159 try_unmap = false;
1160 }
1161
1162 if (try_unmap) {
1163 if (int128_eq(llsize, int128_2_64())) {
            /* The unmap ioctl doesn't accept a full 64-bit span. */
1165 llsize = int128_rshift(llsize, 1);
1166 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1167 if (ret) {
1168 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1169 "0x%"HWADDR_PRIx") = %d (%m)",
1170 container, iova, int128_get64(llsize), ret);
1171 }
1172 iova += int128_get64(llsize);
1173 }
1174 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1175 if (ret) {
1176 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1177 "0x%"HWADDR_PRIx") = %d (%m)",
1178 container, iova, int128_get64(llsize), ret);
1179 }
1180 }
1181
1182 memory_region_unref(section->mr);
1183
1184 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1185 vfio_spapr_remove_window(container,
1186 section->offset_within_address_space);
1187 if (vfio_host_win_del(container,
1188 section->offset_within_address_space,
1189 section->offset_within_address_space +
1190 int128_get64(section->size) - 1) < 0) {
1191 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
1192 __func__, section->offset_within_address_space);
1193 }
1194 }
1195}
1196
1197static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
1198{
1199 int ret;
1200 struct vfio_iommu_type1_dirty_bitmap dirty = {
1201 .argsz = sizeof(dirty),
1202 };
1203
1204 if (start) {
1205 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
1206 } else {
1207 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
1208 }
1209
1210 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
1211 if (ret) {
1212 error_report("Failed to set dirty tracking flag 0x%x errno: %d",
1213 dirty.flags, errno);
1214 }
1215}
1216
1217static void vfio_listener_log_global_start(MemoryListener *listener)
1218{
1219 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1220
1221 vfio_set_dirty_page_tracking(container, true);
1222}
1223
1224static void vfio_listener_log_global_stop(MemoryListener *listener)
1225{
1226 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1227
1228 vfio_set_dirty_page_tracking(container, false);
1229}
1230
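/*
 * Query the container's dirty bitmap for [iova, iova + size) and mark the
 * corresponding guest RAM pages dirty for migration.
 */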
1231static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
1232 uint64_t size, ram_addr_t ram_addr)
1233{
1234 struct vfio_iommu_type1_dirty_bitmap *dbitmap;
1235 struct vfio_iommu_type1_dirty_bitmap_get *range;
1236 uint64_t pages;
1237 int ret;
1238
1239 dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
1240
1241 dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
1242 dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
1243 range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
1244 range->iova = iova;
1245 range->size = size;
1246
    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
     * to qemu_real_host_page_size.
     */
1252 range->bitmap.pgsize = qemu_real_host_page_size();
1253
1254 pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size();
1255 range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
1256 BITS_PER_BYTE;
1257 range->bitmap.data = g_try_malloc0(range->bitmap.size);
1258 if (!range->bitmap.data) {
1259 ret = -ENOMEM;
1260 goto err_out;
1261 }
1262
1263 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
1264 if (ret) {
1265 error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
1266 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
1267 (uint64_t)range->size, errno);
1268 goto err_out;
1269 }
1270
1271 cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
1272 ram_addr, pages);
1273
1274 trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
1275 range->bitmap.size, ram_addr);
1276err_out:
1277 g_free(range->bitmap.data);
1278 g_free(dbitmap);
1279
1280 return ret;
1281}
1282
1283typedef struct {
1284 IOMMUNotifier n;
1285 VFIOGuestIOMMU *giommu;
1286} vfio_giommu_dirty_notifier;
1287
1288static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
1289{
1290 vfio_giommu_dirty_notifier *gdn = container_of(n,
1291 vfio_giommu_dirty_notifier, n);
1292 VFIOGuestIOMMU *giommu = gdn->giommu;
1293 VFIOContainer *container = giommu->container;
1294 hwaddr iova = iotlb->iova + giommu->iommu_offset;
1295 ram_addr_t translated_addr;
1296
1297 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
1298
1299 if (iotlb->target_as != &address_space_memory) {
1300 error_report("Wrong target AS \"%s\", only system memory is allowed",
1301 iotlb->target_as->name ? iotlb->target_as->name : "none");
1302 return;
1303 }
1304
1305 rcu_read_lock();
1306 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
1307 int ret;
1308
1309 ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
1310 translated_addr);
1311 if (ret) {
1312 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
1313 "0x%"HWADDR_PRIx") = %d (%m)",
1314 container, iova,
1315 iotlb->addr_mask + 1, ret);
1316 }
1317 }
1318 rcu_read_unlock();
1319}
1320
1321static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
1322 void *opaque)
1323{
1324 const hwaddr size = int128_get64(section->size);
1325 const hwaddr iova = section->offset_within_address_space;
1326 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1327 section->offset_within_region;
1328 VFIORamDiscardListener *vrdl = opaque;
1329
    /*
     * Sync the whole mapped region (spanning multiple individual mappings)
     * in one go.
     */
1334 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
1335}
1336
1337static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
1338 MemoryRegionSection *section)
1339{
1340 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1341 VFIORamDiscardListener *vrdl = NULL;
1342
1343 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
1344 if (vrdl->mr == section->mr &&
1345 vrdl->offset_within_address_space ==
1346 section->offset_within_address_space) {
1347 break;
1348 }
1349 }
1350
1351 if (!vrdl) {
1352 hw_error("vfio: Trying to sync missing RAM discard listener");
1353 }
1354
    /*
     * We only want/can synchronize the bitmap for actually mapped parts -
     * which correspond to populated parts. Replay all populated parts.
     */
    return ram_discard_manager_replay_populated(rdm, section,
                                                vfio_ram_discard_get_dirty_bitmap,
                                                vrdl);
1362}
1363
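/*
 * Sync the dirty bitmap for one section, handling vIOMMU-backed and
 * RamDiscardManager-backed sections specially.
 */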
1364static int vfio_sync_dirty_bitmap(VFIOContainer *container,
1365 MemoryRegionSection *section)
1366{
1367 ram_addr_t ram_addr;
1368
1369 if (memory_region_is_iommu(section->mr)) {
1370 VFIOGuestIOMMU *giommu;
1371
1372 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1373 if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
1374 giommu->n.start == section->offset_within_region) {
1375 Int128 llend;
1376 vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
1377 int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
1378 MEMTXATTRS_UNSPECIFIED);
1379
1380 llend = int128_add(int128_make64(section->offset_within_region),
1381 section->size);
1382 llend = int128_sub(llend, int128_one());
1383
1384 iommu_notifier_init(&gdn.n,
1385 vfio_iommu_map_dirty_notify,
1386 IOMMU_NOTIFIER_MAP,
1387 section->offset_within_region,
1388 int128_get64(llend),
1389 idx);
1390 memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
1391 break;
1392 }
1393 }
1394 return 0;
1395 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1396 return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
1397 }
1398
1399 ram_addr = memory_region_get_ram_addr(section->mr) +
1400 section->offset_within_region;
1401
1402 return vfio_get_dirty_bitmap(container,
1403 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
1404 int128_get64(section->size), ram_addr);
1405}
1406
1407static void vfio_listener_log_sync(MemoryListener *listener,
1408 MemoryRegionSection *section)
1409{
1410 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1411
1412 if (vfio_listener_skipped_section(section) ||
1413 !container->dirty_pages_supported) {
1414 return;
1415 }
1416
1417 if (vfio_devices_all_dirty_tracking(container)) {
1418 vfio_sync_dirty_bitmap(container, section);
1419 }
1420}
1421
1422static const MemoryListener vfio_memory_listener = {
1423 .name = "vfio",
1424 .region_add = vfio_listener_region_add,
1425 .region_del = vfio_listener_region_del,
1426 .log_global_start = vfio_listener_log_global_start,
1427 .log_global_stop = vfio_listener_log_global_stop,
1428 .log_sync = vfio_listener_log_sync,
1429};
1430
1431static void vfio_listener_release(VFIOContainer *container)
1432{
1433 memory_listener_unregister(&container->listener);
1434 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1435 memory_listener_unregister(&container->prereg_listener);
1436 }
1437}
1438
1439static struct vfio_info_cap_header *
1440vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
1441{
1442 struct vfio_info_cap_header *hdr;
1443
1444 for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1445 if (hdr->id == id) {
1446 return hdr;
1447 }
1448 }
1449
1450 return NULL;
1451}
1452
1453struct vfio_info_cap_header *
1454vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
1455{
1456 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
1457 return NULL;
1458 }
1459
1460 return vfio_get_cap((void *)info, info->cap_offset, id);
1461}
1462
1463static struct vfio_info_cap_header *
1464vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1465{
1466 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1467 return NULL;
1468 }
1469
1470 return vfio_get_cap((void *)info, info->cap_offset, id);
1471}
1472
1473struct vfio_info_cap_header *
1474vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
1475{
1476 if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
1477 return NULL;
1478 }
1479
1480 return vfio_get_cap((void *)info, info->cap_offset, id);
1481}
1482
1483bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
1484 unsigned int *avail)
1485{
1486 struct vfio_info_cap_header *hdr;
1487 struct vfio_iommu_type1_info_dma_avail *cap;
1488
    /* If the capability cannot be found, assume no DMA limiting */
1490 hdr = vfio_get_iommu_type1_info_cap(info,
1491 VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
1492 if (hdr == NULL) {
1493 return false;
1494 }
1495
1496 if (avail != NULL) {
1497 cap = (void *) hdr;
1498 *avail = cap->avail;
1499 }
1500
1501 return true;
1502}
1503
1504static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
1505 struct vfio_region_info *info)
1506{
1507 struct vfio_info_cap_header *hdr;
1508 struct vfio_region_info_cap_sparse_mmap *sparse;
1509 int i, j;
1510
1511 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
1512 if (!hdr) {
1513 return -ENODEV;
1514 }
1515
1516 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
1517
1518 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
1519 region->nr, sparse->nr_areas);
1520
1521 region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
1522
1523 for (i = 0, j = 0; i < sparse->nr_areas; i++) {
1524 if (sparse->areas[i].size) {
1525 trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
1526 sparse->areas[i].offset +
1527 sparse->areas[i].size - 1);
1528 region->mmaps[j].offset = sparse->areas[i].offset;
1529 region->mmaps[j].size = sparse->areas[i].size;
1530 j++;
1531 }
1532 }
1533
1534 region->nr_mmaps = j;
1535 region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
1536
1537 return 0;
1538}
1539
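/*
 * Query a device region's info and set up its MemoryRegion.  Sparse mmap
 * areas are honoured when advertised; otherwise a single mmap covering the
 * whole region is recorded (the actual mmap happens in vfio_region_mmap()).
 */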
1540int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
1541 int index, const char *name)
1542{
1543 struct vfio_region_info *info;
1544 int ret;
1545
1546 ret = vfio_get_region_info(vbasedev, index, &info);
1547 if (ret) {
1548 return ret;
1549 }
1550
1551 region->vbasedev = vbasedev;
1552 region->flags = info->flags;
1553 region->size = info->size;
1554 region->fd_offset = info->offset;
1555 region->nr = index;
1556
1557 if (region->size) {
1558 region->mem = g_new0(MemoryRegion, 1);
1559 memory_region_init_io(region->mem, obj, &vfio_region_ops,
1560 region, name, region->size);
1561
1562 if (!vbasedev->no_mmap &&
1563 region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
1564
1565 ret = vfio_setup_region_sparse_mmaps(region, info);
1566
1567 if (ret) {
1568 region->nr_mmaps = 1;
1569 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
1570 region->mmaps[0].offset = 0;
1571 region->mmaps[0].size = region->size;
1572 }
1573 }
1574 }
1575
1576 g_free(info);
1577
1578 trace_vfio_region_setup(vbasedev->name, index, name,
1579 region->flags, region->fd_offset, region->size);
1580 return 0;
1581}
1582
1583static void vfio_subregion_unmap(VFIORegion *region, int index)
1584{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
1592 region->mmaps[index].mmap = NULL;
1593}
1594
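/*
 * mmap the mmap-able areas of a region and expose them as RAM-device
 * subregions of the region's MemoryRegion.
 */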
1595int vfio_region_mmap(VFIORegion *region)
1596{
1597 int i, prot = 0;
1598 char *name;
1599
1600 if (!region->mem) {
1601 return 0;
1602 }
1603
1604 prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
1605 prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
1606
1607 for (i = 0; i < region->nr_mmaps; i++) {
1608 region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
1609 MAP_SHARED, region->vbasedev->fd,
1610 region->fd_offset +
1611 region->mmaps[i].offset);
1612 if (region->mmaps[i].mmap == MAP_FAILED) {
1613 int ret = -errno;
1614
1615 trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
1616 region->fd_offset +
1617 region->mmaps[i].offset,
1618 region->fd_offset +
1619 region->mmaps[i].offset +
1620 region->mmaps[i].size - 1, ret);
1621
1622 region->mmaps[i].mmap = NULL;
1623
1624 for (i--; i >= 0; i--) {
1625 vfio_subregion_unmap(region, i);
1626 }
1627
1628 return ret;
1629 }
1630
1631 name = g_strdup_printf("%s mmaps[%d]",
1632 memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
1645 }
1646
1647 return 0;
1648}
1649
1650void vfio_region_unmap(VFIORegion *region)
1651{
1652 int i;
1653
1654 if (!region->mem) {
1655 return;
1656 }
1657
1658 for (i = 0; i < region->nr_mmaps; i++) {
1659 if (region->mmaps[i].mmap) {
1660 vfio_subregion_unmap(region, i);
1661 }
1662 }
1663}
1664
1665void vfio_region_exit(VFIORegion *region)
1666{
1667 int i;
1668
1669 if (!region->mem) {
1670 return;
1671 }
1672
1673 for (i = 0; i < region->nr_mmaps; i++) {
1674 if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
1676 }
1677 }
1678
1679 trace_vfio_region_exit(region->vbasedev->name, region->nr);
1680}
1681
1682void vfio_region_finalize(VFIORegion *region)
1683{
1684 int i;
1685
1686 if (!region->mem) {
1687 return;
1688 }
1689
1690 for (i = 0; i < region->nr_mmaps; i++) {
1691 if (region->mmaps[i].mmap) {
1692 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
1694 }
1695 }
1696
1697 object_unparent(OBJECT(region->mem));
1698
1699 g_free(region->mem);
1700 g_free(region->mmaps);
1701
1702 trace_vfio_region_finalize(region->vbasedev->name, region->nr);
1703
1704 region->mem = NULL;
1705 region->mmaps = NULL;
1706 region->nr_mmaps = 0;
1707 region->size = 0;
1708 region->flags = 0;
1709 region->nr = 0;
1710}
1711
1712void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
1713{
1714 int i;
1715
1716 if (!region->mem) {
1717 return;
1718 }
1719
1720 for (i = 0; i < region->nr_mmaps; i++) {
1721 if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
1723 }
1724 }
1725
1726 trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
1727 enabled);
1728}
1729
1730void vfio_reset_handler(void *opaque)
1731{
1732 VFIOGroup *group;
1733 VFIODevice *vbasedev;
1734
1735 QLIST_FOREACH(group, &vfio_group_list, next) {
1736 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1737 if (vbasedev->dev->realized) {
1738 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1739 }
1740 }
1741 }
1742
1743 QLIST_FOREACH(group, &vfio_group_list, next) {
1744 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1745 if (vbasedev->dev->realized && vbasedev->needs_reset) {
1746 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1747 }
1748 }
1749 }
1750}
1751
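/*
 * Notify KVM about the VFIO group, creating the per-VM KVM-VFIO pseudo
 * device on first use (no-op without CONFIG_KVM or when KVM is disabled).
 */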
1752static void vfio_kvm_device_add_group(VFIOGroup *group)
1753{
1754#ifdef CONFIG_KVM
1755 struct kvm_device_attr attr = {
1756 .group = KVM_DEV_VFIO_GROUP,
1757 .attr = KVM_DEV_VFIO_GROUP_ADD,
1758 .addr = (uint64_t)(unsigned long)&group->fd,
1759 };
1760
1761 if (!kvm_enabled()) {
1762 return;
1763 }
1764
1765 if (vfio_kvm_device_fd < 0) {
1766 struct kvm_create_device cd = {
1767 .type = KVM_DEV_TYPE_VFIO,
1768 };
1769
1770 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
1771 error_report("Failed to create KVM VFIO device: %m");
1772 return;
1773 }
1774
1775 vfio_kvm_device_fd = cd.fd;
1776 }
1777
1778 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1779 error_report("Failed to add group %d to KVM VFIO device: %m",
1780 group->groupid);
1781 }
1782#endif
1783}
1784
1785static void vfio_kvm_device_del_group(VFIOGroup *group)
1786{
1787#ifdef CONFIG_KVM
1788 struct kvm_device_attr attr = {
1789 .group = KVM_DEV_VFIO_GROUP,
1790 .attr = KVM_DEV_VFIO_GROUP_DEL,
1791 .addr = (uint64_t)(unsigned long)&group->fd,
1792 };
1793
1794 if (vfio_kvm_device_fd < 0) {
1795 return;
1796 }
1797
1798 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1799 error_report("Failed to remove group %d from KVM VFIO device: %m",
1800 group->groupid);
1801 }
1802#endif
1803}
1804
1805static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1806{
1807 VFIOAddressSpace *space;
1808
1809 QLIST_FOREACH(space, &vfio_address_spaces, list) {
1810 if (space->as == as) {
1811 return space;
1812 }
1813 }
1814
    /* No suitable VFIOAddressSpace, create a new one */
1816 space = g_malloc0(sizeof(*space));
1817 space->as = as;
1818 QLIST_INIT(&space->containers);
1819
1820 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1821
1822 return space;
1823}
1824
1825static void vfio_put_address_space(VFIOAddressSpace *space)
1826{
1827 if (QLIST_EMPTY(&space->containers)) {
1828 QLIST_REMOVE(space, list);
1829 g_free(space);
1830 }
1831}
1832
/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
1836static int vfio_get_iommu_type(VFIOContainer *container,
1837 Error **errp)
1838{
1839 int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
1840 VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
1841 int i;
1842
1843 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
1844 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
1845 return iommu_types[i];
1846 }
1847 }
1848 error_setg(errp, "No available IOMMU models");
1849 return -EINVAL;
1850}
1851
1852static int vfio_init_container(VFIOContainer *container, int group_fd,
1853 Error **errp)
1854{
1855 int iommu_type, ret;
1856
1857 iommu_type = vfio_get_iommu_type(container, errp);
1858 if (iommu_type < 0) {
1859 return iommu_type;
1860 }
1861
1862 ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
1863 if (ret) {
1864 error_setg_errno(errp, errno, "Failed to set group container");
1865 return -errno;
1866 }
1867
1868 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
1869 if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, despite the IOMMU subdriver always advertising v1 and
             * v2, the running platform may not support v2 and there is no
             * way to guess it until an IOMMU group gets added to the
             * container.  So in case it fails with v2, try v1 as a fallback.
             */
1876 iommu_type = VFIO_SPAPR_TCE_IOMMU;
1877 continue;
1878 }
1879 error_setg_errno(errp, errno, "Failed to set iommu for container");
1880 return -errno;
1881 }
1882
1883 container->iommu_type = iommu_type;
1884 return 0;
1885}
1886
1887static int vfio_get_iommu_info(VFIOContainer *container,
1888 struct vfio_iommu_type1_info **info)
1889{
1890
1891 size_t argsz = sizeof(struct vfio_iommu_type1_info);
1892
1893 *info = g_new0(struct vfio_iommu_type1_info, 1);
1894again:
1895 (*info)->argsz = argsz;
1896
1897 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
1898 g_free(*info);
1899 *info = NULL;
1900 return -errno;
1901 }
1902
1903 if (((*info)->argsz > argsz)) {
1904 argsz = (*info)->argsz;
1905 *info = g_realloc(*info, argsz);
1906 goto again;
1907 }
1908
1909 return 0;
1910}
1911
1912static struct vfio_info_cap_header *
1913vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1914{
1915 struct vfio_info_cap_header *hdr;
1916 void *ptr = info;
1917
1918 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1919 return NULL;
1920 }
1921
1922 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1923 if (hdr->id == id) {
1924 return hdr;
1925 }
1926 }
1927
1928 return NULL;
1929}
1930
1931static void vfio_get_iommu_info_migration(VFIOContainer *container,
1932 struct vfio_iommu_type1_info *info)
1933{
1934 struct vfio_info_cap_header *hdr;
1935 struct vfio_iommu_type1_info_cap_migration *cap_mig;
1936
1937 hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
1938 if (!hdr) {
1939 return;
1940 }
1941
1942 cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
1943 header);
1944
    /*
     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
     * qemu_real_host_page_size to mark those dirty, so only advertise dirty
     * page tracking when that page size is covered by pgsize_bitmap.
     */
1949 if (cap_mig->pgsize_bitmap & qemu_real_host_page_size()) {
1950 container->dirty_pages_supported = true;
1951 container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
1952 container->dirty_pgsizes = cap_mig->pgsize_bitmap;
1953 }
1954}
1955
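/*
 * Attach a group to a container in the given address space, reusing a
 * compatible existing container when possible, otherwise creating and
 * initializing a new one and registering its memory listener.
 */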
1956static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
1957 Error **errp)
1958{
1959 VFIOContainer *container;
1960 int ret, fd;
1961 VFIOAddressSpace *space;
1962
1963 space = vfio_get_address_space(as);
1964
    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API, so a mapping established prior to the
     * madvise is still referenced by the IOMMU and keeps the physical page
     * pinned.  Therefore, discarding of RAM is disabled (or restricted to
     * coordinated discards, see vfio_ram_block_discard_disable()) for as
     * long as any group is attached to a container.
     *
     * If this group is compatible with an existing container in this address
     * space, just bump that container's discard-disable reference and reuse
     * it instead of creating a new one.
     */
1996 QLIST_FOREACH(container, &space->containers, next) {
1997 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
1998 ret = vfio_ram_block_discard_disable(container, true);
1999 if (ret) {
2000 error_setg_errno(errp, -ret,
2001 "Cannot set discarding of RAM broken");
2002 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2003 &container->fd)) {
2004 error_report("vfio: error disconnecting group %d from"
2005 " container", group->groupid);
2006 }
2007 return ret;
2008 }
2009 group->container = container;
2010 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2011 vfio_kvm_device_add_group(group);
2012 return 0;
2013 }
2014 }
2015
2016 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
2017 if (fd < 0) {
2018 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
2019 ret = -errno;
2020 goto put_space_exit;
2021 }
2022
2023 ret = ioctl(fd, VFIO_GET_API_VERSION);
2024 if (ret != VFIO_API_VERSION) {
2025 error_setg(errp, "supported vfio version: %d, "
2026 "reported version: %d", VFIO_API_VERSION, ret);
2027 ret = -EINVAL;
2028 goto close_fd_exit;
2029 }
2030
2031 container = g_malloc0(sizeof(*container));
2032 container->space = space;
2033 container->fd = fd;
2034 container->error = NULL;
2035 container->dirty_pages_supported = false;
2036 container->dma_max_mappings = 0;
2037 QLIST_INIT(&container->giommu_list);
2038 QLIST_INIT(&container->hostwin_list);
2039 QLIST_INIT(&container->vrdl_list);
2040
2041 ret = vfio_init_container(container, group->fd, errp);
2042 if (ret) {
2043 goto free_container_exit;
2044 }
2045
2046 ret = vfio_ram_block_discard_disable(container, true);
2047 if (ret) {
2048 error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2049 goto free_container_exit;
2050 }
2051
2052 switch (container->iommu_type) {
2053 case VFIO_TYPE1v2_IOMMU:
2054 case VFIO_TYPE1_IOMMU:
2055 {
2056 struct vfio_iommu_type1_info *info;
2057
2058 ret = vfio_get_iommu_info(container, &info);
2059 if (ret) {
2060 error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
2061 goto enable_discards_exit;
2062 }
2063
2064 if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
2065 container->pgsizes = info->iova_pgsizes;
2066 } else {
2067 container->pgsizes = qemu_real_host_page_size();
2068 }
2069
2070 if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
2071 container->dma_max_mappings = 65535;
2072 }
2073 vfio_get_iommu_info_migration(container, info);
2074 g_free(info);
2075
        /*
         * The kernel can report valid host IOVA ranges via
         * VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE; for now assume a single
         * usable window covering the whole 64-bit space.
         */
2081 vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
2082
2083 break;
2084 }
2085 case VFIO_SPAPR_TCE_v2_IOMMU:
2086 case VFIO_SPAPR_TCE_IOMMU:
2087 {
2088 struct vfio_iommu_spapr_tce_info info;
2089 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
2090
        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when container fd is closed so we do not call it explicitly
         * in this file.
         */
2096 if (!v2) {
2097 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2098 if (ret) {
2099 error_setg_errno(errp, errno, "failed to enable container");
2100 ret = -errno;
2101 goto enable_discards_exit;
2102 }
2103 } else {
2104 container->prereg_listener = vfio_prereg_listener;
2105
2106 memory_listener_register(&container->prereg_listener,
2107 &address_space_memory);
2108 if (container->error) {
2109 memory_listener_unregister(&container->prereg_listener);
2110 ret = -1;
2111 error_propagate_prepend(errp, container->error,
2112 "RAM memory listener initialization failed: ");
2113 goto enable_discards_exit;
2114 }
2115 }
2116
2117 info.argsz = sizeof(info);
2118 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2119 if (ret) {
2120 error_setg_errno(errp, errno,
2121 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
2122 ret = -errno;
2123 if (v2) {
2124 memory_listener_unregister(&container->prereg_listener);
2125 }
2126 goto enable_discards_exit;
2127 }
2128
2129 if (v2) {
2130 container->pgsizes = info.ddw.pgsizes;
2131
            /*
             * There is a default window in just created container.
             * To make region_add/del simpler, we better remove this
             * window now and let those iommu_listener callbacks
             * create/remove them when needed.
             */
2137 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2138 if (ret) {
2139 error_setg_errno(errp, -ret,
2140 "failed to remove existing window");
2141 goto enable_discards_exit;
2142 }
2143 } else {
            /* The default table uses 4K pages */
2145 container->pgsizes = 0x1000;
2146 vfio_host_win_add(container, info.dma32_window_start,
2147 info.dma32_window_start +
2148 info.dma32_window_size - 1,
2149 0x1000);
2150 }
2151 }
2152 }
2153
2154 vfio_kvm_device_add_group(group);
2155
2156 QLIST_INIT(&container->group_list);
2157 QLIST_INSERT_HEAD(&space->containers, container, next);
2158
2159 group->container = container;
2160 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2161
2162 container->listener = vfio_memory_listener;
2163
2164 memory_listener_register(&container->listener, container->space->as);
2165
2166 if (container->error) {
2167 ret = -1;
2168 error_propagate_prepend(errp, container->error,
2169 "memory listener initialization failed: ");
2170 goto listener_release_exit;
2171 }
2172
2173 container->initialized = true;
2174
2175 return 0;
2176listener_release_exit:
2177 QLIST_REMOVE(group, container_next);
2178 QLIST_REMOVE(container, next);
2179 vfio_kvm_device_del_group(group);
2180 vfio_listener_release(container);
2181
2182enable_discards_exit:
2183 vfio_ram_block_discard_disable(container, false);
2184
2185free_container_exit:
2186 g_free(container);
2187
2188close_fd_exit:
2189 close(fd);
2190
2191put_space_exit:
2192 vfio_put_address_space(space);
2193
2194 return ret;
2195}
2196
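/*
 * Detach a group from its container; the container itself is destroyed once
 * the last group has been removed.
 */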
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Release the memory listener before unsetting the container, since
     * unsetting the last group may tear down the backend container in
     * the kernel.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        vfio_listener_release(container);
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = container->space;
        VFIOGuestIOMMU *giommu, *tmp;
        VFIOHostDMAWindow *hostwin, *next;

        QLIST_REMOVE(container, next);

        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
            memory_region_unregister_iommu_notifier(
                    MEMORY_REGION(giommu->iommu_mr), &giommu->n);
            QLIST_REMOVE(giommu, giommu_next);
            g_free(giommu);
        }

        QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
                           next) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
        }

        trace_vfio_disconnect_container(container->fd);
        close(container->fd);
        g_free(container);

        vfio_put_address_space(space);
    }
}

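/*
 * Return the VFIOGroup for @groupid, opening /dev/vfio/<groupid> and
 * connecting it to a container in @as on first use.  A group that is
 * already in use from a different address space is an error.
 */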
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open_old(path, O_RDWR);
    if (group->fd < 0) {
        error_setg_errno(errp, errno, "failed to open %s", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

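/*
 * Drop a group once its device list is empty: disconnect it from the
 * container, close its fd, and unregister the VFIO reset handler when
 * the last group goes away.
 */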
void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}

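/*
 * Obtain a device fd from the group and fill in the common VFIODevice
 * fields (fd, region/IRQ counts, flags) from VFIO_DEVICE_GET_INFO.
 */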
int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev, Error **errp)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    int ret, fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                          "Verify all devices in group %d are bound to vfio-<bus> "
                          "or pci-stub and not already in use\n", group->groupid);
        return fd;
    }

    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return ret;
    }

    /*
     * Whether discarding of RAM (e.g. by a balloon) is allowed must be
     * consistent for all devices in a group.  The first device added to
     * an empty group may lift the group-wide restriction if it supports
     * discards.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return -1;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = dev_info.num_irqs;
    vbasedev->num_regions = dev_info.num_regions;
    vbasedev->flags = dev_info.flags;

    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
                          dev_info.num_irqs);

    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    return 0;
}

void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

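/*
 * Query region @index of the device.  The kernel reports the argsz it
 * needs; if the initial buffer is too small, grow it and retry so that
 * any capability chain attached to the region info is captured as well.
 * On success *info points to an allocated buffer the caller must free.
 */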
int vfio_get_region_info(VFIODevice *vbasedev, int index,
                         struct vfio_region_info **info)
{
    size_t argsz = sizeof(struct vfio_region_info);

    *info = g_malloc0(argsz);

    (*info)->index = index;
retry:
    (*info)->argsz = argsz;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);

        goto retry;
    }

    return 0;
}

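/*
 * Search the device's regions for one whose VFIO_REGION_INFO_CAP_TYPE
 * capability matches @type/@subtype.  On success the matching region
 * info is returned in *info (caller frees); otherwise -ENODEV.
 */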
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
                             uint32_t subtype, struct vfio_region_info **info)
{
    int i;

    for (i = 0; i < vbasedev->num_regions; i++) {
        struct vfio_info_cap_header *hdr;
        struct vfio_region_info_cap_type *cap_type;

        if (vfio_get_region_info(vbasedev, i, info)) {
            continue;
        }

        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
        if (!hdr) {
            g_free(*info);
            continue;
        }

        cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);

        trace_vfio_get_dev_region(vbasedev->name, i,
                                  cap_type->type, cap_type->subtype);

        if (cap_type->type == type && cap_type->subtype == subtype) {
            return 0;
        }

        g_free(*info);
    }

    *info = NULL;
    return -ENODEV;
}

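/* Return true if region @region of the device advertises capability @cap_type. */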
bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
    struct vfio_region_info *info = NULL;
    bool ret = false;

    if (!vfio_get_region_info(vbasedev, region, &info)) {
        if (vfio_get_region_info_cap(info, cap_type)) {
            ret = true;
        }
        g_free(info);
    }

    return ret;
}

/*
 * Interfaces for IBM EEH (Enhanced Error Handling)
 */
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
    /*
     * EEH operations are only well defined for a container holding
     * exactly one group: the kernel EEH interface does not synchronize
     * state across multiple groups, so reject containers that are
     * empty or hold more than one group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        return false;
    }

    if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
        return false;
    }

    return true;
}

static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
{
    struct vfio_eeh_pe_op pe_op = {
        .argsz = sizeof(pe_op),
        .op = op,
    };
    int ret;

    if (!vfio_eeh_container_ok(container)) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
                     "kernel requires a container with exactly one group", op);
        return -EPERM;
    }

    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
    if (ret < 0) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
        return -errno;
    }

    return ret;
}

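/*
 * EEH state is tracked per container; this helper resolves the single
 * container backing @as, or returns NULL if there are none or more
 * than one.
 */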
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);
    VFIOContainer *container = NULL;

    if (QLIST_EMPTY(&space->containers)) {
        /* No containers in this address space, so no EEH */
        goto out;
    }

    container = QLIST_FIRST(&space->containers);

    if (QLIST_NEXT(container, next)) {
        /*
         * We don't yet have logic to synchronize EEH state across
         * multiple containers.
         */
        container = NULL;
        goto out;
    }

out:
    vfio_put_address_space(space);
    return container;
}

bool vfio_eeh_as_ok(AddressSpace *as)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    return (container != NULL) && vfio_eeh_container_ok(container);
}

int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return -ENODEV;
    }
    return vfio_eeh_container_op(container, op);
}