/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */
21#include "qemu/osdep.h"
22#include <sys/ioctl.h>
23#ifdef CONFIG_KVM
24#include <linux/kvm.h>
25#endif
26#include <linux/vfio.h>
27
28#include "hw/vfio/vfio-common.h"
29#include "hw/vfio/vfio.h"
30#include "exec/address-spaces.h"
31#include "exec/memory.h"
32#include "exec/ram_addr.h"
33#include "hw/hw.h"
34#include "qemu/error-report.h"
35#include "qemu/main-loop.h"
36#include "qemu/range.h"
37#include "sysemu/kvm.h"
38#include "sysemu/reset.h"
39#include "sysemu/runstate.h"
40#include "trace.h"
41#include "qapi/error.h"
42#include "migration/migration.h"
43
44VFIOGroupList vfio_group_list =
45 QLIST_HEAD_INITIALIZER(vfio_group_list);
46static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
47 QLIST_HEAD_INITIALIZER(vfio_address_spaces);
48
49#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
57static int vfio_kvm_device_fd = -1;
58#endif

/*
 * Common VFIO interrupt disable
 */
63void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
64{
65 struct vfio_irq_set irq_set = {
66 .argsz = sizeof(irq_set),
67 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
68 .index = index,
69 .start = 0,
70 .count = 0,
71 };
72
73 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
74}
75
76void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
77{
78 struct vfio_irq_set irq_set = {
79 .argsz = sizeof(irq_set),
80 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
81 .index = index,
82 .start = 0,
83 .count = 1,
84 };
85
86 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
87}
88
89void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
90{
91 struct vfio_irq_set irq_set = {
92 .argsz = sizeof(irq_set),
93 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
94 .index = index,
95 .start = 0,
96 .count = 1,
97 };
98
99 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
100}
101
102static inline const char *action_to_str(int action)
103{
104 switch (action) {
105 case VFIO_IRQ_SET_ACTION_MASK:
106 return "MASK";
107 case VFIO_IRQ_SET_ACTION_UNMASK:
108 return "UNMASK";
109 case VFIO_IRQ_SET_ACTION_TRIGGER:
110 return "TRIGGER";
111 default:
112 return "UNKNOWN ACTION";
113 }
114}
115
116static const char *index_to_str(VFIODevice *vbasedev, int index)
117{
118 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
119 return NULL;
120 }
121
122 switch (index) {
123 case VFIO_PCI_INTX_IRQ_INDEX:
124 return "INTX";
125 case VFIO_PCI_MSI_IRQ_INDEX:
126 return "MSI";
127 case VFIO_PCI_MSIX_IRQ_INDEX:
128 return "MSIX";
129 case VFIO_PCI_ERR_IRQ_INDEX:
130 return "ERR";
131 case VFIO_PCI_REQ_IRQ_INDEX:
132 return "REQ";
133 default:
134 return NULL;
135 }
136}
137
138static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
139{
140 switch (container->iommu_type) {
141 case VFIO_TYPE1v2_IOMMU:
142 case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
146 return ram_block_uncoordinated_discard_disable(state);
147 default:
        /*
         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
         * RamDiscardManager, however, it is completely untested.
         *
         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
         * completely the opposite of managing mapping/pinning dynamically as
         * required by RamDiscardManager. We would have to special-case
         * sections with a RamDiscardManager.
         */
157 return ram_block_discard_disable(state);
158 }
159}
160
161int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
162 int action, int fd, Error **errp)
163{
164 struct vfio_irq_set *irq_set;
165 int argsz, ret = 0;
166 const char *name;
167 int32_t *pfd;
168
169 argsz = sizeof(*irq_set) + sizeof(*pfd);
170
171 irq_set = g_malloc0(argsz);
172 irq_set->argsz = argsz;
173 irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
174 irq_set->index = index;
175 irq_set->start = subindex;
176 irq_set->count = 1;
177 pfd = (int32_t *)&irq_set->data;
178 *pfd = fd;
179
180 if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
181 ret = -errno;
182 }
183 g_free(irq_set);
184
185 if (!ret) {
186 return 0;
187 }
188
189 error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
190
191 name = index_to_str(vbasedev, index);
192 if (name) {
193 error_prepend(errp, "%s-%d: ", name, subindex);
194 } else {
195 error_prepend(errp, "index %d-%d: ", index, subindex);
196 }
197 error_prepend(errp,
198 "Failed to %s %s eventfd signaling for interrupt ",
199 fd < 0 ? "tear down" : "set up", action_to_str(action));
200 return ret;
201}

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
206void vfio_region_write(void *opaque, hwaddr addr,
207 uint64_t data, unsigned size)
208{
209 VFIORegion *region = opaque;
210 VFIODevice *vbasedev = region->vbasedev;
211 union {
212 uint8_t byte;
213 uint16_t word;
214 uint32_t dword;
215 uint64_t qword;
216 } buf;
217
218 switch (size) {
219 case 1:
220 buf.byte = data;
221 break;
222 case 2:
223 buf.word = cpu_to_le16(data);
224 break;
225 case 4:
226 buf.dword = cpu_to_le32(data);
227 break;
228 case 8:
229 buf.qword = cpu_to_le64(data);
230 break;
231 default:
232 hw_error("vfio: unsupported write size, %u bytes", size);
233 break;
234 }
235
236 if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
237 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
238 ",%d) failed: %m",
239 __func__, vbasedev->name, region->nr,
240 addr, data, size);
241 }
242
243 trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which BAR is the one that has the interrupt, we just luckily know
     * the mechanics of active interrupts in INTx mode.
     */
253 vbasedev->ops->vfio_eoi(vbasedev);
254}
255
256uint64_t vfio_region_read(void *opaque,
257 hwaddr addr, unsigned size)
258{
259 VFIORegion *region = opaque;
260 VFIODevice *vbasedev = region->vbasedev;
261 union {
262 uint8_t byte;
263 uint16_t word;
264 uint32_t dword;
265 uint64_t qword;
266 } buf;
267 uint64_t data = 0;
268
269 if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
270 error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
271 __func__, vbasedev->name, region->nr,
272 addr, size);
273 return (uint64_t)-1;
274 }
275 switch (size) {
276 case 1:
277 data = buf.byte;
278 break;
279 case 2:
280 data = le16_to_cpu(buf.word);
281 break;
282 case 4:
283 data = le32_to_cpu(buf.dword);
284 break;
285 case 8:
286 data = le64_to_cpu(buf.qword);
287 break;
288 default:
289 hw_error("vfio: unsupported read size, %u bytes", size);
290 break;
291 }
292
293 trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
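
    /* Same as write above */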
296 vbasedev->ops->vfio_eoi(vbasedev);
297
298 return data;
299}
300
301const MemoryRegionOps vfio_region_ops = {
302 .read = vfio_region_read,
303 .write = vfio_region_write,
304 .endianness = DEVICE_LITTLE_ENDIAN,
305 .valid = {
306 .min_access_size = 1,
307 .max_access_size = 8,
308 },
309 .impl = {
310 .min_access_size = 1,
311 .max_access_size = 8,
312 },
313};

/*
 * Device state interfaces
 */

319bool vfio_mig_active(void)
320{
321 VFIOGroup *group;
322 VFIODevice *vbasedev;
323
324 if (QLIST_EMPTY(&vfio_group_list)) {
325 return false;
326 }
327
328 QLIST_FOREACH(group, &vfio_group_list, next) {
329 QLIST_FOREACH(vbasedev, &group->device_list, next) {
330 if (vbasedev->migration_blocker) {
331 return false;
332 }
333 }
334 }
335 return true;
336}
337
338static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
339{
340 VFIOGroup *group;
341 VFIODevice *vbasedev;
342 MigrationState *ms = migrate_get_current();
343
344 if (!migration_is_setup_or_active(ms->state)) {
345 return false;
346 }
347
348 QLIST_FOREACH(group, &container->group_list, container_next) {
349 QLIST_FOREACH(vbasedev, &group->device_list, next) {
350 VFIOMigration *migration = vbasedev->migration;
351
352 if (!migration) {
353 return false;
354 }
355
356 if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
357 && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
358 return false;
359 }
360 }
361 }
362 return true;
363}
364
365static bool vfio_devices_all_running_and_saving(VFIOContainer *container)
366{
367 VFIOGroup *group;
368 VFIODevice *vbasedev;
369 MigrationState *ms = migrate_get_current();
370
371 if (!migration_is_setup_or_active(ms->state)) {
372 return false;
373 }
374
375 QLIST_FOREACH(group, &container->group_list, container_next) {
376 QLIST_FOREACH(vbasedev, &group->device_list, next) {
377 VFIOMigration *migration = vbasedev->migration;
378
379 if (!migration) {
380 return false;
381 }
382
383 if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) &&
384 (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
385 continue;
386 } else {
387 return false;
388 }
389 }
390 }
391 return true;
392}
393
394static int vfio_dma_unmap_bitmap(VFIOContainer *container,
395 hwaddr iova, ram_addr_t size,
396 IOMMUTLBEntry *iotlb)
397{
398 struct vfio_iommu_type1_dma_unmap *unmap;
399 struct vfio_bitmap *bitmap;
400 uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
401 int ret;
402
403 unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
404
405 unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
406 unmap->iova = iova;
407 unmap->size = size;
408 unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
409 bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
     * to qemu_real_host_page_size.
     */

417 bitmap->pgsize = qemu_real_host_page_size;
418 bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
419 BITS_PER_BYTE;
420
421 if (bitmap->size > container->max_dirty_bitmap_size) {
422 error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
423 (uint64_t)bitmap->size);
424 ret = -E2BIG;
425 goto unmap_exit;
426 }
427
428 bitmap->data = g_try_malloc0(bitmap->size);
429 if (!bitmap->data) {
430 ret = -ENOMEM;
431 goto unmap_exit;
432 }
433
434 ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
435 if (!ret) {
436 cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
437 iotlb->translated_addr, pages);
438 } else {
439 error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
440 }
441
442 g_free(bitmap->data);
443unmap_exit:
444 g_free(unmap);
445 return ret;
446}

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
451static int vfio_dma_unmap(VFIOContainer *container,
452 hwaddr iova, ram_addr_t size,
453 IOMMUTLBEntry *iotlb)
454{
455 struct vfio_iommu_type1_dma_unmap unmap = {
456 .argsz = sizeof(unmap),
457 .flags = 0,
458 .iova = iova,
459 .size = size,
460 };
461
462 if (iotlb && container->dirty_pages_supported &&
463 vfio_devices_all_running_and_saving(container)) {
464 return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
465 }
466
467 while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * Some older type1v2 host kernels have an off-by-one bug where an
         * unmap request that ends exactly at the top of the 64-bit IOVA
         * space (iova + size wraps around to zero) is rejected with EINVAL.
         * Work around it by trimming one IOMMU page from the end of the
         * range and retrying, so everything below that last page is still
         * unmapped.
         */
480 if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
481 container->iommu_type == VFIO_TYPE1v2_IOMMU) {
482 trace_vfio_dma_unmap_overflow_workaround();
483 unmap.size -= 1ULL << ctz64(container->pgsizes);
484 continue;
485 }
486 error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
487 return -errno;
488 }
489
490 return 0;
491}
492
493static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
494 ram_addr_t size, void *vaddr, bool readonly)
495{
496 struct vfio_iommu_type1_dma_map map = {
497 .argsz = sizeof(map),
498 .flags = VFIO_DMA_MAP_FLAG_READ,
499 .vaddr = (__u64)(uintptr_t)vaddr,
500 .iova = iova,
501 .size = size,
502 };
503
504 if (!readonly) {
505 map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
506 }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
513 if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
514 (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
515 ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
516 return 0;
517 }
518
519 error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
520 return -errno;
521}
522
523static void vfio_host_win_add(VFIOContainer *container,
524 hwaddr min_iova, hwaddr max_iova,
525 uint64_t iova_pgsizes)
526{
527 VFIOHostDMAWindow *hostwin;
528
529 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
530 if (ranges_overlap(hostwin->min_iova,
531 hostwin->max_iova - hostwin->min_iova + 1,
532 min_iova,
533 max_iova - min_iova + 1)) {
534 hw_error("%s: Overlapped IOMMU are not enabled", __func__);
535 }
536 }
537
538 hostwin = g_malloc0(sizeof(*hostwin));
539
540 hostwin->min_iova = min_iova;
541 hostwin->max_iova = max_iova;
542 hostwin->iova_pgsizes = iova_pgsizes;
543 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
544}
545
546static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
547 hwaddr max_iova)
548{
549 VFIOHostDMAWindow *hostwin;
550
551 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
552 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
553 QLIST_REMOVE(hostwin, hostwin_next);
554 return 0;
555 }
556 }
557
558 return -1;
559}
560
561static bool vfio_listener_skipped_section(MemoryRegionSection *section)
562{
563 return (!memory_region_is_ram(section->mr) &&
564 !memory_region_is_iommu(section->mr)) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space.  These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
            */
571 section->offset_within_address_space & (1ULL << 63);
572}
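
/* Called with rcu_read_lock held.  */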
575static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
576 ram_addr_t *ram_addr, bool *read_only)
577{
578 MemoryRegion *mr;
579 hwaddr xlat;
580 hwaddr len = iotlb->addr_mask + 1;
581 bool writable = iotlb->perm & IOMMU_WO;

    /*
     * The IOMMU TLB entry we have just covers translation through
     * this IOMMU to its immediate target.  We need to translate
     * to the target address space.
     */
588 mr = address_space_translate(&address_space_memory,
589 iotlb->translated_addr,
590 &xlat, &len, writable,
591 MEMTXATTRS_UNSPECIFIED);
592 if (!memory_region_is_ram(mr)) {
593 error_report("iommu map to non memory area %"HWADDR_PRIx"",
594 xlat);
595 return false;
596 } else if (memory_region_has_ram_discard_manager(mr)) {
597 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr);
598 MemoryRegionSection tmp = {
599 .mr = mr,
600 .offset_within_region = xlat,
601 .size = int128_make64(len),
602 };

        /*
         * Malicious VMs can map memory into the IOMMU, which is expected
         * to remain discarded. vfio will pin all pages, populating memory.
         * Disallow that. vmstate priorities make sure any RamDiscardManager
         * were already restored before IOMMUs are restored.
         */
610 if (!ram_discard_manager_is_populated(rdm, &tmp)) {
611 error_report("iommu map to discarded memory (e.g., unplugged via"
612 " virtio-mem): %"HWADDR_PRIx"",
613 iotlb->translated_addr);
614 return false;
615 }

        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between
         * pages pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more
         * memory than expected. RLIMIT_MEMLOCK set for the user/process can
         * never be exceeded and can be used to mitigate this problem.
         */
629 warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
630 " RAM (e.g., virtio-mem) works, however, malicious"
631 " guests can trigger pinning of more memory than"
632 " intended via an IOMMU. It's possible to mitigate "
633 " by setting/adjusting RLIMIT_MEMLOCK.");
634 }

    /*
     * Translation truncates length to the IOMMU page size,
     * check that it did not truncate too much.
     */
640 if (len & iotlb->addr_mask) {
641 error_report("iommu has granularity incompatible with target AS");
642 return false;
643 }
644
645 if (vaddr) {
646 *vaddr = memory_region_get_ram_ptr(mr) + xlat;
647 }
648
649 if (ram_addr) {
650 *ram_addr = memory_region_get_ram_addr(mr) + xlat;
651 }
652
653 if (read_only) {
654 *read_only = !writable || mr->readonly;
655 }
656
657 return true;
658}
659
660static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
661{
662 VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
663 VFIOContainer *container = giommu->container;
664 hwaddr iova = iotlb->iova + giommu->iommu_offset;
665 void *vaddr;
666 int ret;
667
668 trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
669 iova, iova + iotlb->addr_mask);
670
671 if (iotlb->target_as != &address_space_memory) {
672 error_report("Wrong target AS \"%s\", only system memory is allowed",
673 iotlb->target_as->name ? iotlb->target_as->name : "none");
674 return;
675 }
676
677 rcu_read_lock();
678
679 if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
680 bool read_only;
681
682 if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
683 goto out;
684 }

        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes them safe to access even
         * after rcu_read_unlock.
         */
692 ret = vfio_dma_map(container, iova,
693 iotlb->addr_mask + 1, vaddr,
694 read_only);
695 if (ret) {
696 error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
697 "0x%"HWADDR_PRIx", %p) = %d (%m)",
698 container, iova,
699 iotlb->addr_mask + 1, vaddr, ret);
700 }
701 } else {
702 ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
703 if (ret) {
704 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
705 "0x%"HWADDR_PRIx") = %d (%m)",
706 container, iova,
707 iotlb->addr_mask + 1, ret);
708 }
709 }
710out:
711 rcu_read_unlock();
712}
713
714static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
715 MemoryRegionSection *section)
716{
717 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
718 listener);
719 const hwaddr size = int128_get64(section->size);
720 const hwaddr iova = section->offset_within_address_space;
721 int ret;
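
    /* Unmap with a single call. */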
724 ret = vfio_dma_unmap(vrdl->container, iova, size , NULL);
725 if (ret) {
726 error_report("%s: vfio_dma_unmap() failed: %s", __func__,
727 strerror(-ret));
728 }
729}
730
731static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
732 MemoryRegionSection *section)
733{
734 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
735 listener);
736 const hwaddr end = section->offset_within_region +
737 int128_get64(section->size);
738 hwaddr start, next, iova;
739 void *vaddr;
740 int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
746 for (start = section->offset_within_region; start < end; start = next) {
747 next = ROUND_UP(start + 1, vrdl->granularity);
748 next = MIN(next, end);
749
750 iova = start - section->offset_within_region +
751 section->offset_within_address_space;
752 vaddr = memory_region_get_ram_ptr(section->mr) + start;
753
754 ret = vfio_dma_map(vrdl->container, iova, next - start,
755 vaddr, section->readonly);
756 if (ret) {
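            /* Rollback */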
758 vfio_ram_discard_notify_discard(rdl, section);
759 return ret;
760 }
761 }
762 return 0;
763}
764
765static void vfio_register_ram_discard_listener(VFIOContainer *container,
766 MemoryRegionSection *section)
767{
768 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
769 VFIORamDiscardListener *vrdl;
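
    /* Ignore some corner cases not relevant in practice. */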
772 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
773 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
774 TARGET_PAGE_SIZE));
775 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
776
777 vrdl = g_new0(VFIORamDiscardListener, 1);
778 vrdl->container = container;
779 vrdl->mr = section->mr;
780 vrdl->offset_within_address_space = section->offset_within_address_space;
781 vrdl->size = int128_get64(section->size);
782 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
783 section->mr);
784
785 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
786 g_assert(container->pgsizes &&
787 vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
788
789 ram_discard_listener_init(&vrdl->listener,
790 vfio_ram_discard_notify_populate,
791 vfio_ram_discard_notify_discard, true);
792 ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
793 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we
     * could exceed the maximum number of possible DMA mappings over time. We
     * assume that each mapped section in the same address space as a
     * RamDiscardManager section consumes exactly one DMA mapping, with the
     * exception of RamDiscardManager sections; i.e., we don't expect to have
     * gIOMMU sections in the address space or alias sections that are
     * effectively managed by a single RamDiscardManager section.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
808 if (container->dma_max_mappings) {
809 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
810
811#ifdef CONFIG_KVM
812 if (kvm_enabled()) {
813 max_memslots = kvm_get_max_memslots();
814 }
815#endif
816
817 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
818 hwaddr start, end;
819
820 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
821 vrdl->granularity);
822 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
823 vrdl->granularity);
824 vrdl_mappings += (end - start) / vrdl->granularity;
825 vrdl_count++;
826 }
827
828 if (vrdl_mappings + max_memslots - vrdl_count >
829 container->dma_max_mappings) {
830 warn_report("%s: possibly running out of DMA mappings. E.g., try"
831 " increasing the 'block-size' of virtio-mem devies."
832 " Maximum possible DMA mappings: %d, Maximum possible"
833 " memslots: %d", __func__, container->dma_max_mappings,
834 max_memslots);
835 }
836 }
837}
838
839static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
840 MemoryRegionSection *section)
841{
842 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
843 VFIORamDiscardListener *vrdl = NULL;
844
845 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
846 if (vrdl->mr == section->mr &&
847 vrdl->offset_within_address_space ==
848 section->offset_within_address_space) {
849 break;
850 }
851 }
852
853 if (!vrdl) {
854 hw_error("vfio: Trying to unregister missing RAM discard listener");
855 }
856
857 ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
858 QLIST_REMOVE(vrdl, next);
859 g_free(vrdl);
860}
861
862static void vfio_listener_region_add(MemoryListener *listener,
863 MemoryRegionSection *section)
864{
865 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
866 hwaddr iova, end;
867 Int128 llend, llsize;
868 void *vaddr;
869 int ret;
870 VFIOHostDMAWindow *hostwin;
871 bool hostwin_found;
872 Error *err = NULL;
873
874 if (vfio_listener_skipped_section(section)) {
875 trace_vfio_listener_region_add_skip(
876 section->offset_within_address_space,
877 section->offset_within_address_space +
878 int128_get64(int128_sub(section->size, int128_one())));
879 return;
880 }
881
882 if (unlikely((section->offset_within_address_space &
883 ~qemu_real_host_page_mask) !=
884 (section->offset_within_region & ~qemu_real_host_page_mask))) {
885 error_report("%s received unaligned region", __func__);
886 return;
887 }
888
889 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
890 llend = int128_make64(section->offset_within_address_space);
891 llend = int128_add(llend, section->size);
892 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
893
894 if (int128_ge(int128_make64(iova), llend)) {
895 return;
896 }
897 end = int128_get64(int128_sub(llend, int128_one()));
898
899 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
900 hwaddr pgsize = 0;
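
        /* For now intersections are not allowed, we may relax this later */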
903 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
904 if (ranges_overlap(hostwin->min_iova,
905 hostwin->max_iova - hostwin->min_iova + 1,
906 section->offset_within_address_space,
907 int128_get64(section->size))) {
908 error_setg(&err,
909 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
910 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
911 section->offset_within_address_space,
912 section->offset_within_address_space +
913 int128_get64(section->size) - 1,
914 hostwin->min_iova, hostwin->max_iova);
915 goto fail;
916 }
917 }
918
919 ret = vfio_spapr_create_window(container, section, &pgsize);
920 if (ret) {
921 error_setg_errno(&err, -ret, "Failed to create SPAPR window");
922 goto fail;
923 }
924
925 vfio_host_win_add(container, section->offset_within_address_space,
926 section->offset_within_address_space +
927 int128_get64(section->size) - 1, pgsize);
928#ifdef CONFIG_KVM
929 if (kvm_enabled()) {
930 VFIOGroup *group;
931 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
932 struct kvm_vfio_spapr_tce param;
933 struct kvm_device_attr attr = {
934 .group = KVM_DEV_VFIO_GROUP,
935 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
                .addr = (uint64_t)(unsigned long)&param,
937 };
938
939 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                              &param.tablefd)) {
941 QLIST_FOREACH(group, &container->group_list, container_next) {
942 param.groupfd = group->fd;
943 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
944 error_report("vfio: failed to setup fd %d "
945 "for a group with fd %d: %s",
946 param.tablefd, param.groupfd,
947 strerror(errno));
948 return;
949 }
950 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
951 }
952 }
953 }
954#endif
955 }
956
957 hostwin_found = false;
958 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
959 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
960 hostwin_found = true;
961 break;
962 }
963 }
964
965 if (!hostwin_found) {
966 error_setg(&err, "Container %p can't map guest IOVA region"
967 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
968 goto fail;
969 }
970
971 memory_region_ref(section->mr);
972
973 if (memory_region_is_iommu(section->mr)) {
974 VFIOGuestIOMMU *giommu;
975 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
976 int iommu_idx;
977
978 trace_vfio_listener_region_add_iommu(iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
985 giommu = g_malloc0(sizeof(*giommu));
986 giommu->iommu = iommu_mr;
987 giommu->iommu_offset = section->offset_within_address_space -
988 section->offset_within_region;
989 giommu->container = container;
990 llend = int128_add(int128_make64(section->offset_within_region),
991 section->size);
992 llend = int128_sub(llend, int128_one());
993 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
994 MEMTXATTRS_UNSPECIFIED);
995 iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
996 IOMMU_NOTIFIER_IOTLB_EVENTS,
997 section->offset_within_region,
998 int128_get64(llend),
999 iommu_idx);
1000
1001 ret = memory_region_iommu_set_page_size_mask(giommu->iommu,
1002 container->pgsizes,
1003 &err);
1004 if (ret) {
1005 g_free(giommu);
1006 goto fail;
1007 }
1008
1009 ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
1010 &err);
1011 if (ret) {
1012 g_free(giommu);
1013 goto fail;
1014 }
1015 QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
1016 memory_region_iommu_replay(giommu->iommu, &giommu->n);
1017
1018 return;
1019 }

    /* Here we assume that memory_region_is_ram(section->mr)==true */

    /*
     * For RAM memory regions with a RamDiscardManager, we only want to map
     * the actually populated parts - and update the mapping whenever we're
     * notified about changes.
     */
1028 if (memory_region_has_ram_discard_manager(section->mr)) {
1029 vfio_register_ram_discard_listener(container, section);
1030 return;
1031 }
1032
1033 vaddr = memory_region_get_ram_ptr(section->mr) +
1034 section->offset_within_region +
1035 (iova - section->offset_within_address_space);
1036
1037 trace_vfio_listener_region_add_ram(iova, end, vaddr);
1038
1039 llsize = int128_sub(llend, int128_make64(iova));
1040
1041 if (memory_region_is_ram_device(section->mr)) {
1042 hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1043
1044 if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
1045 trace_vfio_listener_region_add_no_dma_map(
1046 memory_region_name(section->mr),
1047 section->offset_within_address_space,
1048 int128_getlo(section->size),
1049 pgmask + 1);
1050 return;
1051 }
1052 }
1053
1054 ret = vfio_dma_map(container, iova, int128_get64(llsize),
1055 vaddr, section->readonly);
1056 if (ret) {
1057 error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
1058 "0x%"HWADDR_PRIx", %p) = %d (%m)",
1059 container, iova, int128_get64(llsize), vaddr, ret);
1060 if (memory_region_is_ram_device(section->mr)) {
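            /* Allow unexpected mappings not to be fatal for RAM devices */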
1062 error_report_err(err);
1063 return;
1064 }
1065 goto fail;
1066 }
1067
1068 return;
1069
1070fail:
1071 if (memory_region_is_ram_device(section->mr)) {
1072 error_report("failed to vfio_dma_map. pci p2p may not work");
1073 return;
1074 }

    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail.  Runtime, there's not much we can do other
     * than throw a hardware error.
     */
1080 if (!container->initialized) {
1081 if (!container->error) {
1082 error_propagate_prepend(&container->error, err,
1083 "Region %s: ",
1084 memory_region_name(section->mr));
1085 } else {
1086 error_free(err);
1087 }
1088 } else {
1089 error_report_err(err);
1090 hw_error("vfio: DMA mapping failed, unable to continue");
1091 }
1092}
1093
1094static void vfio_listener_region_del(MemoryListener *listener,
1095 MemoryRegionSection *section)
1096{
1097 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1098 hwaddr iova, end;
1099 Int128 llend, llsize;
1100 int ret;
1101 bool try_unmap = true;
1102
1103 if (vfio_listener_skipped_section(section)) {
1104 trace_vfio_listener_region_del_skip(
1105 section->offset_within_address_space,
1106 section->offset_within_address_space +
1107 int128_get64(int128_sub(section->size, int128_one())));
1108 return;
1109 }
1110
1111 if (unlikely((section->offset_within_address_space &
1112 ~qemu_real_host_page_mask) !=
1113 (section->offset_within_region & ~qemu_real_host_page_mask))) {
1114 error_report("%s received unaligned region", __func__);
1115 return;
1116 }
1117
1118 if (memory_region_is_iommu(section->mr)) {
1119 VFIOGuestIOMMU *giommu;
1120
1121 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1122 if (MEMORY_REGION(giommu->iommu) == section->mr &&
1123 giommu->n.start == section->offset_within_region) {
1124 memory_region_unregister_iommu_notifier(section->mr,
1125 &giommu->n);
1126 QLIST_REMOVE(giommu, giommu_next);
1127 g_free(giommu);
1128 break;
1129 }
1130 }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
1139 }
1140
1141 iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
1142 llend = int128_make64(section->offset_within_address_space);
1143 llend = int128_add(llend, section->size);
1144 llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));
1145
1146 if (int128_ge(int128_make64(iova), llend)) {
1147 return;
1148 }
1149 end = int128_get64(int128_sub(llend, int128_one()));
1150
1151 llsize = int128_sub(llend, int128_make64(iova));
1152
1153 trace_vfio_listener_region_del(iova, end);
1154
1155 if (memory_region_is_ram_device(section->mr)) {
1156 hwaddr pgmask;
1157 VFIOHostDMAWindow *hostwin;
1158 bool hostwin_found = false;
1159
1160 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
1161 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
1162 hostwin_found = true;
1163 break;
1164 }
1165 }
1166 assert(hostwin_found);
1167
1168 pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
1169 try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
1170 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1171 vfio_unregister_ram_discard_listener(container, section);
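        /* Unregistering will trigger an unmap. */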
1173 try_unmap = false;
1174 }
1175
1176 if (try_unmap) {
1177 if (int128_eq(llsize, int128_2_64())) {
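            /* The unmap ioctl doesn't accept a full 64-bit span. */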
1179 llsize = int128_rshift(llsize, 1);
1180 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1181 if (ret) {
1182 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1183 "0x%"HWADDR_PRIx") = %d (%m)",
1184 container, iova, int128_get64(llsize), ret);
1185 }
1186 iova += int128_get64(llsize);
1187 }
1188 ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
1189 if (ret) {
1190 error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
1191 "0x%"HWADDR_PRIx") = %d (%m)",
1192 container, iova, int128_get64(llsize), ret);
1193 }
1194 }
1195
1196 memory_region_unref(section->mr);
1197
1198 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1199 vfio_spapr_remove_window(container,
1200 section->offset_within_address_space);
1201 if (vfio_host_win_del(container,
1202 section->offset_within_address_space,
1203 section->offset_within_address_space +
1204 int128_get64(section->size) - 1) < 0) {
1205 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
1206 __func__, section->offset_within_address_space);
1207 }
1208 }
1209}
1210
1211static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
1212{
1213 int ret;
1214 struct vfio_iommu_type1_dirty_bitmap dirty = {
1215 .argsz = sizeof(dirty),
1216 };
1217
1218 if (start) {
1219 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
1220 } else {
1221 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
1222 }
1223
1224 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
1225 if (ret) {
1226 error_report("Failed to set dirty tracking flag 0x%x errno: %d",
1227 dirty.flags, errno);
1228 }
1229}
1230
1231static void vfio_listener_log_global_start(MemoryListener *listener)
1232{
1233 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1234
1235 vfio_set_dirty_page_tracking(container, true);
1236}
1237
1238static void vfio_listener_log_global_stop(MemoryListener *listener)
1239{
1240 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1241
1242 vfio_set_dirty_page_tracking(container, false);
1243}
1244
1245static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
1246 uint64_t size, ram_addr_t ram_addr)
1247{
1248 struct vfio_iommu_type1_dirty_bitmap *dbitmap;
1249 struct vfio_iommu_type1_dirty_bitmap_get *range;
1250 uint64_t pages;
1251 int ret;
1252
1253 dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
1254
1255 dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
1256 dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
1257 range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
1258 range->iova = iova;
1259 range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
     * to qemu_real_host_page_size.
     */
1266 range->bitmap.pgsize = qemu_real_host_page_size;
1267
1268 pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
1269 range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
1270 BITS_PER_BYTE;
1271 range->bitmap.data = g_try_malloc0(range->bitmap.size);
1272 if (!range->bitmap.data) {
1273 ret = -ENOMEM;
1274 goto err_out;
1275 }
1276
1277 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
1278 if (ret) {
1279 error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
1280 " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
1281 (uint64_t)range->size, errno);
1282 goto err_out;
1283 }
1284
1285 cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
1286 ram_addr, pages);
1287
1288 trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
1289 range->bitmap.size, ram_addr);
1290err_out:
1291 g_free(range->bitmap.data);
1292 g_free(dbitmap);
1293
1294 return ret;
1295}
1296
1297typedef struct {
1298 IOMMUNotifier n;
1299 VFIOGuestIOMMU *giommu;
1300} vfio_giommu_dirty_notifier;
1301
1302static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
1303{
1304 vfio_giommu_dirty_notifier *gdn = container_of(n,
1305 vfio_giommu_dirty_notifier, n);
1306 VFIOGuestIOMMU *giommu = gdn->giommu;
1307 VFIOContainer *container = giommu->container;
1308 hwaddr iova = iotlb->iova + giommu->iommu_offset;
1309 ram_addr_t translated_addr;
1310
1311 trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
1312
1313 if (iotlb->target_as != &address_space_memory) {
1314 error_report("Wrong target AS \"%s\", only system memory is allowed",
1315 iotlb->target_as->name ? iotlb->target_as->name : "none");
1316 return;
1317 }
1318
1319 rcu_read_lock();
1320 if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
1321 int ret;
1322
1323 ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
1324 translated_addr);
1325 if (ret) {
1326 error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
1327 "0x%"HWADDR_PRIx") = %d (%m)",
1328 container, iova,
1329 iotlb->addr_mask + 1, ret);
1330 }
1331 }
1332 rcu_read_unlock();
1333}
1334
1335static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
1336 void *opaque)
1337{
1338 const hwaddr size = int128_get64(section->size);
1339 const hwaddr iova = section->offset_within_address_space;
1340 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1341 section->offset_within_region;
1342 VFIORamDiscardListener *vrdl = opaque;

    /*
     * Sync the whole mapped region (spanning multiple individual mappings)
     * in one go.
     */
1348 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
1349}
1350
1351static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
1352 MemoryRegionSection *section)
1353{
1354 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1355 VFIORamDiscardListener *vrdl = NULL;
1356
1357 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
1358 if (vrdl->mr == section->mr &&
1359 vrdl->offset_within_address_space ==
1360 section->offset_within_address_space) {
1361 break;
1362 }
1363 }
1364
1365 if (!vrdl) {
1366 hw_error("vfio: Trying to sync missing RAM discard listener");
1367 }

    /*
     * We only want/can synchronize the bitmap for actually mapped parts -
     * which correspond to populated parts. Replay all populated parts.
     */
1373 return ram_discard_manager_replay_populated(rdm, section,
1374 vfio_ram_discard_get_dirty_bitmap,
                                                vrdl);
1376}
1377
1378static int vfio_sync_dirty_bitmap(VFIOContainer *container,
1379 MemoryRegionSection *section)
1380{
1381 ram_addr_t ram_addr;
1382
1383 if (memory_region_is_iommu(section->mr)) {
1384 VFIOGuestIOMMU *giommu;
1385
1386 QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
1387 if (MEMORY_REGION(giommu->iommu) == section->mr &&
1388 giommu->n.start == section->offset_within_region) {
1389 Int128 llend;
1390 vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
1391 int idx = memory_region_iommu_attrs_to_index(giommu->iommu,
1392 MEMTXATTRS_UNSPECIFIED);
1393
1394 llend = int128_add(int128_make64(section->offset_within_region),
1395 section->size);
1396 llend = int128_sub(llend, int128_one());
1397
1398 iommu_notifier_init(&gdn.n,
1399 vfio_iommu_map_dirty_notify,
1400 IOMMU_NOTIFIER_MAP,
1401 section->offset_within_region,
1402 int128_get64(llend),
1403 idx);
1404 memory_region_iommu_replay(giommu->iommu, &gdn.n);
1405 break;
1406 }
1407 }
1408 return 0;
1409 } else if (memory_region_has_ram_discard_manager(section->mr)) {
1410 return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
1411 }
1412
1413 ram_addr = memory_region_get_ram_addr(section->mr) +
1414 section->offset_within_region;
1415
1416 return vfio_get_dirty_bitmap(container,
1417 REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
1418 int128_get64(section->size), ram_addr);
1419}
1420
1421static void vfio_listener_log_sync(MemoryListener *listener,
1422 MemoryRegionSection *section)
1423{
1424 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1425
1426 if (vfio_listener_skipped_section(section) ||
1427 !container->dirty_pages_supported) {
1428 return;
1429 }
1430
1431 if (vfio_devices_all_dirty_tracking(container)) {
1432 vfio_sync_dirty_bitmap(container, section);
1433 }
1434}
1435
1436static const MemoryListener vfio_memory_listener = {
1437 .region_add = vfio_listener_region_add,
1438 .region_del = vfio_listener_region_del,
1439 .log_global_start = vfio_listener_log_global_start,
1440 .log_global_stop = vfio_listener_log_global_stop,
1441 .log_sync = vfio_listener_log_sync,
1442};
1443
1444static void vfio_listener_release(VFIOContainer *container)
1445{
1446 memory_listener_unregister(&container->listener);
1447 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1448 memory_listener_unregister(&container->prereg_listener);
1449 }
1450}
1451
1452static struct vfio_info_cap_header *
1453vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
1454{
1455 struct vfio_info_cap_header *hdr;
1456
1457 for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1458 if (hdr->id == id) {
1459 return hdr;
1460 }
1461 }
1462
1463 return NULL;
1464}
1465
1466struct vfio_info_cap_header *
1467vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
1468{
1469 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
1470 return NULL;
1471 }
1472
1473 return vfio_get_cap((void *)info, info->cap_offset, id);
1474}
1475
1476static struct vfio_info_cap_header *
1477vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1478{
1479 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1480 return NULL;
1481 }
1482
1483 return vfio_get_cap((void *)info, info->cap_offset, id);
1484}
1485
1486struct vfio_info_cap_header *
1487vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
1488{
1489 if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
1490 return NULL;
1491 }
1492
1493 return vfio_get_cap((void *)info, info->cap_offset, id);
1494}
1495
1496bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
1497 unsigned int *avail)
1498{
1499 struct vfio_info_cap_header *hdr;
1500 struct vfio_iommu_type1_info_dma_avail *cap;
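
    /* If the capability cannot be found, assume no DMA limiting */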
1503 hdr = vfio_get_iommu_type1_info_cap(info,
1504 VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
1505 if (hdr == NULL) {
1506 return false;
1507 }
1508
1509 if (avail != NULL) {
1510 cap = (void *) hdr;
1511 *avail = cap->avail;
1512 }
1513
1514 return true;
1515}
1516
1517static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
1518 struct vfio_region_info *info)
1519{
1520 struct vfio_info_cap_header *hdr;
1521 struct vfio_region_info_cap_sparse_mmap *sparse;
1522 int i, j;
1523
1524 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
1525 if (!hdr) {
1526 return -ENODEV;
1527 }
1528
1529 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
1530
1531 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
1532 region->nr, sparse->nr_areas);
1533
1534 region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
1535
1536 for (i = 0, j = 0; i < sparse->nr_areas; i++) {
1537 trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
1538 sparse->areas[i].offset +
1539 sparse->areas[i].size);
1540
1541 if (sparse->areas[i].size) {
1542 region->mmaps[j].offset = sparse->areas[i].offset;
1543 region->mmaps[j].size = sparse->areas[i].size;
1544 j++;
1545 }
1546 }
1547
1548 region->nr_mmaps = j;
1549 region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
1550
1551 return 0;
1552}
1553
1554int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
1555 int index, const char *name)
1556{
1557 struct vfio_region_info *info;
1558 int ret;
1559
1560 ret = vfio_get_region_info(vbasedev, index, &info);
1561 if (ret) {
1562 return ret;
1563 }
1564
1565 region->vbasedev = vbasedev;
1566 region->flags = info->flags;
1567 region->size = info->size;
1568 region->fd_offset = info->offset;
1569 region->nr = index;
1570
1571 if (region->size) {
1572 region->mem = g_new0(MemoryRegion, 1);
1573 memory_region_init_io(region->mem, obj, &vfio_region_ops,
1574 region, name, region->size);
1575
1576 if (!vbasedev->no_mmap &&
1577 region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
1578
1579 ret = vfio_setup_region_sparse_mmaps(region, info);
1580
1581 if (ret) {
1582 region->nr_mmaps = 1;
1583 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
1584 region->mmaps[0].offset = 0;
1585 region->mmaps[0].size = region->size;
1586 }
1587 }
1588 }
1589
1590 g_free(info);
1591
1592 trace_vfio_region_setup(vbasedev->name, index, name,
1593 region->flags, region->fd_offset, region->size);
1594 return 0;
1595}
1596
1597static void vfio_subregion_unmap(VFIORegion *region, int index)
1598{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
1600 region->mmaps[index].offset,
1601 region->mmaps[index].offset +
1602 region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
1606 region->mmaps[index].mmap = NULL;
1607}
1608
1609int vfio_region_mmap(VFIORegion *region)
1610{
1611 int i, prot = 0;
1612 char *name;
1613
1614 if (!region->mem) {
1615 return 0;
1616 }
1617
1618 prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
1619 prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
1620
1621 for (i = 0; i < region->nr_mmaps; i++) {
1622 region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
1623 MAP_SHARED, region->vbasedev->fd,
1624 region->fd_offset +
1625 region->mmaps[i].offset);
1626 if (region->mmaps[i].mmap == MAP_FAILED) {
1627 int ret = -errno;
1628
1629 trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
1630 region->fd_offset +
1631 region->mmaps[i].offset,
1632 region->fd_offset +
1633 region->mmaps[i].offset +
1634 region->mmaps[i].size - 1, ret);
1635
1636 region->mmaps[i].mmap = NULL;
1637
1638 for (i--; i >= 0; i--) {
1639 vfio_subregion_unmap(region, i);
1640 }
1641
1642 return ret;
1643 }
1644
1645 name = g_strdup_printf("%s mmaps[%d]",
1646 memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
1648 memory_region_owner(region->mem),
1649 name, region->mmaps[i].size,
1650 region->mmaps[i].mmap);
1651 g_free(name);
1652 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);
1654
        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
1656 region->mmaps[i].offset,
1657 region->mmaps[i].offset +
1658 region->mmaps[i].size - 1);
1659 }
1660
1661 return 0;
1662}
1663
1664void vfio_region_unmap(VFIORegion *region)
1665{
1666 int i;
1667
1668 if (!region->mem) {
1669 return;
1670 }
1671
1672 for (i = 0; i < region->nr_mmaps; i++) {
1673 if (region->mmaps[i].mmap) {
1674 vfio_subregion_unmap(region, i);
1675 }
1676 }
1677}
1678
1679void vfio_region_exit(VFIORegion *region)
1680{
1681 int i;
1682
1683 if (!region->mem) {
1684 return;
1685 }
1686
1687 for (i = 0; i < region->nr_mmaps; i++) {
1688 if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
1690 }
1691 }
1692
1693 trace_vfio_region_exit(region->vbasedev->name, region->nr);
1694}
1695
1696void vfio_region_finalize(VFIORegion *region)
1697{
1698 int i;
1699
1700 if (!region->mem) {
1701 return;
1702 }
1703
1704 for (i = 0; i < region->nr_mmaps; i++) {
1705 if (region->mmaps[i].mmap) {
1706 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
1708 }
1709 }
1710
1711 object_unparent(OBJECT(region->mem));
1712
1713 g_free(region->mem);
1714 g_free(region->mmaps);
1715
1716 trace_vfio_region_finalize(region->vbasedev->name, region->nr);
1717
1718 region->mem = NULL;
1719 region->mmaps = NULL;
1720 region->nr_mmaps = 0;
1721 region->size = 0;
1722 region->flags = 0;
1723 region->nr = 0;
1724}
1725
1726void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
1727{
1728 int i;
1729
1730 if (!region->mem) {
1731 return;
1732 }
1733
1734 for (i = 0; i < region->nr_mmaps; i++) {
1735 if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
1737 }
1738 }
1739
1740 trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
1741 enabled);
1742}
1743
1744void vfio_reset_handler(void *opaque)
1745{
1746 VFIOGroup *group;
1747 VFIODevice *vbasedev;
1748
1749 QLIST_FOREACH(group, &vfio_group_list, next) {
1750 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1751 if (vbasedev->dev->realized) {
1752 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1753 }
1754 }
1755 }
1756
1757 QLIST_FOREACH(group, &vfio_group_list, next) {
1758 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1759 if (vbasedev->dev->realized && vbasedev->needs_reset) {
1760 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1761 }
1762 }
1763 }
1764}
1765
1766static void vfio_kvm_device_add_group(VFIOGroup *group)
1767{
1768#ifdef CONFIG_KVM
1769 struct kvm_device_attr attr = {
1770 .group = KVM_DEV_VFIO_GROUP,
1771 .attr = KVM_DEV_VFIO_GROUP_ADD,
1772 .addr = (uint64_t)(unsigned long)&group->fd,
1773 };
1774
1775 if (!kvm_enabled()) {
1776 return;
1777 }
1778
1779 if (vfio_kvm_device_fd < 0) {
1780 struct kvm_create_device cd = {
1781 .type = KVM_DEV_TYPE_VFIO,
1782 };
1783
1784 if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
1785 error_report("Failed to create KVM VFIO device: %m");
1786 return;
1787 }
1788
1789 vfio_kvm_device_fd = cd.fd;
1790 }
1791
1792 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1793 error_report("Failed to add group %d to KVM VFIO device: %m",
1794 group->groupid);
1795 }
1796#endif
1797}
1798
1799static void vfio_kvm_device_del_group(VFIOGroup *group)
1800{
1801#ifdef CONFIG_KVM
1802 struct kvm_device_attr attr = {
1803 .group = KVM_DEV_VFIO_GROUP,
1804 .attr = KVM_DEV_VFIO_GROUP_DEL,
1805 .addr = (uint64_t)(unsigned long)&group->fd,
1806 };
1807
1808 if (vfio_kvm_device_fd < 0) {
1809 return;
1810 }
1811
1812 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
1813 error_report("Failed to remove group %d from KVM VFIO device: %m",
1814 group->groupid);
1815 }
1816#endif
1817}
1818
1819static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1820{
1821 VFIOAddressSpace *space;
1822
1823 QLIST_FOREACH(space, &vfio_address_spaces, list) {
1824 if (space->as == as) {
1825 return space;
1826 }
1827 }
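
    /* No suitable VFIOAddressSpace, create a new one */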
1830 space = g_malloc0(sizeof(*space));
1831 space->as = as;
1832 QLIST_INIT(&space->containers);
1833
1834 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1835
1836 return space;
1837}
1838
1839static void vfio_put_address_space(VFIOAddressSpace *space)
1840{
1841 if (QLIST_EMPTY(&space->containers)) {
1842 QLIST_REMOVE(space, list);
1843 g_free(space);
1844 }
1845}

/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
1850static int vfio_get_iommu_type(VFIOContainer *container,
1851 Error **errp)
1852{
1853 int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
1854 VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
1855 int i;
1856
1857 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
1858 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
1859 return iommu_types[i];
1860 }
1861 }
1862 error_setg(errp, "No available IOMMU models");
1863 return -EINVAL;
1864}
1865
1866static int vfio_init_container(VFIOContainer *container, int group_fd,
1867 Error **errp)
1868{
1869 int iommu_type, ret;
1870
1871 iommu_type = vfio_get_iommu_type(container, errp);
1872 if (iommu_type < 0) {
1873 return iommu_type;
1874 }
1875
1876 ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
1877 if (ret) {
1878 error_setg_errno(errp, errno, "Failed to set group container");
1879 return -errno;
1880 }
1881
1882 while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
1883 if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, despite the IOMMU subdriver always advertises v1 and
             * v2, the running platform may not support v2 and there is no
             * way to guess it until an IOMMU group gets added to the
             * container.  So in case it fails with v2, try v1 as a fallback.
             */
1890 iommu_type = VFIO_SPAPR_TCE_IOMMU;
1891 continue;
1892 }
1893 error_setg_errno(errp, errno, "Failed to set iommu for container");
1894 return -errno;
1895 }
1896
1897 container->iommu_type = iommu_type;
1898 return 0;
1899}
1900
1901static int vfio_get_iommu_info(VFIOContainer *container,
1902 struct vfio_iommu_type1_info **info)
{

1905 size_t argsz = sizeof(struct vfio_iommu_type1_info);
1906
1907 *info = g_new0(struct vfio_iommu_type1_info, 1);
1908again:
1909 (*info)->argsz = argsz;
1910
1911 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
1912 g_free(*info);
1913 *info = NULL;
1914 return -errno;
1915 }
1916
1917 if (((*info)->argsz > argsz)) {
1918 argsz = (*info)->argsz;
1919 *info = g_realloc(*info, argsz);
1920 goto again;
1921 }
1922
1923 return 0;
1924}
1925
1926static struct vfio_info_cap_header *
1927vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1928{
1929 struct vfio_info_cap_header *hdr;
1930 void *ptr = info;
1931
1932 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1933 return NULL;
1934 }
1935
1936 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1937 if (hdr->id == id) {
1938 return hdr;
1939 }
1940 }
1941
1942 return NULL;
1943}
1944
1945static void vfio_get_iommu_info_migration(VFIOContainer *container,
1946 struct vfio_iommu_type1_info *info)
1947{
1948 struct vfio_info_cap_header *hdr;
1949 struct vfio_iommu_type1_info_cap_migration *cap_mig;
1950
1951 hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
1952 if (!hdr) {
1953 return;
1954 }
1955
1956 cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
1957 header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty.
     */
1963 if (cap_mig->pgsize_bitmap & qemu_real_host_page_size) {
1964 container->dirty_pages_supported = true;
1965 container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
1966 container->dirty_pgsizes = cap_mig->pgsize_bitmap;
1967 }
1968}
1969
1970static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
1971 Error **errp)
1972{
1973 VFIOContainer *container;
1974 int ret, fd;
1975 VFIOAddressSpace *space;
1976
1977 space = vfio_get_address_space(as);

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned.  We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared.  This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for
     * instance if the device is an mdev device where it is known that the
     * host vendor driver will never pin pages outside of the working set of
     * the guest driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace.  If
     * any pages are already zapped from the virtual address space, such as
     * from prior discards, new pinning will cause valid mappings to be
     * re-established.  Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     * Coordinated discarding of RAM via a RamDiscardManager (e.g., virtio-mem)
     * remains possible with the type1 IOMMU backends: only uncoordinated
     * discards are disabled there, and the RamDiscardListener keeps the IOMMU
     * mappings in sync with the populated parts.
     */
2010 QLIST_FOREACH(container, &space->containers, next) {
2011 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
2012 ret = vfio_ram_block_discard_disable(container, true);
2013 if (ret) {
2014 error_setg_errno(errp, -ret,
2015 "Cannot set discarding of RAM broken");
2016 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2017 &container->fd)) {
2018 error_report("vfio: error disconnecting group %d from"
2019 " container", group->groupid);
2020 }
2021 return ret;
2022 }
2023 group->container = container;
2024 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2025 vfio_kvm_device_add_group(group);
2026 return 0;
2027 }
2028 }
2029
2030 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
2031 if (fd < 0) {
2032 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
2033 ret = -errno;
2034 goto put_space_exit;
2035 }
2036
2037 ret = ioctl(fd, VFIO_GET_API_VERSION);
2038 if (ret != VFIO_API_VERSION) {
2039 error_setg(errp, "supported vfio version: %d, "
2040 "reported version: %d", VFIO_API_VERSION, ret);
2041 ret = -EINVAL;
2042 goto close_fd_exit;
2043 }
2044
2045 container = g_malloc0(sizeof(*container));
2046 container->space = space;
2047 container->fd = fd;
2048 container->error = NULL;
2049 container->dirty_pages_supported = false;
2050 container->dma_max_mappings = 0;
2051 QLIST_INIT(&container->giommu_list);
2052 QLIST_INIT(&container->hostwin_list);
2053 QLIST_INIT(&container->vrdl_list);
2054
2055 ret = vfio_init_container(container, group->fd, errp);
2056 if (ret) {
2057 goto free_container_exit;
2058 }
2059
2060 ret = vfio_ram_block_discard_disable(container, true);
2061 if (ret) {
2062 error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2063 goto free_container_exit;
2064 }
2065
2066 switch (container->iommu_type) {
2067 case VFIO_TYPE1v2_IOMMU:
2068 case VFIO_TYPE1_IOMMU:
2069 {
2070 struct vfio_iommu_type1_info *info;

        /*
         * FIXME: This assumes that a Type1 IOMMU can map any 64-bit
         * IOVA whatsoever.  That's not actually true, but the current
         * kernel interface doesn't tell us what it can map, and the
         * existing Type1 IOMMUs generally support any IOVA we're
         * going to actually try in practice.
         */
2079 ret = vfio_get_iommu_info(container, &info);
2080
2081 if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) {
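            /* Assume 4k IOVA page size */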
2083 info->iova_pgsizes = 4096;
2084 }
2085 vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes);
2086 container->pgsizes = info->iova_pgsizes;
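
        /* The kernel's default limit ("dma_entry_limit") is 65535 mappings. */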
2089 container->dma_max_mappings = 65535;
2090 if (!ret) {
2091 vfio_get_info_dma_avail(info, &container->dma_max_mappings);
2092 vfio_get_iommu_info_migration(container, info);
2093 }
2094 g_free(info);
2095 break;
2096 }
2097 case VFIO_SPAPR_TCE_v2_IOMMU:
2098 case VFIO_SPAPR_TCE_IOMMU:
2099 {
2100 struct vfio_iommu_spapr_tce_info info;
2101 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;

        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when container fd is closed so we do not call it explicitly
         * in this file.
         */
2108 if (!v2) {
2109 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2110 if (ret) {
2111 error_setg_errno(errp, errno, "failed to enable container");
2112 ret = -errno;
2113 goto enable_discards_exit;
2114 }
2115 } else {
2116 container->prereg_listener = vfio_prereg_listener;
2117
2118 memory_listener_register(&container->prereg_listener,
2119 &address_space_memory);
2120 if (container->error) {
2121 memory_listener_unregister(&container->prereg_listener);
2122 ret = -1;
2123 error_propagate_prepend(errp, container->error,
2124 "RAM memory listener initialization failed: ");
2125 goto enable_discards_exit;
2126 }
2127 }
2128
2129 info.argsz = sizeof(info);
2130 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2131 if (ret) {
2132 error_setg_errno(errp, errno,
2133 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
2134 ret = -errno;
2135 if (v2) {
2136 memory_listener_unregister(&container->prereg_listener);
2137 }
2138 goto enable_discards_exit;
2139 }
2140
2141 if (v2) {
2142 container->pgsizes = info.ddw.pgsizes;
            /*
             * There is a default window in just created container.
             * To make region_add/del simpler, we better remove this
             * window now and let those iommu_listener callbacks
             * create/remove them when needed.
             */
2149 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2150 if (ret) {
2151 error_setg_errno(errp, -ret,
2152 "failed to remove existing window");
2153 goto enable_discards_exit;
2154 }
2155 } else {
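            /* The default table uses 4K pages */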
2157 container->pgsizes = 0x1000;
2158 vfio_host_win_add(container, info.dma32_window_start,
2159 info.dma32_window_start +
2160 info.dma32_window_size - 1,
2161 0x1000);
2162 }
2163 }
2164 }
2165
2166 vfio_kvm_device_add_group(group);
2167
2168 QLIST_INIT(&container->group_list);
2169 QLIST_INSERT_HEAD(&space->containers, container, next);
2170
2171 group->container = container;
2172 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2173
2174 container->listener = vfio_memory_listener;
2175
2176 memory_listener_register(&container->listener, container->space->as);
2177
2178 if (container->error) {
2179 ret = -1;
2180 error_propagate_prepend(errp, container->error,
2181 "memory listener initialization failed: ");
2182 goto listener_release_exit;
2183 }
2184
2185 container->initialized = true;
2186
2187 return 0;
2188listener_release_exit:
2189 QLIST_REMOVE(group, container_next);
2190 QLIST_REMOVE(container, next);
2191 vfio_kvm_device_del_group(group);
2192 vfio_listener_release(container);
2193
2194enable_discards_exit:
2195 vfio_ram_block_discard_disable(container, false);
2196
2197free_container_exit:
2198 g_free(container);
2199
2200close_fd_exit:
2201 close(fd);
2202
2203put_space_exit:
2204 vfio_put_address_space(space);
2205
2206 return ret;
2207}
2208
2209static void vfio_disconnect_container(VFIOGroup *group)
2210{
2211 VFIOContainer *container = group->container;
2212
2213 QLIST_REMOVE(group, container_next);
2214 group->container = NULL;

    /*
     * Explicitly release the listener first before unset container,
     * since unset may destroy the backend container if it's the last
     * group.
     */
2221 if (QLIST_EMPTY(&container->group_list)) {
2222 vfio_listener_release(container);
2223 }
2224
2225 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
2226 error_report("vfio: error disconnecting group %d from container",
2227 group->groupid);
2228 }
2229
2230 if (QLIST_EMPTY(&container->group_list)) {
2231 VFIOAddressSpace *space = container->space;
2232 VFIOGuestIOMMU *giommu, *tmp;
2233
2234 QLIST_REMOVE(container, next);
2235
2236 QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
2237 memory_region_unregister_iommu_notifier(
2238 MEMORY_REGION(giommu->iommu), &giommu->n);
2239 QLIST_REMOVE(giommu, giommu_next);
2240 g_free(giommu);
2241 }
2242
2243 trace_vfio_disconnect_container(container->fd);
2244 close(container->fd);
2245 g_free(container);
2246
2247 vfio_put_address_space(space);
2248 }
2249}
2250
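/*
 * Look up the VFIO group for @groupid, opening /dev/vfio/<groupid> and
 * connecting it to a container in address space @as if it is not already
 * known.  Returns NULL and sets @errp on failure.
 */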
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open_old(path, O_RDWR);
    if (group->fd < 0) {
        error_setg_errno(errp, errno, "failed to open %s", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

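/*
 * Drop a group obtained with vfio_get_group().  The group is only torn
 * down once its device_list is empty; the last group also unregisters the
 * VFIO reset handler.
 */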
void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}

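/*
 * Open device @name within @group and populate the common VFIODevice
 * fields (fd, number of regions/IRQs, flags) from VFIO_DEVICE_GET_INFO.
 */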
int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev, Error **errp)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    int ret, fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return fd;
    }

    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return ret;
    }

    /*
     * Set discarding of RAM as not broken for this group if the driver knows
     * the device operates compatibly with discarding.  Setting must be
     * consistent per group, but since compatibility is really only possible
     * with mdev currently, we expect singleton groups.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return -1;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = dev_info.num_irqs;
    vbasedev->num_regions = dev_info.num_regions;
    vbasedev->flags = dev_info.flags;

    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
                          dev_info.num_irqs);

    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    return 0;
}

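/*
 * Detach @vbasedev from its group's device_list and close its fd.  Safe to
 * call on a device that was never attached to a group.
 */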
void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

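/*
 * Fetch region info for region @index.  If the kernel reports a larger
 * argsz (i.e. a capability chain follows), the buffer is grown and the
 * ioctl retried.  On success the caller owns *info and must g_free() it.
 */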
int vfio_get_region_info(VFIODevice *vbasedev, int index,
                         struct vfio_region_info **info)
{
    size_t argsz = sizeof(struct vfio_region_info);

    *info = g_malloc0(argsz);

    (*info)->index = index;
retry:
    (*info)->argsz = argsz;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);

        goto retry;
    }

    return 0;
}

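/*
 * Scan the regions of @vbasedev for a device-specific region whose
 * VFIO_REGION_INFO_CAP_TYPE capability matches @type/@subtype.  Returns 0
 * with *info set (caller frees), or -ENODEV if no such region exists.
 */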
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
                             uint32_t subtype, struct vfio_region_info **info)
{
    int i;

    for (i = 0; i < vbasedev->num_regions; i++) {
        struct vfio_info_cap_header *hdr;
        struct vfio_region_info_cap_type *cap_type;

        if (vfio_get_region_info(vbasedev, i, info)) {
            continue;
        }

        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
        if (!hdr) {
            g_free(*info);
            continue;
        }

        cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);

        trace_vfio_get_dev_region(vbasedev->name, i,
                                  cap_type->type, cap_type->subtype);

        if (cap_type->type == type && cap_type->subtype == subtype) {
            return 0;
        }

        g_free(*info);
    }

    *info = NULL;
    return -ENODEV;
}

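/*
 * Return true if region @region of @vbasedev advertises capability
 * @cap_type in its region info capability chain.
 */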
bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
    struct vfio_region_info *info = NULL;
    bool ret = false;

    if (!vfio_get_region_info(vbasedev, region, &info)) {
        if (vfio_get_region_info_cap(info, cap_type)) {
            ret = true;
        }
        g_free(info);
    }

    return ret;
}

/*
 * Interfaces for IBM EEH (Enhanced Error Handling)
 */
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
    /*
     * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
     * implementation is broken if there are multiple groups in a
     * container.  The hardware works in units of Partitionable
     * Endpoints (== IOMMU groups) and the EEH operations naively
     * iterate across all groups in the container, without any logic
     * to make sure the groups have their state synchronized.  For
     * certain operations (ENABLE) that might be ok, until an error
     * occurs, but for others (GET_STATE) it's clearly broken.
     */

    /*
     * XXX Once fixed kernels exist, test for them here
     */

    if (QLIST_EMPTY(&container->group_list)) {
        return false;
    }

    if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
        return false;
    }

    return true;
}

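/*
 * Issue a single EEH PE operation @op on @container via VFIO_EEH_PE_OP.
 * Refuses containers that do not hold exactly one group.
 */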
static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
{
    struct vfio_eeh_pe_op pe_op = {
        .argsz = sizeof(pe_op),
        .op = op,
    };
    int ret;

    if (!vfio_eeh_container_ok(container)) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
                     "kernel requires a container with exactly one group", op);
        return -EPERM;
    }

    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
    if (ret < 0) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
        return -errno;
    }

    return ret;
}

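/*
 * Return the single container serving @as, or NULL if the address space
 * has no container or more than one.
 */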
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);
    VFIOContainer *container = NULL;

    if (QLIST_EMPTY(&space->containers)) {
        /* No containers to act on */
        goto out;
    }

    container = QLIST_FIRST(&space->containers);

    if (QLIST_NEXT(container, next)) {
        /*
         * We don't yet have logic to synchronize EEH state across
         * multiple containers
         */
        container = NULL;
        goto out;
    }

out:
    vfio_put_address_space(space);
    return container;
}

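/*
 * Return true if EEH operations can be used on @as, i.e. the address space
 * is backed by exactly one container holding exactly one group.
 */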
bool vfio_eeh_as_ok(AddressSpace *as)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    return (container != NULL) && vfio_eeh_container_ok(container);
}

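/*
 * Perform EEH PE operation @op on the container backing @as.  Returns
 * -ENODEV if the address space has no usable container.
 */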
int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return -ENODEV;
    }
    return vfio_eeh_container_op(container, op);
}