/*
 * generic functions used by VFIO devices
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#ifdef CONFIG_KVM
#include <linux/kvm.h>
#endif
#include <linux/vfio.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "exec/address-spaces.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/runstate.h"
#include "trace.h"
#include "qapi/error.h"
#include "migration/migration.h"

VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);

#ifdef CONFIG_KVM
/*
 * We have a single VFIO pseudo device per KVM VM.  Once created it lives
 * for the life of the VM.  Closing the file descriptor only drops our
 * reference to it and the device's reference to kvm.  Therefore once
 * initialized, this file descriptor is only released on QEMU exit and
 * we'll re-use it should another vfio device be attached before then.
 */
static int vfio_kvm_device_fd = -1;
#endif

/*
 * Common VFIO interrupt disable
 */
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
        .index = index,
        .start = 0,
        .count = 0,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}

void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
    struct vfio_irq_set irq_set = {
        .argsz = sizeof(irq_set),
        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
        .index = index,
        .start = 0,
        .count = 1,
    };

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
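
/*
 * All three helpers above use VFIO_IRQ_SET_DATA_NONE, i.e. an action with
 * no eventfd payload.  Per the VFIO uAPI, TRIGGER with count = 0 tears
 * down all signaling for the index (hence "disable"), while MASK/UNMASK
 * with start = 0, count = 1 act on subindex 0 only.
 */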

static inline const char *action_to_str(int action)
{
    switch (action) {
    case VFIO_IRQ_SET_ACTION_MASK:
        return "MASK";
    case VFIO_IRQ_SET_ACTION_UNMASK:
        return "UNMASK";
    case VFIO_IRQ_SET_ACTION_TRIGGER:
        return "TRIGGER";
    default:
        return "UNKNOWN ACTION";
    }
}

static const char *index_to_str(VFIODevice *vbasedev, int index)
{
    if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
        return NULL;
    }

    switch (index) {
    case VFIO_PCI_INTX_IRQ_INDEX:
        return "INTX";
    case VFIO_PCI_MSI_IRQ_INDEX:
        return "MSI";
    case VFIO_PCI_MSIX_IRQ_INDEX:
        return "MSIX";
    case VFIO_PCI_ERR_IRQ_INDEX:
        return "ERR";
    case VFIO_PCI_REQ_IRQ_INDEX:
        return "REQ";
    default:
        return NULL;
    }
}

static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
{
    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
        return ram_block_uncoordinated_discard_disable(state);
    default:
        /*
         * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
         * RamDiscardManager, however, it is completely untested.
         *
         * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
         * completely the opposite of managing mapping/pinning dynamically as
         * required by RamDiscardManager. We would have to special-case
         * sections with a RamDiscardManager.
         */
        return ram_block_discard_disable(state);
    }
}

int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                           int action, int fd, Error **errp)
{
    struct vfio_irq_set *irq_set;
    int argsz, ret = 0;
    const char *name;
    int32_t *pfd;

    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
    irq_set->index = index;
    irq_set->start = subindex;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = fd;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
        ret = -errno;
    }
    g_free(irq_set);

    if (!ret) {
        return 0;
    }

    error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");

    name = index_to_str(vbasedev, index);
    if (name) {
        error_prepend(errp, "%s-%d: ", name, subindex);
    } else {
        error_prepend(errp, "index %d-%d: ", index, subindex);
    }
    error_prepend(errp,
                  "Failed to %s %s eventfd signaling for interrupt ",
                  fd < 0 ? "tear down" : "set up", action_to_str(action));
    return ret;
}
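
/*
 * Typical usage sketch (not lifted from a specific caller): wire an
 * eventfd into an interrupt index with a TRIGGER action, and tear the
 * routing down again by passing fd = -1.
 *
 *     Error *err = NULL;
 *     int fd = event_notifier_get_fd(&notifier);
 *
 *     vfio_set_irq_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
 *                            VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err);
 *     ...
 *     vfio_set_irq_signaling(vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, nr,
 *                            VFIO_IRQ_SET_ACTION_TRIGGER, -1, &err);
 */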

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    case 8:
        buf.qword = cpu_to_le64(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %u bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This does
     * nothing if the device is not in INTx mode or no interrupt is
     * pending.  The assumption is that a BAR access is the guest
     * servicing an interrupt, so re-enable INTx here.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}

uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    case 8:
        data = le64_to_cpu(buf.qword);
        break;
    default:
        hw_error("vfio: unsupported read size, %u bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same as write above */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}

const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};
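
/*
 * vfio_region_ops backs the MemoryRegion created by vfio_region_setup()
 * below.  Accesses only reach these callbacks when they are not satisfied
 * by an mmap'd subregion (see vfio_region_mmap()); they are then turned
 * into pread()/pwrite() on the device fd at the region's file offset.
 */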

/*
 * Device state interfaces
 */

bool vfio_mig_active(void)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    if (QLIST_EMPTY(&vfio_group_list)) {
        return false;
    }

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->migration_blocker) {
                return false;
            }
        }
    }
    return true;
}

static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;
    MigrationState *ms = migrate_get_current();

    if (!migration_is_setup_or_active(ms->state)) {
        return false;
    }

    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            VFIOMigration *migration = vbasedev->migration;

            if (!migration) {
                return false;
            }

            if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
                && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
                return false;
            }
        }
    }
    return true;
}

static bool vfio_devices_all_running_and_saving(VFIOContainer *container)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;
    MigrationState *ms = migrate_get_current();

    if (!migration_is_setup_or_active(ms->state)) {
        return false;
    }

    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            VFIOMigration *migration = vbasedev->migration;

            if (!migration) {
                return false;
            }

            if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) &&
                (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
                continue;
            } else {
                return false;
            }
        }
    }
    return true;
}

static int vfio_dma_unmap_bitmap(VFIOContainer *container,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
    int ret;

    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->iova = iova;
    unmap->size = size;
    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
     * to qemu_real_host_page_size.
     */
    bitmap->pgsize = qemu_real_host_page_size;
    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                   BITS_PER_BYTE;

    if (bitmap->size > container->max_dirty_bitmap_size) {
        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
                     (uint64_t)bitmap->size);
        ret = -E2BIG;
        goto unmap_exit;
    }

    bitmap->data = g_try_malloc0(bitmap->size);
    if (!bitmap->data) {
        ret = -ENOMEM;
        goto unmap_exit;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    if (!ret) {
        cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
                                               iotlb->translated_addr, pages);
    } else {
        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
    }

    g_free(bitmap->data);
unmap_exit:
    g_free(unmap);
    return ret;
}
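
/*
 * Worked example of the bitmap sizing above, assuming a 4 KiB host page
 * size: unmapping 1 GiB covers 262144 pages; ROUND_UP to a multiple of
 * 64 bits leaves 262144 bits, and dividing by BITS_PER_BYTE yields a
 * 32 KiB buffer, one bit per page, which is what the kernel fills in.
 */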

/*
 * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
 */
static int vfio_dma_unmap(VFIOContainer *container,
                          hwaddr iova, ram_addr_t size,
                          IOMMUTLBEntry *iotlb)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };

    if (iotlb && container->dirty_pages_supported &&
        vfio_devices_all_running_and_saving(container)) {
        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
    }

    while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
         * v4.15): an unmap whose end wraps to zero at the top of the 64-bit
         * address space fails with EINVAL.  Detect that case here and retry
         * with the size trimmed by the smallest supported IOMMU page; since
         * such a range can only have been created by multiple mappings, the
         * trimmed unmap still ends on a mapping boundary and the final page
         * is covered on a later unmap.
         */
        if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
            container->iommu_type == VFIO_TYPE1v2_IOMMU) {
            trace_vfio_dma_unmap_overflow_workaround();
            unmap.size -= 1ULL << ctz64(container->pgsizes);
            continue;
        }
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }

    return 0;
}

static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * Try the mapping, if it fails with EBUSY, unmap the region and try
     * again.  This shouldn't be necessary, but we sometimes see it in
     * the VGA ROM space.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
    return -errno;
}
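
/*
 * Within this file, vfio_dma_map()/vfio_dma_unmap() are the only paths
 * through which guest memory enters or leaves a container's IOMMU
 * tables; the MemoryListener below and the vIOMMU notifier both funnel
 * into them, so every successful map is expected to be balanced by an
 * unmap of the same extent.
 */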

static void vfio_host_win_add(VFIOContainer *container,
                              hwaddr min_iova, hwaddr max_iova,
                              uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (ranges_overlap(hostwin->min_iova,
                           hostwin->max_iova - hostwin->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: overlapping host DMA windows are not supported",
                     __func__);
        }
    }

    hostwin = g_malloc0(sizeof(*hostwin));

    hostwin->min_iova = min_iova;
    hostwin->max_iova = max_iova;
    hostwin->iova_pgsizes = iova_pgsizes;
    QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
}

static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
                             hwaddr max_iova)
{
    VFIOHostDMAWindow *hostwin;

    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
            return 0;
        }
    }

    return -1;
}

static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    return (!memory_region_is_ram(section->mr) &&
            !memory_region_is_iommu(section->mr)) ||
           memory_region_is_protected(section->mr) ||
           /*
            * Sizing an enabled 64-bit BAR can cause spurious mappings to
            * addresses in the upper part of the 64-bit address space.  These
            * are never accessed by the CPU and beyond the address width of
            * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
            */
           section->offset_within_address_space & (1ULL << 63);
}

/* Called with rcu_read_lock held.  */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
                               ram_addr_t *ram_addr, bool *read_only)
{
    MemoryRegion *mr;
    hwaddr xlat;
    hwaddr len = iotlb->addr_mask + 1;
    bool writable = iotlb->perm & IOMMU_WO;

    /*
     * The IOMMU TLB entry we have just covers translation through
     * this IOMMU to its immediate target.  We need to translate
     * to the actual memory backing the IOMMU target.
     */
    mr = address_space_translate(&address_space_memory,
                                 iotlb->translated_addr,
                                 &xlat, &len, writable,
                                 MEMTXATTRS_UNSPECIFIED);
    if (!memory_region_is_ram(mr)) {
        error_report("iommu map to non memory area %"HWADDR_PRIx"",
                     xlat);
        return false;
    } else if (memory_region_has_ram_discard_manager(mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr);
        MemoryRegionSection tmp = {
            .mr = mr,
            .offset_within_region = xlat,
            .size = int128_make64(len),
        };

        /*
         * Malicious VMs can map memory into the IOMMU, which is expected
         * to remain discarded. vfio will pin all pages, populating memory.
         * Disallow that. vmstate priorities make sure any RamDiscardManager
         * changes that would change the page state are restored before
         * IOMMU mappings that might trigger such state changes.
         */
        if (!ram_discard_manager_is_populated(rdm, &tmp)) {
            error_report("iommu map to discarded memory (e.g., unplugged via"
                         " virtio-mem): %"HWADDR_PRIx"",
                         iotlb->translated_addr);
            return false;
        }

        /*
         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
         * pages will remain pinned inside vfio until unmapped, resulting in a
         * higher memory consumption than expected. If memory would get
         * populated again later, there would be an inconsistency between pages
         * pinned by vfio and pages seen by QEMU. This is the case until
         * unmapped from the IOMMU (e.g., during device reset).
         *
         * With malicious guests, we really only care about pinning more memory
         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
         * exceeded and can be used to mitigate this problem.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate"
                         " by setting/adjusting RLIMIT_MEMLOCK.");
    }

    /*
     * Translation truncates length to the IOMMU page size,
     * check that it did not truncate too much.
     */
    if (len & iotlb->addr_mask) {
        error_report("iommu has granularity incompatible with target AS");
        return false;
    }

    if (vaddr) {
        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
    }

    if (ram_addr) {
        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
    }

    if (read_only) {
        *read_only = !writable || mr->readonly;
    }

    return true;
}

static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    void *vaddr;
    int ret;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
            goto out;
        }
        /*
         * vaddr is only valid until rcu_read_unlock(). But after
         * vfio_dma_map has set up the mapping the pages will be
         * pinned by the kernel. This makes sure that the RAM backend
         * of vaddr will always be there, even if the RAM gets
         * discarded (e.g. virtio-balloon) and reused.
         */
        ret = vfio_dma_map(container, iova,
                           iotlb->addr_mask + 1, vaddr,
                           read_only);
        if (ret) {
            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, vaddr, ret);
        }
    } else {
        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
out:
    rcu_read_unlock();
}

static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    int ret;

    /* Unmap with a single call. */
    ret = vfio_dma_unmap(vrdl->container, iova, size, NULL);
    if (ret) {
        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
                     strerror(-ret));
    }
}

static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Map in (aligned within memory region) minimum granularity, so we can
     * unmap in minimum granularity later.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_dma_map(vrdl->container, iova, next - start,
                           vaddr, section->readonly);
        if (ret) {
            /* Rollback */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}
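
/*
 * Chunking example for the populate handler above (illustrative numbers):
 * with a granularity of 2 MiB, populating [0x100000, 0x500000) within the
 * region is mapped as three pieces, [0x100000, 0x200000),
 * [0x200000, 0x400000) and [0x400000, 0x500000), so a later discard,
 * which happens at granularity boundaries, never has to split an
 * existing mapping.
 */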

static void vfio_register_ram_discard_listener(VFIOContainer *container,
                                               MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl;

    /* Ignore some corner cases not relevant in practice. */
    g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
                             TARGET_PAGE_SIZE));
    g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));

    vrdl = g_new0(VFIORamDiscardListener, 1);
    vrdl->container = container;
    vrdl->mr = section->mr;
    vrdl->offset_within_address_space = section->offset_within_address_space;
    vrdl->size = int128_get64(section->size);
    vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
                                                                section->mr);

    g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
    g_assert(container->pgsizes &&
             vrdl->granularity >= 1ULL << ctz64(container->pgsizes));

    ram_discard_listener_init(&vrdl->listener,
                              vfio_ram_discard_notify_populate,
                              vfio_ram_discard_notify_discard, true);
    ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
    QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);

    /*
     * Sanity-check if we have a theoretically problematic setup where we could
     * exceed the maximum number of possible DMA mappings over time. We assume
     * that each mapped section in the same address space as a RamDiscardManager
     * section consumes exactly one DMA mapping, with the exception of
     * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
     * in the address space.
     *
     * We assume that each section in the address space consumes one memslot.
     * We take the number of KVM memory slots as a best guess for the maximum
     * number of sections in the address space we could have over time,
     * also consuming DMA mappings.
     */
    if (container->dma_max_mappings) {
        unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;

#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            max_memslots = kvm_get_max_memslots();
        }
#endif

        QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
            hwaddr start, end;

            start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
                                    vrdl->granularity);
            end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
                           vrdl->granularity);
            vrdl_mappings += (end - start) / vrdl->granularity;
            vrdl_count++;
        }

        if (vrdl_mappings + max_memslots - vrdl_count >
            container->dma_max_mappings) {
            warn_report("%s: possibly running out of DMA mappings. E.g., try"
                        " increasing the 'block-size' of virtio-mem devices."
                        " Maximum possible DMA mappings: %d, Maximum possible"
                        " memslots: %d", __func__, container->dma_max_mappings,
                        max_memslots);
        }
    }
}
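
/*
 * Estimate example (illustrative numbers): a single 8 GiB virtio-mem
 * region with a 2 MiB block size contributes 4096 potential mappings;
 * with the default 512 memslots and one vrdl registered, the check above
 * compares 4096 + 512 - 1 = 4607 against dma_max_mappings (65535 by
 * default on type1), which passes comfortably.
 */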

static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}

static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found;
    Error *err = NULL;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));

    if (int128_ge(int128_make64(iova), llend)) {
        if (memory_region_is_ram_device(section->mr)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                qemu_real_host_page_size);
        }
        return;
    }
    end = int128_get64(int128_sub(llend, int128_one()));

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        hwaddr pgsize = 0;

        /* For now intersections are not allowed, we may relax this later */
        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
            if (ranges_overlap(hostwin->min_iova,
                               hostwin->max_iova - hostwin->min_iova + 1,
                               section->offset_within_address_space,
                               int128_get64(section->size))) {
                error_setg(&err,
                    "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
                    " host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                    section->offset_within_address_space,
                    section->offset_within_address_space +
                        int128_get64(section->size) - 1,
                    hostwin->min_iova, hostwin->max_iova);
                goto fail;
            }
        }

        ret = vfio_spapr_create_window(container, section, &pgsize);
        if (ret) {
            error_setg_errno(&err, -ret, "Failed to create SPAPR window");
            goto fail;
        }

        vfio_host_win_add(container, section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            VFIOGroup *group;
            IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
            struct kvm_vfio_spapr_tce param;
            struct kvm_device_attr attr = {
                .group = KVM_DEV_VFIO_GROUP,
                .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
                .addr = (uint64_t)(unsigned long)&param,
            };

            if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                              &param.tablefd)) {
                QLIST_FOREACH(group, &container->group_list, container_next) {
                    param.groupfd = group->fd;
                    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                        error_report("vfio: failed to setup fd %d "
                                     "for a group with fd %d: %s",
                                     param.tablefd, param.groupfd,
                                     strerror(errno));
                        return;
                    }
                    trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
                }
            }
        }
#endif
    }

    hostwin_found = false;
    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    if (!hostwin_found) {
        error_setg(&err, "Container %p can't map guest IOVA region"
                   " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
        goto fail;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(iova, end);
        /*
         * FIXME: For VFIO iommu types which have KVM acceleration to
         * avoid bouncing all map/unmaps through qemu this way, this
         * would be the right place to wire that up (tell the KVM
         * device emulation the VFIO iommu handles to use).
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->container = container;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_iommu_set_page_size_mask(giommu->iommu,
                                                     container->pgsizes,
                                                     &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu, &giommu->n);

        return;
    }

    /*
     * For a RAM memory region with a RamDiscardManager, we only want to map
     * the actually populated parts - and update the mapping whenever we're
     * notified about changes.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_register_ram_discard_listener(container, section);
        return;
    }

    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;

        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_dma_map(container, iova, int128_get64(llsize),
                       vaddr, section->readonly);
    if (ret) {
        error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%m)",
                   container, iova, int128_get64(llsize), vaddr, ret);
        if (memory_region_is_ram_device(section->mr)) {
            /* Allow unexpected mappings not to be fatal for RAM devices */
            error_report_err(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (memory_region_is_ram_device(section->mr)) {
        error_report("failed to vfio_dma_map. pci p2p may not work");
        return;
    }
    /*
     * On the initfn path, store the first error in the container so we
     * can gracefully fail.  Runtime, there's not much we can do other
     * than throw a hardware error.
     */
    if (!container->initialized) {
        if (!container->error) {
            error_propagate_prepend(&container->error, err,
                                    "Region %s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}

static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * FIXME: We assume the one big unmap below is adequate to
         * remove any individual page mappings in the IOMMU which
         * might have been copied into VFIO. This works for a page table
         * based IOMMU where a big unmap flattens a large range of IO-PTEs.
         * That may not be true for all IOMMU types.
         */
    }

    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }
    end = int128_get64(int128_sub(llend, int128_one()));

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;
        VFIOHostDMAWindow *hostwin;
        bool hostwin_found = false;

        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
            if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
                hostwin_found = true;
                break;
            }
        }
        assert(hostwin_found); /* or region_add() would have failed */

        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_unregister_ram_discard_listener(container, section);
        /* Unregistering will trigger an unmap. */
        try_unmap = false;
    }

    if (try_unmap) {
        if (int128_eq(llsize, int128_2_64())) {
            /* The unmap ioctl doesn't accept a full 64-bit span. */
            llsize = int128_rshift(llsize, 1);
            ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
            if (ret) {
                error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx") = %d (%m)",
                             container, iova, int128_get64(llsize), ret);
            }
            iova += int128_get64(llsize);
        }
        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova, int128_get64(llsize), ret);
        }
    }

    memory_region_unref(section->mr);

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        vfio_spapr_remove_window(container,
                                 section->offset_within_address_space);
        if (vfio_host_win_del(container,
                              section->offset_within_address_space,
                              section->offset_within_address_space +
                              int128_get64(section->size) - 1) < 0) {
            hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                     __func__, section->offset_within_address_space);
        }
    }
}

static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
{
    int ret;
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
    };

    if (start) {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
    } else {
        dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
    if (ret) {
        error_report("Failed to set dirty tracking flag 0x%x errno: %d",
                     dirty.flags, errno);
    }
}

static void vfio_listener_log_global_start(MemoryListener *listener)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);

    vfio_set_dirty_page_tracking(container, true);
}

static void vfio_listener_log_global_stop(MemoryListener *listener)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);

    vfio_set_dirty_page_tracking(container, false);
}

static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
                                 uint64_t size, ram_addr_t ram_addr)
{
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    uint64_t pages;
    int ret;

    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
     * to qemu_real_host_page_size.
     */
    range->bitmap.pgsize = qemu_real_host_page_size;

    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                         BITS_PER_BYTE;
    range->bitmap.data = g_try_malloc0(range->bitmap.size);
    if (!range->bitmap.data) {
        ret = -ENOMEM;
        goto err_out;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
    if (ret) {
        error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
                     " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
                     (uint64_t)range->size, errno);
        goto err_out;
    }

    cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
                                           ram_addr, pages);

    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
                                range->bitmap.size, ram_addr);
err_out:
    g_free(range->bitmap.data);
    g_free(dbitmap);

    return ret;
}

typedef struct {
    IOMMUNotifier n;
    VFIOGuestIOMMU *giommu;
} vfio_giommu_dirty_notifier;

static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    vfio_giommu_dirty_notifier *gdn = container_of(n,
                                                vfio_giommu_dirty_notifier, n);
    VFIOGuestIOMMU *giommu = gdn->giommu;
    VFIOContainer *container = giommu->container;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    ram_addr_t translated_addr;

    trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        return;
    }

    rcu_read_lock();
    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
        int ret;

        ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
                                    translated_addr);
        if (ret) {
            error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
    rcu_read_unlock();
}

static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
                                             void *opaque)
{
    const hwaddr size = int128_get64(section->size);
    const hwaddr iova = section->offset_within_address_space;
    const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
                                section->offset_within_region;
    VFIORamDiscardListener *vrdl = opaque;

    /*
     * Sync the whole mapped region (spanning multiple individual mappings)
     * in one go.
     */
    return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
}

static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
                                                   MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to sync missing RAM discard listener");
    }

    /*
     * We only want/can synchronize the bitmap for actually mapped parts -
     * which correspond to populated parts. Replay all populated parts.
     * Note: the callback expects the listener itself as opaque, so pass
     * vrdl, not its address.
     */
    return ram_discard_manager_replay_populated(rdm, section,
                                                vfio_ram_discard_get_dirty_bitmap,
                                                vrdl);
}

static int vfio_sync_dirty_bitmap(VFIOContainer *container,
                                  MemoryRegionSection *section)
{
    ram_addr_t ram_addr;

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                Int128 llend;
                vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
                int idx = memory_region_iommu_attrs_to_index(giommu->iommu,
                                                       MEMTXATTRS_UNSPECIFIED);

                llend = int128_add(int128_make64(section->offset_within_region),
                                   section->size);
                llend = int128_sub(llend, int128_one());

                iommu_notifier_init(&gdn.n,
                                    vfio_iommu_map_dirty_notify,
                                    IOMMU_NOTIFIER_MAP,
                                    section->offset_within_region,
                                    int128_get64(llend),
                                    idx);
                memory_region_iommu_replay(giommu->iommu, &gdn.n);
                break;
            }
        }
        return 0;
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
    }

    ram_addr = memory_region_get_ram_addr(section->mr) +
               section->offset_within_region;

    return vfio_get_dirty_bitmap(container,
                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
                   int128_get64(section->size), ram_addr);
}

static void vfio_listener_log_sync(MemoryListener *listener,
                                   MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);

    if (vfio_listener_skipped_section(section) ||
        !container->dirty_pages_supported) {
        return;
    }

    if (vfio_devices_all_dirty_tracking(container)) {
        vfio_sync_dirty_bitmap(container, section);
    }
}

static const MemoryListener vfio_memory_listener = {
    .name = "vfio",
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
    .log_global_start = vfio_listener_log_global_start,
    .log_global_stop = vfio_listener_log_global_stop,
    .log_sync = vfio_listener_log_sync,
};

static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->listener);
    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        memory_listener_unregister(&container->prereg_listener);
    }
}

static struct vfio_info_cap_header *
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
{
    struct vfio_info_cap_header *hdr;

    for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}
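
/*
 * A capability chain is a linked list of struct vfio_info_cap_header
 * embedded in the info buffer: cap_offset locates the first header and
 * each header's 'next' field is again an offset from the buffer start,
 * with 0 terminating the list.  The 'hdr != ptr' test above exploits
 * this: a next of 0 computes back to the buffer start and ends the walk.
 */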

struct vfio_info_cap_header *
vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

static struct vfio_info_cap_header *
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

struct vfio_info_cap_header *
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
{
    if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
        return NULL;
    }

    return vfio_get_cap((void *)info, info->cap_offset, id);
}

bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
                             unsigned int *avail)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_dma_avail *cap;

    /* If the capability cannot be found, assume no DMA limiting */
    hdr = vfio_get_iommu_type1_info_cap(info,
                                        VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
    if (hdr == NULL) {
        return false;
    }

    if (avail != NULL) {
        cap = (void *) hdr;
        *avail = cap->avail;
    }

    return true;
}

static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
                                          struct vfio_region_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_sparse_mmap *sparse;
    int i, j;

    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
    if (!hdr) {
        return -ENODEV;
    }

    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);

    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                         region->nr, sparse->nr_areas);

    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);

    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
        trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
                                            sparse->areas[i].offset +
                                            sparse->areas[i].size);

        if (sparse->areas[i].size) {
            region->mmaps[j].offset = sparse->areas[i].offset;
            region->mmaps[j].size = sparse->areas[i].size;
            j++;
        }
    }

    region->nr_mmaps = j;
    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));

    return 0;
}
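
/*
 * Sparse mmap example (illustrative layout): a PCI BAR holding an MSI-X
 * table at offset 0x3000 could advertise two areas, [0x0, 0x3000) and
 * [0x4000, size), so everything except the table page gets mmap'd while
 * accesses to the table still trap through vfio_region_ops above.
 */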

int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
{
    struct vfio_region_info *info;
    int ret;

    ret = vfio_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    region->vbasedev = vbasedev;
    region->flags = info->flags;
    region->size = info->size;
    region->fd_offset = info->offset;
    region->nr = index;

    if (region->size) {
        region->mem = g_new0(MemoryRegion, 1);
        memory_region_init_io(region->mem, obj, &vfio_region_ops,
                              region, name, region->size);

        if (!vbasedev->no_mmap &&
            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {

            ret = vfio_setup_region_sparse_mmaps(region, info);

            if (ret) {
                region->nr_mmaps = 1;
                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                region->mmaps[0].offset = 0;
                region->mmaps[0].size = region->size;
            }
        }
    }

    g_free(info);

    trace_vfio_region_setup(vbasedev->name, index, name,
                            region->flags, region->fd_offset, region->size);
    return 0;
}

static void vfio_subregion_unmap(VFIORegion *region, int index)
{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
    region->mmaps[index].mmap = NULL;
}

int vfio_region_mmap(VFIORegion *region)
{
    int i, prot = 0;
    char *name;

    if (!region->mem) {
        return 0;
    }

    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;

    for (i = 0; i < region->nr_mmaps; i++) {
        region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
                                     MAP_SHARED, region->vbasedev->fd,
                                     region->fd_offset +
                                     region->mmaps[i].offset);
        if (region->mmaps[i].mmap == MAP_FAILED) {
            int ret = -errno;

            trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
                                         region->fd_offset +
                                         region->mmaps[i].offset,
                                         region->fd_offset +
                                         region->mmaps[i].offset +
                                         region->mmaps[i].size - 1, ret);

            region->mmaps[i].mmap = NULL;

            for (i--; i >= 0; i--) {
                vfio_subregion_unmap(region, i);
            }

            return ret;
        }

        name = g_strdup_printf("%s mmaps[%d]",
                               memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
    }

    return 0;
}

void vfio_region_unmap(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            vfio_subregion_unmap(region, i);
        }
    }
}

void vfio_region_exit(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
        }
    }

    trace_vfio_region_exit(region->vbasedev->name, region->nr);
}

void vfio_region_finalize(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
        }
    }

    object_unparent(OBJECT(region->mem));

    g_free(region->mem);
    g_free(region->mmaps);

    trace_vfio_region_finalize(region->vbasedev->name, region->nr);

    region->mem = NULL;
    region->mmaps = NULL;
    region->nr_mmaps = 0;
    region->size = 0;
    region->flags = 0;
    region->nr = 0;
}

void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
        }
    }

    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
                                        enabled);
}

void vfio_reset_handler(void *opaque)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->dev->realized) {
                vbasedev->ops->vfio_compute_needs_reset(vbasedev);
            }
        }
    }

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->dev->realized && vbasedev->needs_reset) {
                vbasedev->ops->vfio_hot_reset_multi(vbasedev);
            }
        }
    }
}

static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            error_report("Failed to create KVM VFIO device: %m");
            return;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (vfio_kvm_device_fd < 0) {
        return;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}

static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *space;

    QLIST_FOREACH(space, &vfio_address_spaces, list) {
        if (space->as == as) {
            return space;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    space = g_malloc0(sizeof(*space));
    space->as = as;
    QLIST_INIT(&space->containers);

    QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);

    return space;
}

static void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (QLIST_EMPTY(&space->containers)) {
        QLIST_REMOVE(space, list);
        g_free(space);
    }
}

/*
 * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
 */
static int vfio_get_iommu_type(VFIOContainer *container,
                               Error **errp)
{
    int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
                          VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
    int i;

    for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
        if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
            return iommu_types[i];
        }
    }
    error_setg(errp, "No available IOMMU models");
    return -EINVAL;
}

static int vfio_init_container(VFIOContainer *container, int group_fd,
                               Error **errp)
{
    int iommu_type, ret;

    iommu_type = vfio_get_iommu_type(container, errp);
    if (iommu_type < 0) {
        return iommu_type;
    }

    ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
    if (ret) {
        error_setg_errno(errp, errno, "Failed to set group container");
        return -errno;
    }

    while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
        if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * On sPAPR, despite the IOMMU subdriver always advertises v1 and
             * v2, the running platform may not support v2 and there is no
             * way to guess it until an IOMMU group gets added to the
             * container.  So in case it fails with v2, try v1 as a fallback.
             */
            iommu_type = VFIO_SPAPR_TCE_IOMMU;
            continue;
        }
        error_setg_errno(errp, errno, "Failed to set iommu for container");
        return -errno;
    }

    container->iommu_type = iommu_type;
    return 0;
}

static int vfio_get_iommu_info(VFIOContainer *container,
                               struct vfio_iommu_type1_info **info)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_info);

    *info = g_new0(struct vfio_iommu_type1_info, 1);
again:
    (*info)->argsz = argsz;

    if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if (((*info)->argsz > argsz)) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);
        goto again;
    }

    return 0;
}
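
/*
 * The argsz handshake above follows the usual VFIO convention: the kernel
 * writes the size it actually needs back into argsz when the supplied
 * buffer is too small (e.g. when a capability chain is present), and the
 * retry with the enlarged buffer then picks up the full payload.
 */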

static struct vfio_info_cap_header *
vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    struct vfio_info_cap_header *hdr;
    void *ptr = info;

    if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
        return NULL;
    }

    for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
        if (hdr->id == id) {
            return hdr;
        }
    }

    return NULL;
}

static void vfio_get_iommu_info_migration(VFIOContainer *container,
                                          struct vfio_iommu_type1_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_iommu_type1_info_cap_migration *cap_mig;

    hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
    if (!hdr) {
        return;
    }

    cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
                           header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty.
     */
    if (cap_mig->pgsize_bitmap & qemu_real_host_page_size) {
        container->dirty_pages_supported = true;
        container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
        container->dirty_pgsizes = cap_mig->pgsize_bitmap;
    }
}

static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
                                  Error **errp)
{
    VFIOContainer *container;
    int ret, fd;
    VFIOAddressSpace *space;

    space = vfio_get_address_space(as);

    /*
     * VFIO is currently incompatible with discarding of RAM insofar as the
     * madvise to purge (zap) the page from QEMU's address space does not
     * interact with the memory API and therefore leaves stale virtual to
     * physical mappings in the IOMMU if the page was previously pinned.  We
     * therefore set discarding broken for each group added to a container,
     * whether the container is used individually or shared.  This provides
     * us with options to allow devices within a group to opt-in and allow
     * discarding, so long as it is done consistently for a group (for instance
     * if the device is an mdev device where it is known that the host vendor
     * driver will never pin pages outside of the working set of the guest
     * driver, which would thus not be discarding candidates).
     *
     * The first opportunity to induce pinning occurs here where we attempt to
     * attach the group to existing containers within the AddressSpace.  If any
     * pages are already zapped from the virtual address space, such as from
     * previous discards, new pinning will cause valid mappings to be
     * re-established.  Likewise, when the overall MemoryListener for a new
     * container is registered, a replay of mappings within the AddressSpace
     * will occur, re-establishing any previously zapped pages as well.
     *
     * Coordinated discarding of RAM via the RamDiscardManager (e.g.,
     * virtio-mem) is the exception: there the discard granularity matches
     * the mapping granularity and every discard is accompanied by an unmap,
     * so it remains allowed while uncoordinated discarding is disabled (see
     * vfio_ram_block_discard_disable()).
     */
    QLIST_FOREACH(container, &space->containers, next) {
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            ret = vfio_ram_block_discard_disable(container, true);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "Cannot set discarding of RAM broken");
                if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
                          &container->fd)) {
                    error_report("vfio: error disconnecting group %d from"
                                 " container", group->groupid);
                }
                return ret;
            }
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
            vfio_kvm_device_add_group(group);
            return 0;
        }
    }

    fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
    if (fd < 0) {
        error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
        ret = -errno;
        goto put_space_exit;
    }

    ret = ioctl(fd, VFIO_GET_API_VERSION);
    if (ret != VFIO_API_VERSION) {
        error_setg(errp, "supported vfio version: %d, "
                   "reported version: %d", VFIO_API_VERSION, ret);
        ret = -EINVAL;
        goto close_fd_exit;
    }

    container = g_malloc0(sizeof(*container));
    container->space = space;
    container->fd = fd;
    container->error = NULL;
    container->dirty_pages_supported = false;
    container->dma_max_mappings = 0;
    QLIST_INIT(&container->giommu_list);
    QLIST_INIT(&container->hostwin_list);
    QLIST_INIT(&container->vrdl_list);

    ret = vfio_init_container(container, group->fd, errp);
    if (ret) {
        goto free_container_exit;
    }

    ret = vfio_ram_block_discard_disable(container, true);
    if (ret) {
        error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
        goto free_container_exit;
    }

    switch (container->iommu_type) {
    case VFIO_TYPE1v2_IOMMU:
    case VFIO_TYPE1_IOMMU:
    {
        struct vfio_iommu_type1_info *info;

        /*
         * FIXME: This assumes that a Type1 IOMMU can map any 64-bit
         * IOVA whatsoever.  That's not actually true, but the current
         * kernel interface doesn't tell us what it can map, and the
         * existing Type1 IOMMUs generally support any IOVA we're
         * going to actually try in practice.
         */
        ret = vfio_get_iommu_info(container, &info);
        if (ret) {
            /*
             * vfio_get_iommu_info() frees and NULLs *info on failure;
             * fall back to a zeroed struct so the defaults below apply.
             */
            info = g_new0(struct vfio_iommu_type1_info, 1);
        }

        if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) {
            /* Assume 4k IOVA page size */
            info->iova_pgsizes = 4096;
        }
        vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes);
        container->pgsizes = info->iova_pgsizes;

        /* The default in the kernel ("dma_entry_limit") is 65535. */
        container->dma_max_mappings = 65535;
        if (!ret) {
            vfio_get_info_dma_avail(info, &container->dma_max_mappings);
            vfio_get_iommu_info_migration(container, info);
        }
        g_free(info);
        break;
    }
    case VFIO_SPAPR_TCE_v2_IOMMU:
    case VFIO_SPAPR_TCE_IOMMU:
    {
        struct vfio_iommu_spapr_tce_info info;
        bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;

        /*
         * The host kernel code implementing VFIO_IOMMU_DISABLE is called
         * when container fd is closed so we do not call it explicitly
         * in this file.
         */
        if (!v2) {
            ret = ioctl(fd, VFIO_IOMMU_ENABLE);
            if (ret) {
                error_setg_errno(errp, errno, "failed to enable container");
                ret = -errno;
                goto enable_discards_exit;
            }
        } else {
            container->prereg_listener = vfio_prereg_listener;

            memory_listener_register(&container->prereg_listener,
                                     &address_space_memory);
            if (container->error) {
                memory_listener_unregister(&container->prereg_listener);
                ret = -1;
                error_propagate_prepend(errp, container->error,
                    "RAM memory listener initialization failed: ");
                goto enable_discards_exit;
            }
        }

        info.argsz = sizeof(info);
        ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
        if (ret) {
            error_setg_errno(errp, errno,
                             "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
            ret = -errno;
            if (v2) {
                memory_listener_unregister(&container->prereg_listener);
            }
            goto enable_discards_exit;
        }

        if (v2) {
            container->pgsizes = info.ddw.pgsizes;
            /*
             * There is a default window in just created container.
             * To make region_add/del simpler, we better remove this
             * window now and let those iommu_listener callbacks
             * create/remove them when needed.
             */
            ret = vfio_spapr_remove_window(container, info.dma32_window_start);
            if (ret) {
                error_setg_errno(errp, -ret,
                                 "failed to remove existing window");
                goto enable_discards_exit;
            }
        } else {
            /* The default table uses 4K pages */
            container->pgsizes = 0x1000;
            vfio_host_win_add(container, info.dma32_window_start,
                              info.dma32_window_start +
                              info.dma32_window_size - 1,
                              0x1000);
        }
    }
    }

    vfio_kvm_device_add_group(group);

    QLIST_INIT(&container->group_list);
    QLIST_INSERT_HEAD(&space->containers, container, next);

    group->container = container;
    QLIST_INSERT_HEAD(&container->group_list, group, container_next);

    container->listener = vfio_memory_listener;

    memory_listener_register(&container->listener, container->space->as);

    if (container->error) {
        ret = -1;
        error_propagate_prepend(errp, container->error,
            "memory listener initialization failed: ");
        goto listener_release_exit;
    }

    container->initialized = true;

    return 0;
listener_release_exit:
    QLIST_REMOVE(group, container_next);
    QLIST_REMOVE(container, next);
    vfio_kvm_device_del_group(group);
    vfio_listener_release(container);

enable_discards_exit:
    vfio_ram_block_discard_disable(container, false);

free_container_exit:
    g_free(container);

close_fd_exit:
    close(fd);

put_space_exit:
    vfio_put_address_space(space);

    return ret;
}
2218
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Explicitly release the listener first before unset container,
     * since unset may destroy the backend container if it's the last
     * group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        vfio_listener_release(container);
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = container->space;
        VFIOGuestIOMMU *giommu, *tmp;
        VFIOHostDMAWindow *hostwin, *next;

        QLIST_REMOVE(container, next);

        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
            memory_region_unregister_iommu_notifier(
                    MEMORY_REGION(giommu->iommu), &giommu->n);
            QLIST_REMOVE(giommu, giommu_next);
            g_free(giommu);
        }

        QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
                           next) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
        }

        trace_vfio_disconnect_container(container->fd);
        close(container->fd);
        g_free(container);

        vfio_put_address_space(space);
    }
}

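/*
 * Look up or create the VFIOGroup for @groupid.  An existing group is
 * only returned if it is already attached to the same address space;
 * otherwise /dev/vfio/<groupid> is opened, checked for viability and
 * connected to a container for @as.
 */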
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open_old(path, O_RDWR);
    if (group->fd < 0) {
        error_setg_errno(errp, errno, "failed to open %s", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}

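/*
 * Release a group obtained with vfio_get_group().  This is a no-op
 * while the group still has devices attached; the last
 * vfio_put_base_device() must happen first.  Dropping the last group
 * also unregisters the VFIO reset handler.
 */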
void vfio_put_group(VFIOGroup *group)
{
    if (!group || !QLIST_EMPTY(&group->device_list)) {
        return;
    }

    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }
    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);
    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}

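/*
 * Fetch the device fd for @name from @group via
 * VFIO_GROUP_GET_DEVICE_FD, query VFIO_DEVICE_GET_INFO and fill in the
 * common VFIODevice fields.  Returns 0 on success, a negative value on
 * failure with @errp set.
 */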
int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev, Error **errp)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    int ret, fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return fd;
    }

    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return ret;
    }

    /*
     * Set discarding of RAM as not broken for this group if the driver knows
     * the device operates compatibly with discarding.  Setting must be
     * consistent per group, but since compatibility is really only possible
     * with mdev currently, we expect singleton groups.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return -1;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = dev_info.num_irqs;
    vbasedev->num_regions = dev_info.num_regions;
    vbasedev->flags = dev_info.flags;

    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
                          dev_info.num_irqs);

    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    return 0;
}

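/*
 * Undo vfio_get_device(): drop the device from its group's list and
 * close the device fd.  Safe to call on a device that was never added
 * to a group.
 */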
void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (!vbasedev->group) {
        return;
    }
    QLIST_REMOVE(vbasedev, next);
    vbasedev->group = NULL;
    trace_vfio_put_base_device(vbasedev->fd);
    close(vbasedev->fd);
}

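/*
 * Query VFIO_DEVICE_GET_REGION_INFO for region @index.  The kernel
 * reports the size it actually needs in info->argsz, so if the reply
 * is larger than our buffer (e.g. when capability chains are present),
 * grow the buffer and retry.  On success the caller owns *info and
 * must g_free() it.
 */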
int vfio_get_region_info(VFIODevice *vbasedev, int index,
                         struct vfio_region_info **info)
{
    size_t argsz = sizeof(struct vfio_region_info);

    *info = g_malloc0(argsz);

    (*info)->index = index;
retry:
    (*info)->argsz = argsz;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
        g_free(*info);
        *info = NULL;
        return -errno;
    }

    if ((*info)->argsz > argsz) {
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);

        goto retry;
    }

    return 0;
}

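/*
 * Scan all of the device's regions for one whose
 * VFIO_REGION_INFO_CAP_TYPE capability matches @type/@subtype
 * (device-specific regions).  On a match, *info is returned to the
 * caller, who must g_free() it; otherwise -ENODEV.
 */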
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
                             uint32_t subtype, struct vfio_region_info **info)
{
    int i;

    for (i = 0; i < vbasedev->num_regions; i++) {
        struct vfio_info_cap_header *hdr;
        struct vfio_region_info_cap_type *cap_type;

        if (vfio_get_region_info(vbasedev, i, info)) {
            continue;
        }

        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
        if (!hdr) {
            g_free(*info);
            continue;
        }

        cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);

        trace_vfio_get_dev_region(vbasedev->name, i,
                                  cap_type->type, cap_type->subtype);

        if (cap_type->type == type && cap_type->subtype == subtype) {
            return 0;
        }

        g_free(*info);
    }

    *info = NULL;
    return -ENODEV;
}

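/*
 * Return true if region @region of the device exposes capability
 * @cap_type in its region info capability chain.
 */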
bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
    struct vfio_region_info *info = NULL;
    bool ret = false;

    if (!vfio_get_region_info(vbasedev, region, &info)) {
        if (vfio_get_region_info_cap(info, cap_type)) {
            ret = true;
        }
        g_free(info);
    }

    return ret;
}

/*
 * Interfaces for IBM EEH (Enhanced Error Handling)
 */
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
    /*
     * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
     * implementation is broken if there are multiple groups in a
     * container.  The hardware works in units of Partitionable
     * Endpoints (== IOMMU groups) and the EEH operations naively
     * iterate across all groups in the container, without any logic
     * to make sure the groups have their state synchronized.  For
     * containers with multiple groups, we'd need some sort of
     * mechanism to check that the groups are in sync before
     * performing an operation, and moreover that the whole container
     * (rather than the groups themselves) needs a single state
     * tracked, which would probably be better handled in the kernel
     * anyway.  So for now, this just prevents use with multiple
     * groups.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        return false;
    }

    if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
        return false;
    }

    return true;
}

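/*
 * Issue a single EEH PE operation (VFIO_EEH_PE_OP) against the
 * container.  Refused unless the container holds exactly one group,
 * matching what the host kernel implementation can handle.
 */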
static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
{
    struct vfio_eeh_pe_op pe_op = {
        .argsz = sizeof(pe_op),
        .op = op,
    };
    int ret;

    if (!vfio_eeh_container_ok(container)) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
                     "kernel requires a container with exactly one group", op);
        return -EPERM;
    }

    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
    if (ret < 0) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
        return -errno;
    }

    return ret;
}

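/*
 * Map an AddressSpace to the single container we can do EEH operations
 * on.  Returns NULL if the space has no containers, or more than one,
 * since we cannot keep EEH state synchronized across containers.
 */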
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);
    VFIOContainer *container = NULL;

    if (QLIST_EMPTY(&space->containers)) {
        /* No containers to act on */
        goto out;
    }

    container = QLIST_FIRST(&space->containers);

    if (QLIST_NEXT(container, next)) {
        /*
         * We don't yet have logic to synchronize EEH state across
         * multiple containers
         */
        container = NULL;
        goto out;
    }

out:
    vfio_put_address_space(space);
    return container;
}

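/* Return true if EEH operations are possible for @as */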
bool vfio_eeh_as_ok(AddressSpace *as)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    return (container != NULL) && vfio_eeh_container_ok(container);
}

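/* Perform EEH PE operation @op on the (single) container behind @as */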
int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return -ENODEV;
    }
    return vfio_eeh_container_op(container, op);
}