1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21#include "qemu/osdep.h"
22#include <sys/ioctl.h>
23#ifdef CONFIG_KVM
24#include <linux/kvm.h>
25#endif
26#include <linux/vfio.h>
27
28#include "hw/vfio/vfio-common.h"
29#include "hw/vfio/vfio.h"
30#include "exec/address-spaces.h"
31#include "exec/memory.h"
32#include "exec/ram_addr.h"
33#include "hw/hw.h"
34#include "qemu/error-report.h"
35#include "qemu/main-loop.h"
36#include "qemu/range.h"
37#include "sysemu/kvm.h"
38#include "sysemu/reset.h"
39#include "sysemu/runstate.h"
40#include "trace.h"
41#include "qapi/error.h"
42#include "migration/migration.h"
43
/* Global list of all VFIO groups opened by this process. */
VFIOGroupList vfio_group_list =
    QLIST_HEAD_INITIALIZER(vfio_group_list);
/* All address spaces that currently have a VFIO container attached. */
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
48
#ifdef CONFIG_KVM
/*
 * fd of the KVM "vfio" pseudo device; used below to attach groups to KVM
 * (e.g. KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE).  -1 until the device exists.
 */
static int vfio_kvm_device_fd = -1;
#endif
59
60
61
62
63void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
64{
65 struct vfio_irq_set irq_set = {
66 .argsz = sizeof(irq_set),
67 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
68 .index = index,
69 .start = 0,
70 .count = 0,
71 };
72
73 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
74}
75
76void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
77{
78 struct vfio_irq_set irq_set = {
79 .argsz = sizeof(irq_set),
80 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
81 .index = index,
82 .start = 0,
83 .count = 1,
84 };
85
86 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
87}
88
89void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
90{
91 struct vfio_irq_set irq_set = {
92 .argsz = sizeof(irq_set),
93 .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
94 .index = index,
95 .start = 0,
96 .count = 1,
97 };
98
99 ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
100}
101
/* Human-readable name of a VFIO_IRQ_SET_ACTION_* flag, for error messages. */
static inline const char *action_to_str(int action)
{
    if (action == VFIO_IRQ_SET_ACTION_MASK) {
        return "MASK";
    }
    if (action == VFIO_IRQ_SET_ACTION_UNMASK) {
        return "UNMASK";
    }
    if (action == VFIO_IRQ_SET_ACTION_TRIGGER) {
        return "TRIGGER";
    }
    return "UNKNOWN ACTION";
}
115
116static const char *index_to_str(VFIODevice *vbasedev, int index)
117{
118 if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
119 return NULL;
120 }
121
122 switch (index) {
123 case VFIO_PCI_INTX_IRQ_INDEX:
124 return "INTX";
125 case VFIO_PCI_MSI_IRQ_INDEX:
126 return "MSI";
127 case VFIO_PCI_MSIX_IRQ_INDEX:
128 return "MSIX";
129 case VFIO_PCI_ERR_IRQ_INDEX:
130 return "ERR";
131 case VFIO_PCI_REQ_IRQ_INDEX:
132 return "REQ";
133 default:
134 return NULL;
135 }
136}
137
138static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
139{
140 switch (container->iommu_type) {
141 case VFIO_TYPE1v2_IOMMU:
142 case VFIO_TYPE1_IOMMU:
143
144
145
146 return ram_block_uncoordinated_discard_disable(state);
147 default:
148
149
150
151
152
153
154
155
156
157 return ram_block_discard_disable(state);
158 }
159}
160
/*
 * Wire up (fd >= 0) or tear down (fd < 0) eventfd signaling for a single
 * device interrupt (index/subindex) with the given VFIO_IRQ_SET_ACTION_*.
 * Returns 0 on success or a negative errno; on failure *errp is set with
 * a message naming the interrupt.
 */
int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                           int action, int fd, Error **errp)
{
    struct vfio_irq_set *irq_set;
    int argsz, ret = 0;
    const char *name;
    int32_t *pfd;

    /* Variable-sized ioctl argument: header plus a single eventfd. */
    argsz = sizeof(*irq_set) + sizeof(*pfd);

    irq_set = g_malloc0(argsz);
    irq_set->argsz = argsz;
    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
    irq_set->index = index;
    irq_set->start = subindex;
    irq_set->count = 1;
    pfd = (int32_t *)&irq_set->data;
    *pfd = fd;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
        ret = -errno;
    }
    g_free(irq_set);

    if (!ret) {
        return 0;
    }

    error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");

    /* Prefix the error with a human-readable interrupt name when known. */
    name = index_to_str(vbasedev, index);
    if (name) {
        error_prepend(errp, "%s-%d: ", name, subindex);
    } else {
        error_prepend(errp, "index %d-%d: ", index, subindex);
    }
    error_prepend(errp,
                  "Failed to %s %s eventfd signaling for interrupt ",
                  fd < 0 ? "tear down" : "set up", action_to_str(action));
    return ret;
}
202
203
204
205
/*
 * MemoryRegion write callback for a VFIO device region: convert the
 * access to little-endian wire format and pwrite() it through the device
 * fd at the region's file offset.  Errors are reported but cannot be
 * propagated to the guest (MMIO writes have no error path).
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    case 8:
        buf.qword = cpu_to_le64(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %u bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * NOTE(review): the device's EOI hook is invoked after every region
     * access — presumably a region access is taken as servicing a pending
     * INTx so it can be re-enabled; confirm against the vfio_eoi
     * implementation in vfio-pci.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}
255
/*
 * MemoryRegion read callback for a VFIO device region: pread() from the
 * device fd at the region's file offset and convert from little-endian.
 * Returns all-ones on I/O error, mimicking a failed bus access.
 */
uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    case 8:
        data = le64_to_cpu(buf.qword);
        break;
    default:
        hw_error("vfio: unsupported read size, %u bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same EOI-on-access behavior as vfio_region_write(). */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}
300
/*
 * MemoryRegionOps used for device regions accessed via read/write on the
 * region fd; the device interface itself is little-endian.
 */
const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};
314
315
316
317
318
/*
 * Return true when VFIO migration is possible: at least one VFIO group
 * exists and no device has registered a migration blocker.
 */
bool vfio_mig_active(void)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    if (QLIST_EMPTY(&vfio_group_list)) {
        return false;
    }

    QLIST_FOREACH(group, &vfio_group_list, next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->migration_blocker) {
                /* Any single blocked device makes migration impossible. */
                return false;
            }
        }
    }
    return true;
}
337
/*
 * Return true when a migration is in progress and every device in the
 * container can account for dirty pages: each has migration support, and
 * none is RUNNING with pre-copy dirty page tracking explicitly disabled.
 */
static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;
    MigrationState *ms = migrate_get_current();

    if (!migration_is_setup_or_active(ms->state)) {
        return false;
    }

    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            VFIOMigration *migration = vbasedev->migration;

            if (!migration) {
                return false;
            }

            if ((vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF)
                && (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
                return false;
            }
        }
    }
    return true;
}
364
365static bool vfio_devices_all_running_and_saving(VFIOContainer *container)
366{
367 VFIOGroup *group;
368 VFIODevice *vbasedev;
369 MigrationState *ms = migrate_get_current();
370
371 if (!migration_is_setup_or_active(ms->state)) {
372 return false;
373 }
374
375 QLIST_FOREACH(group, &container->group_list, container_next) {
376 QLIST_FOREACH(vbasedev, &group->device_list, next) {
377 VFIOMigration *migration = vbasedev->migration;
378
379 if (!migration) {
380 return false;
381 }
382
383 if ((migration->device_state & VFIO_DEVICE_STATE_SAVING) &&
384 (migration->device_state & VFIO_DEVICE_STATE_RUNNING)) {
385 continue;
386 } else {
387 return false;
388 }
389 }
390 }
391 return true;
392}
393
/*
 * Unmap an IOVA range while atomically retrieving the dirty bitmap the
 * kernel tracked for it, and merge that bitmap into QEMU's dirty log so
 * writes up to the unmap are not lost during migration.  Returns the
 * ioctl result (0 on success) or a negative errno for setup failures.
 */
static int vfio_dma_unmap_bitmap(VFIOContainer *container,
                                 hwaddr iova, ram_addr_t size,
                                 IOMMUTLBEntry *iotlb)
{
    struct vfio_iommu_type1_dma_unmap *unmap;
    struct vfio_bitmap *bitmap;
    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size;
    int ret;

    /* Variable-sized ioctl argument: unmap header plus bitmap descriptor. */
    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));

    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
    unmap->iova = iova;
    unmap->size = size;
    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
    bitmap = (struct vfio_bitmap *)&unmap->data;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() below consumes one bit per
     * qemu_real_host_page_size page, so request the kernel bitmap at that
     * granularity; size is rounded up to whole __u64 words.
     */
    bitmap->pgsize = qemu_real_host_page_size;
    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                   BITS_PER_BYTE;

    if (bitmap->size > container->max_dirty_bitmap_size) {
        /* Kernel-imposed cap on a single bitmap request. */
        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
                     (uint64_t)bitmap->size);
        ret = -E2BIG;
        goto unmap_exit;
    }

    bitmap->data = g_try_malloc0(bitmap->size);
    if (!bitmap->data) {
        ret = -ENOMEM;
        goto unmap_exit;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
    if (!ret) {
        cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
                                               iotlb->translated_addr, pages);
    } else {
        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
    }

    g_free(bitmap->data);
unmap_exit:
    g_free(unmap);
    return ret;
}
447
448
449
450
/*
 * DMA - unmap an IOVA range from the container.  While a migration is
 * saving, delegate to vfio_dma_unmap_bitmap() so dirty state is captured.
 * Returns 0 on success or a negative errno.
 */
static int vfio_dma_unmap(VFIOContainer *container,
                          hwaddr iova, ram_addr_t size,
                          IOMMUTLBEntry *iotlb)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .flags = 0,
        .iova = iova,
        .size = size,
    };

    if (iotlb && container->dirty_pages_supported &&
        vfio_devices_all_running_and_saving(container)) {
        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
    }

    while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
        /*
         * Workaround for a type1 v2 kernel quirk: a request whose
         * iova + size wraps to exactly 0 (i.e. spans to the end of the
         * 64-bit address space) is rejected with EINVAL.  Shave off one
         * minimum-sized IOMMU page and retry; the lost page at the very
         * top cannot be mapped anyway.
         */
        if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
            container->iommu_type == VFIO_TYPE1v2_IOMMU) {
            trace_vfio_dma_unmap_overflow_workaround();
            unmap.size -= 1ULL << ctz64(container->pgsizes);
            continue;
        }
        error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
        return -errno;
    }

    return 0;
}
492
/*
 * Map [iova, iova + size) to host address vaddr in the container's IOMMU.
 * Read access is always granted; write access unless 'readonly'.
 * Returns 0 on success or -errno.
 */
static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
                        ram_addr_t size, void *vaddr, bool readonly)
{
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ,
        .vaddr = (__u64)(uintptr_t)vaddr,
        .iova = iova,
        .size = size,
    };

    if (!readonly) {
        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
    }

    /*
     * On EBUSY (range already mapped), retry once after unmapping the
     * same range — presumably covering stale mappings left behind by an
     * earlier pass over the same region.
     */
    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
         ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
        return 0;
    }

    error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
    return -errno;
}
522
523static void vfio_host_win_add(VFIOContainer *container,
524 hwaddr min_iova, hwaddr max_iova,
525 uint64_t iova_pgsizes)
526{
527 VFIOHostDMAWindow *hostwin;
528
529 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
530 if (ranges_overlap(hostwin->min_iova,
531 hostwin->max_iova - hostwin->min_iova + 1,
532 min_iova,
533 max_iova - min_iova + 1)) {
534 hw_error("%s: Overlapped IOMMU are not enabled", __func__);
535 }
536 }
537
538 hostwin = g_malloc0(sizeof(*hostwin));
539
540 hostwin->min_iova = min_iova;
541 hostwin->max_iova = max_iova;
542 hostwin->iova_pgsizes = iova_pgsizes;
543 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
544}
545
546static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
547 hwaddr max_iova)
548{
549 VFIOHostDMAWindow *hostwin;
550
551 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
552 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
553 QLIST_REMOVE(hostwin, hostwin_next);
554 g_free(hostwin);
555 return 0;
556 }
557 }
558
559 return -1;
560}
561
562static bool vfio_listener_skipped_section(MemoryRegionSection *section)
563{
564 return (!memory_region_is_ram(section->mr) &&
565 !memory_region_is_iommu(section->mr)) ||
566
567
568
569
570
571
572 section->offset_within_address_space & (1ULL << 63);
573}
574
575
/*
 * Resolve an IOMMU TLB entry's translated address into a host virtual
 * address (*vaddr), a RAM address (*ram_addr) and/or a read-only flag —
 * each output is optional (pass NULL to skip).  Returns false if the
 * target is not RAM or is currently discarded (e.g. unplugged
 * virtio-mem), in which case nothing is written to the outputs.
 */
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
                               ram_addr_t *ram_addr, bool *read_only)
{
    MemoryRegion *mr;
    hwaddr xlat;
    hwaddr len = iotlb->addr_mask + 1;
    bool writable = iotlb->perm & IOMMU_WO;

    /*
     * The TLB entry only covers translation up to the IOMMU's immediate
     * target; translate further through the system address space to find
     * the backing memory region.
     */
    mr = address_space_translate(&address_space_memory,
                                 iotlb->translated_addr,
                                 &xlat, &len, writable,
                                 MEMTXATTRS_UNSPECIFIED);
    if (!memory_region_is_ram(mr)) {
        error_report("iommu map to non memory area %"HWADDR_PRIx"",
                     xlat);
        return false;
    } else if (memory_region_has_ram_discard_manager(mr)) {
        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr);
        MemoryRegionSection tmp = {
            .mr = mr,
            .offset_within_region = xlat,
            .size = int128_make64(len),
        };

        /*
         * Refuse to map memory that is currently discarded: pinning it
         * would conflict with the RamDiscardManager's view (e.g. a
         * malicious guest mapping unplugged virtio-mem blocks).
         */
        if (!ram_discard_manager_is_populated(rdm, &tmp)) {
            error_report("iommu map to discarded memory (e.g., unplugged via"
                         " virtio-mem): %"HWADDR_PRIx"",
                         iotlb->translated_addr);
            return false;
        }

        /*
         * Populated coordinated-discard RAM can be mapped, but a guest
         * directing its vIOMMU at it can force pinning of more memory
         * than intended — warn once so the admin can cap RLIMIT_MEMLOCK.
         */
        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
                         " RAM (e.g., virtio-mem) works, however, malicious"
                         " guests can trigger pinning of more memory than"
                         " intended via an IOMMU. It's possible to mitigate "
                         " by setting/adjusting RLIMIT_MEMLOCK.");
    }

    /*
     * The translated length must still cover the whole TLB entry;
     * otherwise the IOMMU granularity doesn't fit the target AS.
     */
    if (len & iotlb->addr_mask) {
        error_report("iommu has granularity incompatible with target AS");
        return false;
    }

    if (vaddr) {
        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
    }

    if (ram_addr) {
        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
    }

    if (read_only) {
        *read_only = !writable || mr->readonly;
    }

    return true;
}
660
/*
 * IOMMU notifier: mirror guest IOMMU map/unmap events into the VFIO
 * container.  MAP events translate the target and call vfio_dma_map();
 * UNMAP events call vfio_dma_unmap() (passing the iotlb so dirty state
 * can be harvested during migration).
 */
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    void *vaddr;
    int ret;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        return;
    }

    /* Hold the RCU read lock while vaddr derived from the memory map is used. */
    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        bool read_only;

        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
            goto out;
        }

        /*
         * vaddr is only guaranteed valid until rcu_read_unlock(); once
         * vfio_dma_map() has established the mapping, the kernel keeps
         * the pages pinned regardless.
         */
        ret = vfio_dma_map(container, iova,
                           iotlb->addr_mask + 1, vaddr,
                           read_only);
        if (ret) {
            error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx", %p) = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, vaddr, ret);
        }
    } else {
        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
out:
    rcu_read_unlock();
}
714
715static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
716 MemoryRegionSection *section)
717{
718 VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
719 listener);
720 const hwaddr size = int128_get64(section->size);
721 const hwaddr iova = section->offset_within_address_space;
722 int ret;
723
724
725 ret = vfio_dma_unmap(vrdl->container, iova, size , NULL);
726 if (ret) {
727 error_report("%s: vfio_dma_unmap() failed: %s", __func__,
728 strerror(-ret));
729 }
730}
731
/*
 * RamDiscardListener populate hook: map newly plugged memory into VFIO.
 * Mapping is done in chunks of the discard granularity so that a later
 * partial discard can be unmapped at the same granularity.  Returns 0 or
 * the first vfio_dma_map() error (after rolling back this section).
 */
static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
                                            MemoryRegionSection *section)
{
    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
                                                listener);
    const hwaddr end = section->offset_within_region +
                       int128_get64(section->size);
    hwaddr start, next, iova;
    void *vaddr;
    int ret;

    /*
     * Iterate granularity-aligned chunks; the first/last chunk may be
     * shorter when the section is not granularity-aligned.
     */
    for (start = section->offset_within_region; start < end; start = next) {
        next = ROUND_UP(start + 1, vrdl->granularity);
        next = MIN(next, end);

        iova = start - section->offset_within_region +
               section->offset_within_address_space;
        vaddr = memory_region_get_ram_ptr(section->mr) + start;

        ret = vfio_dma_map(vrdl->container, iova, next - start,
                           vaddr, section->readonly);
        if (ret) {
            /* Undo the chunks mapped so far for this section. */
            vfio_ram_discard_notify_discard(rdl, section);
            return ret;
        }
    }
    return 0;
}
765
766static void vfio_register_ram_discard_listener(VFIOContainer *container,
767 MemoryRegionSection *section)
768{
769 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
770 VFIORamDiscardListener *vrdl;
771
772
773 g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
774 g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
775 TARGET_PAGE_SIZE));
776 g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
777
778 vrdl = g_new0(VFIORamDiscardListener, 1);
779 vrdl->container = container;
780 vrdl->mr = section->mr;
781 vrdl->offset_within_address_space = section->offset_within_address_space;
782 vrdl->size = int128_get64(section->size);
783 vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
784 section->mr);
785
786 g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
787 g_assert(container->pgsizes &&
788 vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
789
790 ram_discard_listener_init(&vrdl->listener,
791 vfio_ram_discard_notify_populate,
792 vfio_ram_discard_notify_discard, true);
793 ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
794 QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809 if (container->dma_max_mappings) {
810 unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
811
812#ifdef CONFIG_KVM
813 if (kvm_enabled()) {
814 max_memslots = kvm_get_max_memslots();
815 }
816#endif
817
818 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
819 hwaddr start, end;
820
821 start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
822 vrdl->granularity);
823 end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
824 vrdl->granularity);
825 vrdl_mappings += (end - start) / vrdl->granularity;
826 vrdl_count++;
827 }
828
829 if (vrdl_mappings + max_memslots - vrdl_count >
830 container->dma_max_mappings) {
831 warn_report("%s: possibly running out of DMA mappings. E.g., try"
832 " increasing the 'block-size' of virtio-mem devies."
833 " Maximum possible DMA mappings: %d, Maximum possible"
834 " memslots: %d", __func__, container->dma_max_mappings,
835 max_memslots);
836 }
837 }
838}
839
/*
 * Tear down the RamDiscardListener registered by
 * vfio_register_ram_discard_listener() for this exact section.  Aborts
 * via hw_error() if no matching listener exists — that would indicate an
 * unbalanced region_add/region_del.
 */
static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
                                                 MemoryRegionSection *section)
{
    RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
    VFIORamDiscardListener *vrdl = NULL;

    /* vrdl is NULL after the loop when no entry matched. */
    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
        if (vrdl->mr == section->mr &&
            vrdl->offset_within_address_space ==
            section->offset_within_address_space) {
            break;
        }
    }

    if (!vrdl) {
        hw_error("vfio: Trying to unregister missing RAM discard listener");
    }

    ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
    QLIST_REMOVE(vrdl, next);
    g_free(vrdl);
}
862
/*
 * MemoryListener add hook: make the section DMA-able by the container.
 * Plain RAM is mapped directly; IOMMU regions get a map notifier;
 * RamDiscardManager-managed RAM is mapped via populate notifications;
 * sPAPR TCE v2 containers additionally create a host DMA window first.
 */
static void vfio_listener_region_add(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    void *vaddr;
    int ret;
    VFIOHostDMAWindow *hostwin;
    bool hostwin_found;
    Error *err = NULL;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_add_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    /* IOVA and HVA must share sub-host-page alignment to be mappable. */
    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    /* Clamp to the host-page-aligned sub-range [iova, llend). */
    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));

    if (int128_ge(int128_make64(iova), llend)) {
        /* Nothing left after alignment. */
        return;
    }
    end = int128_get64(int128_sub(llend, int128_one()));

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        hwaddr pgsize = 0;

        /* For sPAPR TCE v2, create a DMA window; refuse any overlap. */
        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
            if (ranges_overlap(hostwin->min_iova,
                               hostwin->max_iova - hostwin->min_iova + 1,
                               section->offset_within_address_space,
                               int128_get64(section->size))) {
                error_setg(&err,
                    "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
                    "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
                    section->offset_within_address_space,
                    section->offset_within_address_space +
                        int128_get64(section->size) - 1,
                    hostwin->min_iova, hostwin->max_iova);
                goto fail;
            }
        }

        ret = vfio_spapr_create_window(container, section, &pgsize);
        if (ret) {
            error_setg_errno(&err, -ret, "Failed to create SPAPR window");
            goto fail;
        }

        vfio_host_win_add(container, section->offset_within_address_space,
                          section->offset_within_address_space +
                          int128_get64(section->size) - 1, pgsize);
#ifdef CONFIG_KVM
        if (kvm_enabled()) {
            VFIOGroup *group;
            IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
            struct kvm_vfio_spapr_tce param;
            struct kvm_device_attr attr = {
                .group = KVM_DEV_VFIO_GROUP,
                .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
                .addr = (uint64_t)(unsigned long)&param,
            };

            /* Let KVM accelerate TCE updates for every group in container. */
            if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
                                              &param.tablefd)) {
                QLIST_FOREACH(group, &container->group_list, container_next) {
                    param.groupfd = group->fd;
                    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
                        error_report("vfio: failed to setup fd %d "
                                     "for a group with fd %d: %s",
                                     param.tablefd, param.groupfd,
                                     strerror(errno));
                        return;
                    }
                    trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
                }
            }
        }
#endif
    }

    /* The aligned range must fit inside one known host DMA window. */
    hostwin_found = false;
    QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
        if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
            hostwin_found = true;
            break;
        }
    }

    if (!hostwin_found) {
        error_setg(&err, "Container %p can't map guest IOVA region"
                   " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
        goto fail;
    }

    memory_region_ref(section->mr);

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;
        IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
        int iommu_idx;

        trace_vfio_listener_region_add_iommu(iova, end);

        /*
         * Track the guest IOMMU and register a notifier so future
         * guest-driven map/unmap events are mirrored into the container;
         * then replay existing mappings.
         */
        giommu = g_malloc0(sizeof(*giommu));
        giommu->iommu = iommu_mr;
        giommu->iommu_offset = section->offset_within_address_space -
                               section->offset_within_region;
        giommu->container = container;
        llend = int128_add(int128_make64(section->offset_within_region),
                           section->size);
        llend = int128_sub(llend, int128_one());
        iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
                                                       MEMTXATTRS_UNSPECIFIED);
        iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
                            IOMMU_NOTIFIER_IOTLB_EVENTS,
                            section->offset_within_region,
                            int128_get64(llend),
                            iommu_idx);

        ret = memory_region_iommu_set_page_size_mask(giommu->iommu,
                                                     container->pgsizes,
                                                     &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }

        ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
                                                    &err);
        if (ret) {
            g_free(giommu);
            goto fail;
        }
        QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
        memory_region_iommu_replay(giommu->iommu, &giommu->n);

        return;
    }

    /*
     * RAM managed by a RamDiscardManager (virtio-mem) is mapped lazily
     * through populate/discard notifications rather than here.
     */
    if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_register_ram_discard_listener(container, section);
        return;
    }

    /* Plain RAM: map the whole aligned range right away. */
    vaddr = memory_region_get_ram_ptr(section->mr) +
            section->offset_within_region +
            (iova - section->offset_within_address_space);

    trace_vfio_listener_region_add_ram(iova, end, vaddr);

    llsize = int128_sub(llend, int128_make64(iova));

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;

        /* ram_device (e.g. a device BAR) smaller than an IOMMU page:
         * skip the mapping rather than fail hard. */
        if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
            trace_vfio_listener_region_add_no_dma_map(
                memory_region_name(section->mr),
                section->offset_within_address_space,
                int128_getlo(section->size),
                pgmask + 1);
            return;
        }
    }

    ret = vfio_dma_map(container, iova, int128_get64(llsize),
                       vaddr, section->readonly);
    if (ret) {
        error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                   "0x%"HWADDR_PRIx", %p) = %d (%m)",
                   container, iova, int128_get64(llsize), vaddr, ret);
        if (memory_region_is_ram_device(section->mr)) {
            /* ram_device mapping failures are non-fatal (p2p may break). */
            error_report_err(err);
            return;
        }
        goto fail;
    }

    return;

fail:
    if (memory_region_is_ram_device(section->mr)) {
        error_report("failed to vfio_dma_map. pci p2p may not work");
        return;
    }
    /*
     * During container setup, stash the first error so the caller can
     * fail cleanly; once the container is live a mapping failure leaves
     * guest DMA inconsistent, so abort.
     */
    if (!container->initialized) {
        if (!container->error) {
            error_propagate_prepend(&container->error, err,
                                    "Region %s: ",
                                    memory_region_name(section->mr));
        } else {
            error_free(err);
        }
    } else {
        error_report_err(err);
        hw_error("vfio: DMA mapping failed, unable to continue");
    }
}
1094
/*
 * MemoryListener delete hook: undo what vfio_listener_region_add() did —
 * remove IOMMU notifiers, unregister discard listeners, unmap DMA, drop
 * the memory region reference, and (sPAPR v2) remove the DMA window.
 */
static void vfio_listener_region_del(MemoryListener *listener,
                                     MemoryRegionSection *section)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    hwaddr iova, end;
    Int128 llend, llsize;
    int ret;
    bool try_unmap = true;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_del_skip(
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return;
    }

    if (unlikely((section->offset_within_address_space &
                  ~qemu_real_host_page_mask) !=
                 (section->offset_within_region & ~qemu_real_host_page_mask))) {
        error_report("%s received unaligned region", __func__);
        return;
    }

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        /* Drop the notifier registered in region_add for this range. */
        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                memory_region_unregister_iommu_notifier(section->mr,
                                                        &giommu->n);
                QLIST_REMOVE(giommu, giommu_next);
                g_free(giommu);
                break;
            }
        }

        /*
         * NOTE(review): no per-entry unmap here — the single big unmap
         * below is assumed to remove any mappings that were propagated
         * from this guest IOMMU into the container; confirm this holds
         * for all supported backends.
         */
    }

    /* Same host-page alignment as region_add so unmap matches map. */
    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
    llend = int128_make64(section->offset_within_address_space);
    llend = int128_add(llend, section->size);
    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask));

    if (int128_ge(int128_make64(iova), llend)) {
        return;
    }
    end = int128_get64(int128_sub(llend, int128_one()));

    llsize = int128_sub(llend, int128_make64(iova));

    trace_vfio_listener_region_del(iova, end);

    if (memory_region_is_ram_device(section->mr)) {
        hwaddr pgmask;
        VFIOHostDMAWindow *hostwin;
        bool hostwin_found = false;

        QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
            if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
                hostwin_found = true;
                break;
            }
        }
        assert(hostwin_found);

        /* Only unmap if region_add actually mapped (page-aligned). */
        pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
        try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        vfio_unregister_ram_discard_listener(container, section);
        /* Unregistering already triggered discard (unmap) notifications. */
        try_unmap = false;
    }

    if (try_unmap) {
        if (int128_eq(llsize, int128_2_64())) {
            /* The unmap ioctl can't express a full 2^64 span; split it. */
            llsize = int128_rshift(llsize, 1);
            ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
            if (ret) {
                error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx") = %d (%m)",
                             container, iova, int128_get64(llsize), ret);
            }
            iova += int128_get64(llsize);
        }
        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
        if (ret) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova, int128_get64(llsize), ret);
        }
    }

    memory_region_unref(section->mr);

    if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
        vfio_spapr_remove_window(container,
                                 section->offset_within_address_space);
        if (vfio_host_win_del(container,
                              section->offset_within_address_space,
                              section->offset_within_address_space +
                              int128_get64(section->size) - 1) < 0) {
            hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
                     __func__, section->offset_within_address_space);
        }
    }
}
1211
1212static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
1213{
1214 int ret;
1215 struct vfio_iommu_type1_dirty_bitmap dirty = {
1216 .argsz = sizeof(dirty),
1217 };
1218
1219 if (start) {
1220 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START;
1221 } else {
1222 dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
1223 }
1224
1225 ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
1226 if (ret) {
1227 error_report("Failed to set dirty tracking flag 0x%x errno: %d",
1228 dirty.flags, errno);
1229 }
1230}
1231
1232static void vfio_listener_log_global_start(MemoryListener *listener)
1233{
1234 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1235
1236 vfio_set_dirty_page_tracking(container, true);
1237}
1238
1239static void vfio_listener_log_global_stop(MemoryListener *listener)
1240{
1241 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1242
1243 vfio_set_dirty_page_tracking(container, false);
1244}
1245
/*
 * Fetch the kernel's dirty bitmap for [iova, iova + size) and merge it
 * into QEMU's dirty log starting at ram_addr.  Returns the ioctl result
 * (0 on success) or -ENOMEM if the bitmap could not be allocated.
 */
static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
                                 uint64_t size, ram_addr_t ram_addr)
{
    struct vfio_iommu_type1_dirty_bitmap *dbitmap;
    struct vfio_iommu_type1_dirty_bitmap_get *range;
    uint64_t pages;
    int ret;

    /* Variable-sized ioctl argument: header plus a single range request. */
    dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));

    dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
    dbitmap->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
    range = (struct vfio_iommu_type1_dirty_bitmap_get *)&dbitmap->data;
    range->iova = iova;
    range->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() below consumes one bit per
     * qemu_real_host_page_size page, so request the bitmap at that
     * granularity; size is rounded up to whole __u64 words.
     */
    range->bitmap.pgsize = qemu_real_host_page_size;

    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size;
    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
                         BITS_PER_BYTE;
    range->bitmap.data = g_try_malloc0(range->bitmap.size);
    if (!range->bitmap.data) {
        ret = -ENOMEM;
        goto err_out;
    }

    ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
    if (ret) {
        error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
                     " size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
                     (uint64_t)range->size, errno);
        goto err_out;
    }

    cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
                                           ram_addr, pages);

    trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
                                range->bitmap.size, ram_addr);
err_out:
    /* g_free(NULL) is a no-op, so the ENOMEM path is safe here too. */
    g_free(range->bitmap.data);
    g_free(dbitmap);

    return ret;
}
1297
/* Pairs an IOMMU map notifier with the guest IOMMU it replays mappings for,
 * so vfio_iommu_map_dirty_notify() can recover the container. */
typedef struct {
    IOMMUNotifier n;
    VFIOGuestIOMMU *giommu;
} vfio_giommu_dirty_notifier;
1302
/*
 * IOMMU notifier callback used during dirty-bitmap sync: for each mapped
 * IOTLB entry, translate it to a RAM address and pull the kernel's dirty
 * bitmap for that IOVA range into QEMU's dirty tracking.
 */
static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    vfio_giommu_dirty_notifier *gdn = container_of(n,
                                                vfio_giommu_dirty_notifier, n);
    VFIOGuestIOMMU *giommu = gdn->giommu;
    VFIOContainer *container = giommu->container;
    /* IOVA in container address space = guest IOVA + window offset. */
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    ram_addr_t translated_addr;

    trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        return;
    }

    /* Hold the RCU read lock while the translation result is in use. */
    rcu_read_lock();
    if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
        int ret;

        ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
                                    translated_addr);
        if (ret) {
            error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%m)",
                         container, iova,
                         iotlb->addr_mask + 1, ret);
        }
    }
    rcu_read_unlock();
}
1335
1336static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
1337 void *opaque)
1338{
1339 const hwaddr size = int128_get64(section->size);
1340 const hwaddr iova = section->offset_within_address_space;
1341 const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) +
1342 section->offset_within_region;
1343 VFIORamDiscardListener *vrdl = opaque;
1344
1345
1346
1347
1348
1349 return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr);
1350}
1351
1352static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
1353 MemoryRegionSection *section)
1354{
1355 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
1356 VFIORamDiscardListener *vrdl = NULL;
1357
1358 QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
1359 if (vrdl->mr == section->mr &&
1360 vrdl->offset_within_address_space ==
1361 section->offset_within_address_space) {
1362 break;
1363 }
1364 }
1365
1366 if (!vrdl) {
1367 hw_error("vfio: Trying to sync missing RAM discard listener");
1368 }
1369
1370
1371
1372
1373
1374 return ram_discard_manager_replay_populated(rdm, section,
1375 vfio_ram_discard_get_dirty_bitmap,
1376 &vrdl);
1377}
1378
/*
 * Sync QEMU's dirty memory tracking with the kernel for one section.
 * Dispatches on the section type: vIOMMU regions replay their mappings
 * through a dirty notifier, RamDiscardManager-backed regions replay only
 * populated parts, and plain RAM is queried directly.
 */
static int vfio_sync_dirty_bitmap(VFIOContainer *container,
                                  MemoryRegionSection *section)
{
    ram_addr_t ram_addr;

    if (memory_region_is_iommu(section->mr)) {
        VFIOGuestIOMMU *giommu;

        /* Find the guest IOMMU registered for exactly this section. */
        QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
            if (MEMORY_REGION(giommu->iommu) == section->mr &&
                giommu->n.start == section->offset_within_region) {
                Int128 llend;
                vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
                int idx = memory_region_iommu_attrs_to_index(giommu->iommu,
                                                       MEMTXATTRS_UNSPECIFIED);

                /* Inclusive end of the section within the region. */
                llend = int128_add(int128_make64(section->offset_within_region),
                                   section->size);
                llend = int128_sub(llend, int128_one());

                iommu_notifier_init(&gdn.n,
                                    vfio_iommu_map_dirty_notify,
                                    IOMMU_NOTIFIER_MAP,
                                    section->offset_within_region,
                                    int128_get64(llend),
                                    idx);
                /* Replay current mappings into the dirty notifier. */
                memory_region_iommu_replay(giommu->iommu, &gdn.n);
                break;
            }
        }
        return 0;
    } else if (memory_region_has_ram_discard_manager(section->mr)) {
        return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
    }

    ram_addr = memory_region_get_ram_addr(section->mr) +
               section->offset_within_region;

    return vfio_get_dirty_bitmap(container,
                   REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
                   int128_get64(section->size), ram_addr);
}
1421
1422static void vfio_listener_log_sync(MemoryListener *listener,
1423 MemoryRegionSection *section)
1424{
1425 VFIOContainer *container = container_of(listener, VFIOContainer, listener);
1426
1427 if (vfio_listener_skipped_section(section) ||
1428 !container->dirty_pages_supported) {
1429 return;
1430 }
1431
1432 if (vfio_devices_all_dirty_tracking(container)) {
1433 vfio_sync_dirty_bitmap(container, section);
1434 }
1435}
1436
/* Per-container listener mirroring guest memory-map and dirty-log events
 * into VFIO DMA mappings and dirty-bitmap syncs. */
static const MemoryListener vfio_memory_listener = {
    .region_add = vfio_listener_region_add,
    .region_del = vfio_listener_region_del,
    .log_global_start = vfio_listener_log_global_start,
    .log_global_stop = vfio_listener_log_global_stop,
    .log_sync = vfio_listener_log_sync,
};
1444
1445static void vfio_listener_release(VFIOContainer *container)
1446{
1447 memory_listener_unregister(&container->listener);
1448 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
1449 memory_listener_unregister(&container->prereg_listener);
1450 }
1451}
1452
1453static struct vfio_info_cap_header *
1454vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
1455{
1456 struct vfio_info_cap_header *hdr;
1457
1458 for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1459 if (hdr->id == id) {
1460 return hdr;
1461 }
1462 }
1463
1464 return NULL;
1465}
1466
1467struct vfio_info_cap_header *
1468vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
1469{
1470 if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
1471 return NULL;
1472 }
1473
1474 return vfio_get_cap((void *)info, info->cap_offset, id);
1475}
1476
1477static struct vfio_info_cap_header *
1478vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1479{
1480 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1481 return NULL;
1482 }
1483
1484 return vfio_get_cap((void *)info, info->cap_offset, id);
1485}
1486
1487struct vfio_info_cap_header *
1488vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
1489{
1490 if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
1491 return NULL;
1492 }
1493
1494 return vfio_get_cap((void *)info, info->cap_offset, id);
1495}
1496
1497bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
1498 unsigned int *avail)
1499{
1500 struct vfio_info_cap_header *hdr;
1501 struct vfio_iommu_type1_info_dma_avail *cap;
1502
1503
1504 hdr = vfio_get_iommu_type1_info_cap(info,
1505 VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
1506 if (hdr == NULL) {
1507 return false;
1508 }
1509
1510 if (avail != NULL) {
1511 cap = (void *) hdr;
1512 *avail = cap->avail;
1513 }
1514
1515 return true;
1516}
1517
1518static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
1519 struct vfio_region_info *info)
1520{
1521 struct vfio_info_cap_header *hdr;
1522 struct vfio_region_info_cap_sparse_mmap *sparse;
1523 int i, j;
1524
1525 hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
1526 if (!hdr) {
1527 return -ENODEV;
1528 }
1529
1530 sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
1531
1532 trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
1533 region->nr, sparse->nr_areas);
1534
1535 region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
1536
1537 for (i = 0, j = 0; i < sparse->nr_areas; i++) {
1538 trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
1539 sparse->areas[i].offset +
1540 sparse->areas[i].size);
1541
1542 if (sparse->areas[i].size) {
1543 region->mmaps[j].offset = sparse->areas[i].offset;
1544 region->mmaps[j].size = sparse->areas[i].size;
1545 j++;
1546 }
1547 }
1548
1549 region->nr_mmaps = j;
1550 region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
1551
1552 return 0;
1553}
1554
1555int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
1556 int index, const char *name)
1557{
1558 struct vfio_region_info *info;
1559 int ret;
1560
1561 ret = vfio_get_region_info(vbasedev, index, &info);
1562 if (ret) {
1563 return ret;
1564 }
1565
1566 region->vbasedev = vbasedev;
1567 region->flags = info->flags;
1568 region->size = info->size;
1569 region->fd_offset = info->offset;
1570 region->nr = index;
1571
1572 if (region->size) {
1573 region->mem = g_new0(MemoryRegion, 1);
1574 memory_region_init_io(region->mem, obj, &vfio_region_ops,
1575 region, name, region->size);
1576
1577 if (!vbasedev->no_mmap &&
1578 region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
1579
1580 ret = vfio_setup_region_sparse_mmaps(region, info);
1581
1582 if (ret) {
1583 region->nr_mmaps = 1;
1584 region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
1585 region->mmaps[0].offset = 0;
1586 region->mmaps[0].size = region->size;
1587 }
1588 }
1589 }
1590
1591 g_free(info);
1592
1593 trace_vfio_region_setup(vbasedev->name, index, name,
1594 region->flags, region->fd_offset, region->size);
1595 return 0;
1596}
1597
1598static void vfio_subregion_unmap(VFIORegion *region, int index)
1599{
1600 trace_vfio_region_unmap(memory_region_name(®ion->mmaps[index].mem),
1601 region->mmaps[index].offset,
1602 region->mmaps[index].offset +
1603 region->mmaps[index].size - 1);
1604 memory_region_del_subregion(region->mem, ®ion->mmaps[index].mem);
1605 munmap(region->mmaps[index].mmap, region->mmaps[index].size);
1606 object_unparent(OBJECT(®ion->mmaps[index].mem));
1607 region->mmaps[index].mmap = NULL;
1608}
1609
1610int vfio_region_mmap(VFIORegion *region)
1611{
1612 int i, prot = 0;
1613 char *name;
1614
1615 if (!region->mem) {
1616 return 0;
1617 }
1618
1619 prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
1620 prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
1621
1622 for (i = 0; i < region->nr_mmaps; i++) {
1623 region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
1624 MAP_SHARED, region->vbasedev->fd,
1625 region->fd_offset +
1626 region->mmaps[i].offset);
1627 if (region->mmaps[i].mmap == MAP_FAILED) {
1628 int ret = -errno;
1629
1630 trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
1631 region->fd_offset +
1632 region->mmaps[i].offset,
1633 region->fd_offset +
1634 region->mmaps[i].offset +
1635 region->mmaps[i].size - 1, ret);
1636
1637 region->mmaps[i].mmap = NULL;
1638
1639 for (i--; i >= 0; i--) {
1640 vfio_subregion_unmap(region, i);
1641 }
1642
1643 return ret;
1644 }
1645
1646 name = g_strdup_printf("%s mmaps[%d]",
1647 memory_region_name(region->mem), i);
1648 memory_region_init_ram_device_ptr(®ion->mmaps[i].mem,
1649 memory_region_owner(region->mem),
1650 name, region->mmaps[i].size,
1651 region->mmaps[i].mmap);
1652 g_free(name);
1653 memory_region_add_subregion(region->mem, region->mmaps[i].offset,
1654 ®ion->mmaps[i].mem);
1655
1656 trace_vfio_region_mmap(memory_region_name(®ion->mmaps[i].mem),
1657 region->mmaps[i].offset,
1658 region->mmaps[i].offset +
1659 region->mmaps[i].size - 1);
1660 }
1661
1662 return 0;
1663}
1664
1665void vfio_region_unmap(VFIORegion *region)
1666{
1667 int i;
1668
1669 if (!region->mem) {
1670 return;
1671 }
1672
1673 for (i = 0; i < region->nr_mmaps; i++) {
1674 if (region->mmaps[i].mmap) {
1675 vfio_subregion_unmap(region, i);
1676 }
1677 }
1678}
1679
1680void vfio_region_exit(VFIORegion *region)
1681{
1682 int i;
1683
1684 if (!region->mem) {
1685 return;
1686 }
1687
1688 for (i = 0; i < region->nr_mmaps; i++) {
1689 if (region->mmaps[i].mmap) {
1690 memory_region_del_subregion(region->mem, ®ion->mmaps[i].mem);
1691 }
1692 }
1693
1694 trace_vfio_region_exit(region->vbasedev->name, region->nr);
1695}
1696
1697void vfio_region_finalize(VFIORegion *region)
1698{
1699 int i;
1700
1701 if (!region->mem) {
1702 return;
1703 }
1704
1705 for (i = 0; i < region->nr_mmaps; i++) {
1706 if (region->mmaps[i].mmap) {
1707 munmap(region->mmaps[i].mmap, region->mmaps[i].size);
1708 object_unparent(OBJECT(®ion->mmaps[i].mem));
1709 }
1710 }
1711
1712 object_unparent(OBJECT(region->mem));
1713
1714 g_free(region->mem);
1715 g_free(region->mmaps);
1716
1717 trace_vfio_region_finalize(region->vbasedev->name, region->nr);
1718
1719 region->mem = NULL;
1720 region->mmaps = NULL;
1721 region->nr_mmaps = 0;
1722 region->size = 0;
1723 region->flags = 0;
1724 region->nr = 0;
1725}
1726
1727void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
1728{
1729 int i;
1730
1731 if (!region->mem) {
1732 return;
1733 }
1734
1735 for (i = 0; i < region->nr_mmaps; i++) {
1736 if (region->mmaps[i].mmap) {
1737 memory_region_set_enabled(®ion->mmaps[i].mem, enabled);
1738 }
1739 }
1740
1741 trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
1742 enabled);
1743}
1744
1745void vfio_reset_handler(void *opaque)
1746{
1747 VFIOGroup *group;
1748 VFIODevice *vbasedev;
1749
1750 QLIST_FOREACH(group, &vfio_group_list, next) {
1751 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1752 if (vbasedev->dev->realized) {
1753 vbasedev->ops->vfio_compute_needs_reset(vbasedev);
1754 }
1755 }
1756 }
1757
1758 QLIST_FOREACH(group, &vfio_group_list, next) {
1759 QLIST_FOREACH(vbasedev, &group->device_list, next) {
1760 if (vbasedev->dev->realized && vbasedev->needs_reset) {
1761 vbasedev->ops->vfio_hot_reset_multi(vbasedev);
1762 }
1763 }
1764 }
1765}
1766
/*
 * Register @group with the (lazily created) singleton KVM-VFIO device so
 * the kernel can coordinate KVM and VFIO state for it. No-op without KVM.
 * Errors are reported but not propagated; callers treat this as best effort.
 */
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    /* Create the KVM VFIO pseudo-device on first use. */
    if (vfio_kvm_device_fd < 0) {
        struct kvm_create_device cd = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &cd)) {
            error_report("Failed to create KVM VFIO device: %m");
            return;
        }

        vfio_kvm_device_fd = cd.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}
1799
/*
 * Remove @group from the KVM-VFIO device, if it was ever created.
 * Counterpart of vfio_kvm_device_add_group(); best effort.
 */
static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr attr = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    /* Nothing to do if the KVM VFIO device was never created. */
    if (vfio_kvm_device_fd < 0) {
        return;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}
1819
1820static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
1821{
1822 VFIOAddressSpace *space;
1823
1824 QLIST_FOREACH(space, &vfio_address_spaces, list) {
1825 if (space->as == as) {
1826 return space;
1827 }
1828 }
1829
1830
1831 space = g_malloc0(sizeof(*space));
1832 space->as = as;
1833 QLIST_INIT(&space->containers);
1834
1835 QLIST_INSERT_HEAD(&vfio_address_spaces, space, list);
1836
1837 return space;
1838}
1839
1840static void vfio_put_address_space(VFIOAddressSpace *space)
1841{
1842 if (QLIST_EMPTY(&space->containers)) {
1843 QLIST_REMOVE(space, list);
1844 g_free(space);
1845 }
1846}
1847
1848
1849
1850
1851static int vfio_get_iommu_type(VFIOContainer *container,
1852 Error **errp)
1853{
1854 int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
1855 VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
1856 int i;
1857
1858 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
1859 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
1860 return iommu_types[i];
1861 }
1862 }
1863 error_setg(errp, "No available IOMMU models");
1864 return -EINVAL;
1865}
1866
/*
 * Attach @group_fd to the container and select an IOMMU backend for it.
 * Records the chosen type in container->iommu_type. Returns 0 or -errno.
 */
static int vfio_init_container(VFIOContainer *container, int group_fd,
                               Error **errp)
{
    int iommu_type, ret;

    iommu_type = vfio_get_iommu_type(container, errp);
    if (iommu_type < 0) {
        return iommu_type;
    }

    /* A group must be attached before VFIO_SET_IOMMU can succeed. */
    ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
    if (ret) {
        error_setg_errno(errp, errno, "Failed to set group container");
        return -errno;
    }

    while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
        if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
            /*
             * The extension check above can report spapr v2 even when the
             * running platform cannot actually use it (NOTE(review): per
             * the retry logic here, this is only discoverable at
             * VFIO_SET_IOMMU time) — fall back to v1 and retry.
             */
            iommu_type = VFIO_SPAPR_TCE_IOMMU;
            continue;
        }
        error_setg_errno(errp, errno, "Failed to set iommu for container");
        return -errno;
    }

    container->iommu_type = iommu_type;
    return 0;
}
1901
1902static int vfio_get_iommu_info(VFIOContainer *container,
1903 struct vfio_iommu_type1_info **info)
1904{
1905
1906 size_t argsz = sizeof(struct vfio_iommu_type1_info);
1907
1908 *info = g_new0(struct vfio_iommu_type1_info, 1);
1909again:
1910 (*info)->argsz = argsz;
1911
1912 if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
1913 g_free(*info);
1914 *info = NULL;
1915 return -errno;
1916 }
1917
1918 if (((*info)->argsz > argsz)) {
1919 argsz = (*info)->argsz;
1920 *info = g_realloc(*info, argsz);
1921 goto again;
1922 }
1923
1924 return 0;
1925}
1926
1927static struct vfio_info_cap_header *
1928vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
1929{
1930 struct vfio_info_cap_header *hdr;
1931 void *ptr = info;
1932
1933 if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
1934 return NULL;
1935 }
1936
1937 for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
1938 if (hdr->id == id) {
1939 return hdr;
1940 }
1941 }
1942
1943 return NULL;
1944}
1945
1946static void vfio_get_iommu_info_migration(VFIOContainer *container,
1947 struct vfio_iommu_type1_info *info)
1948{
1949 struct vfio_info_cap_header *hdr;
1950 struct vfio_iommu_type1_info_cap_migration *cap_mig;
1951
1952 hdr = vfio_get_iommu_info_cap(info, VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
1953 if (!hdr) {
1954 return;
1955 }
1956
1957 cap_mig = container_of(hdr, struct vfio_iommu_type1_info_cap_migration,
1958 header);
1959
1960
1961
1962
1963
1964 if (cap_mig->pgsize_bitmap & qemu_real_host_page_size) {
1965 container->dirty_pages_supported = true;
1966 container->max_dirty_bitmap_size = cap_mig->max_dirty_bitmap_size;
1967 container->dirty_pgsizes = cap_mig->pgsize_bitmap;
1968 }
1969}
1970
1971static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
1972 Error **errp)
1973{
1974 VFIOContainer *container;
1975 int ret, fd;
1976 VFIOAddressSpace *space;
1977
1978 space = vfio_get_address_space(as);
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011 QLIST_FOREACH(container, &space->containers, next) {
2012 if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
2013 ret = vfio_ram_block_discard_disable(container, true);
2014 if (ret) {
2015 error_setg_errno(errp, -ret,
2016 "Cannot set discarding of RAM broken");
2017 if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
2018 &container->fd)) {
2019 error_report("vfio: error disconnecting group %d from"
2020 " container", group->groupid);
2021 }
2022 return ret;
2023 }
2024 group->container = container;
2025 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2026 vfio_kvm_device_add_group(group);
2027 return 0;
2028 }
2029 }
2030
2031 fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
2032 if (fd < 0) {
2033 error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
2034 ret = -errno;
2035 goto put_space_exit;
2036 }
2037
2038 ret = ioctl(fd, VFIO_GET_API_VERSION);
2039 if (ret != VFIO_API_VERSION) {
2040 error_setg(errp, "supported vfio version: %d, "
2041 "reported version: %d", VFIO_API_VERSION, ret);
2042 ret = -EINVAL;
2043 goto close_fd_exit;
2044 }
2045
2046 container = g_malloc0(sizeof(*container));
2047 container->space = space;
2048 container->fd = fd;
2049 container->error = NULL;
2050 container->dirty_pages_supported = false;
2051 container->dma_max_mappings = 0;
2052 QLIST_INIT(&container->giommu_list);
2053 QLIST_INIT(&container->hostwin_list);
2054 QLIST_INIT(&container->vrdl_list);
2055
2056 ret = vfio_init_container(container, group->fd, errp);
2057 if (ret) {
2058 goto free_container_exit;
2059 }
2060
2061 ret = vfio_ram_block_discard_disable(container, true);
2062 if (ret) {
2063 error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
2064 goto free_container_exit;
2065 }
2066
2067 switch (container->iommu_type) {
2068 case VFIO_TYPE1v2_IOMMU:
2069 case VFIO_TYPE1_IOMMU:
2070 {
2071 struct vfio_iommu_type1_info *info;
2072
2073
2074
2075
2076
2077
2078
2079
2080 ret = vfio_get_iommu_info(container, &info);
2081
2082 if (ret || !(info->flags & VFIO_IOMMU_INFO_PGSIZES)) {
2083
2084 info->iova_pgsizes = 4096;
2085 }
2086 vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes);
2087 container->pgsizes = info->iova_pgsizes;
2088
2089
2090 container->dma_max_mappings = 65535;
2091 if (!ret) {
2092 vfio_get_info_dma_avail(info, &container->dma_max_mappings);
2093 vfio_get_iommu_info_migration(container, info);
2094 }
2095 g_free(info);
2096 break;
2097 }
2098 case VFIO_SPAPR_TCE_v2_IOMMU:
2099 case VFIO_SPAPR_TCE_IOMMU:
2100 {
2101 struct vfio_iommu_spapr_tce_info info;
2102 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
2103
2104
2105
2106
2107
2108
2109 if (!v2) {
2110 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
2111 if (ret) {
2112 error_setg_errno(errp, errno, "failed to enable container");
2113 ret = -errno;
2114 goto enable_discards_exit;
2115 }
2116 } else {
2117 container->prereg_listener = vfio_prereg_listener;
2118
2119 memory_listener_register(&container->prereg_listener,
2120 &address_space_memory);
2121 if (container->error) {
2122 memory_listener_unregister(&container->prereg_listener);
2123 ret = -1;
2124 error_propagate_prepend(errp, container->error,
2125 "RAM memory listener initialization failed: ");
2126 goto enable_discards_exit;
2127 }
2128 }
2129
2130 info.argsz = sizeof(info);
2131 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
2132 if (ret) {
2133 error_setg_errno(errp, errno,
2134 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
2135 ret = -errno;
2136 if (v2) {
2137 memory_listener_unregister(&container->prereg_listener);
2138 }
2139 goto enable_discards_exit;
2140 }
2141
2142 if (v2) {
2143 container->pgsizes = info.ddw.pgsizes;
2144
2145
2146
2147
2148
2149
2150 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
2151 if (ret) {
2152 error_setg_errno(errp, -ret,
2153 "failed to remove existing window");
2154 goto enable_discards_exit;
2155 }
2156 } else {
2157
2158 container->pgsizes = 0x1000;
2159 vfio_host_win_add(container, info.dma32_window_start,
2160 info.dma32_window_start +
2161 info.dma32_window_size - 1,
2162 0x1000);
2163 }
2164 }
2165 }
2166
2167 vfio_kvm_device_add_group(group);
2168
2169 QLIST_INIT(&container->group_list);
2170 QLIST_INSERT_HEAD(&space->containers, container, next);
2171
2172 group->container = container;
2173 QLIST_INSERT_HEAD(&container->group_list, group, container_next);
2174
2175 container->listener = vfio_memory_listener;
2176
2177 memory_listener_register(&container->listener, container->space->as);
2178
2179 if (container->error) {
2180 ret = -1;
2181 error_propagate_prepend(errp, container->error,
2182 "memory listener initialization failed: ");
2183 goto listener_release_exit;
2184 }
2185
2186 container->initialized = true;
2187
2188 return 0;
2189listener_release_exit:
2190 QLIST_REMOVE(group, container_next);
2191 QLIST_REMOVE(container, next);
2192 vfio_kvm_device_del_group(group);
2193 vfio_listener_release(container);
2194
2195enable_discards_exit:
2196 vfio_ram_block_discard_disable(container, false);
2197
2198free_container_exit:
2199 g_free(container);
2200
2201close_fd_exit:
2202 close(fd);
2203
2204put_space_exit:
2205 vfio_put_address_space(space);
2206
2207 return ret;
2208}
2209
/*
 * Detach @group from its container; when the last group leaves, tear the
 * container down entirely (listeners, guest IOMMUs, host windows, fd).
 */
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * Release the listener before VFIO_GROUP_UNSET_CONTAINER
     * (NOTE(review): presumably because unsetting the last group tears
     * down the kernel-side IOMMU context the listener maps into —
     * confirm against kernel VFIO semantics before reordering).
     */
    if (QLIST_EMPTY(&container->group_list)) {
        vfio_listener_release(container);
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    if (QLIST_EMPTY(&container->group_list)) {
        VFIOAddressSpace *space = container->space;
        VFIOGuestIOMMU *giommu, *tmp;
        VFIOHostDMAWindow *hostwin, *next;

        QLIST_REMOVE(container, next);

        /* Drop all guest IOMMU notifiers registered for this container. */
        QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
            memory_region_unregister_iommu_notifier(
                    MEMORY_REGION(giommu->iommu), &giommu->n);
            QLIST_REMOVE(giommu, giommu_next);
            g_free(giommu);
        }

        /* Free the recorded host DMA windows. */
        QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
                           next) {
            QLIST_REMOVE(hostwin, hostwin_next);
            g_free(hostwin);
        }

        trace_vfio_disconnect_container(container->fd);
        close(container->fd);
        g_free(container);

        vfio_put_address_space(space);
    }
}
2258
/*
 * Look up or create the VFIOGroup for /dev/vfio/<groupid> and connect it
 * to a container in @as. A group may only be used in one address space.
 * Returns NULL (with @errp set) on failure.
 */
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    VFIOGroup *group;
    char path[32];
    struct vfio_group_status status = { .argsz = sizeof(status) };

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid == groupid) {
            /* Found it.  Now is it already in the right context? */
            if (group->container->space->as == as) {
                return group;
            } else {
                error_setg(errp, "group %d used in multiple address spaces",
                           group->groupid);
                return NULL;
            }
        }
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open_old(path, O_RDWR);
    if (group->fd < 0) {
        error_setg_errno(errp, errno, "failed to open %s", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    /* A group is viable only when all its devices are vfio-bound. */
    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    /* First group registers the global reset handler. */
    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}
2325
2326void vfio_put_group(VFIOGroup *group)
2327{
2328 if (!group || !QLIST_EMPTY(&group->device_list)) {
2329 return;
2330 }
2331
2332 if (!group->ram_block_discard_allowed) {
2333 vfio_ram_block_discard_disable(group->container, false);
2334 }
2335 vfio_kvm_device_del_group(group);
2336 vfio_disconnect_container(group);
2337 QLIST_REMOVE(group, next);
2338 trace_vfio_put_group(group->fd);
2339 close(group->fd);
2340 g_free(group);
2341
2342 if (QLIST_EMPTY(&vfio_group_list)) {
2343 qemu_unregister_reset(vfio_reset_handler, NULL);
2344 }
2345}
2346
/*
 * Obtain the device fd for @name from @group, query its info, and attach
 * @vbasedev to the group's device list. Also reconciles the per-group
 * RAM block discard (ballooning) policy. Returns 0 or a negative value
 * with @errp set.
 */
int vfio_get_device(VFIOGroup *group, const char *name,
                    VFIODevice *vbasedev, Error **errp)
{
    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
    int ret, fd;

    fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
    if (fd < 0) {
        error_setg_errno(errp, errno, "error getting device from group %d",
                         group->groupid);
        error_append_hint(errp,
                      "Verify all devices in group %d are bound to vfio-<bus> "
                      "or pci-stub and not already in use\n", group->groupid);
        return fd;
    }

    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
    if (ret) {
        error_setg_errno(errp, errno, "error getting device info");
        close(fd);
        return ret;
    }

    /*
     * All devices in a group must agree on whether RAM block discarding
     * is allowed: the first device sets the group policy, later ones
     * must match it.
     */
    if (vbasedev->ram_block_discard_allowed !=
        group->ram_block_discard_allowed) {
        if (!QLIST_EMPTY(&group->device_list)) {
            error_setg(errp, "Inconsistent setting of support for discarding "
                       "RAM (e.g., balloon) within group");
            close(fd);
            return -1;
        }

        if (!group->ram_block_discard_allowed) {
            group->ram_block_discard_allowed = true;
            vfio_ram_block_discard_disable(group->container, false);
        }
    }

    vbasedev->fd = fd;
    vbasedev->group = group;
    QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);

    vbasedev->num_irqs = dev_info.num_irqs;
    vbasedev->num_regions = dev_info.num_regions;
    vbasedev->flags = dev_info.flags;

    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
                          dev_info.num_irqs);

    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
    return 0;
}
2405
2406void vfio_put_base_device(VFIODevice *vbasedev)
2407{
2408 if (!vbasedev->group) {
2409 return;
2410 }
2411 QLIST_REMOVE(vbasedev, next);
2412 vbasedev->group = NULL;
2413 trace_vfio_put_base_device(vbasedev->fd);
2414 close(vbasedev->fd);
2415}
2416
2417int vfio_get_region_info(VFIODevice *vbasedev, int index,
2418 struct vfio_region_info **info)
2419{
2420 size_t argsz = sizeof(struct vfio_region_info);
2421
2422 *info = g_malloc0(argsz);
2423
2424 (*info)->index = index;
2425retry:
2426 (*info)->argsz = argsz;
2427
2428 if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
2429 g_free(*info);
2430 *info = NULL;
2431 return -errno;
2432 }
2433
2434 if ((*info)->argsz > argsz) {
2435 argsz = (*info)->argsz;
2436 *info = g_realloc(*info, argsz);
2437
2438 goto retry;
2439 }
2440
2441 return 0;
2442}
2443
2444int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
2445 uint32_t subtype, struct vfio_region_info **info)
2446{
2447 int i;
2448
2449 for (i = 0; i < vbasedev->num_regions; i++) {
2450 struct vfio_info_cap_header *hdr;
2451 struct vfio_region_info_cap_type *cap_type;
2452
2453 if (vfio_get_region_info(vbasedev, i, info)) {
2454 continue;
2455 }
2456
2457 hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
2458 if (!hdr) {
2459 g_free(*info);
2460 continue;
2461 }
2462
2463 cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
2464
2465 trace_vfio_get_dev_region(vbasedev->name, i,
2466 cap_type->type, cap_type->subtype);
2467
2468 if (cap_type->type == type && cap_type->subtype == subtype) {
2469 return 0;
2470 }
2471
2472 g_free(*info);
2473 }
2474
2475 *info = NULL;
2476 return -ENODEV;
2477}
2478
2479bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
2480{
2481 struct vfio_region_info *info = NULL;
2482 bool ret = false;
2483
2484 if (!vfio_get_region_info(vbasedev, region, &info)) {
2485 if (vfio_get_region_info_cap(info, cap_type)) {
2486 ret = true;
2487 }
2488 g_free(info);
2489 }
2490
2491 return ret;
2492}
2493
2494
2495
2496
2497static bool vfio_eeh_container_ok(VFIOContainer *container)
2498{
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514 if (QLIST_EMPTY(&container->group_list)) {
2515 return false;
2516 }
2517
2518 if (QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next)) {
2519 return false;
2520 }
2521
2522 return true;
2523}
2524
2525static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
2526{
2527 struct vfio_eeh_pe_op pe_op = {
2528 .argsz = sizeof(pe_op),
2529 .op = op,
2530 };
2531 int ret;
2532
2533 if (!vfio_eeh_container_ok(container)) {
2534 error_report("vfio/eeh: EEH_PE_OP 0x%x: "
2535 "kernel requires a container with exactly one group", op);
2536 return -EPERM;
2537 }
2538
2539 ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
2540 if (ret < 0) {
2541 error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
2542 return -errno;
2543 }
2544
2545 return ret;
2546}
2547
2548static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
2549{
2550 VFIOAddressSpace *space = vfio_get_address_space(as);
2551 VFIOContainer *container = NULL;
2552
2553 if (QLIST_EMPTY(&space->containers)) {
2554
2555 goto out;
2556 }
2557
2558 container = QLIST_FIRST(&space->containers);
2559
2560 if (QLIST_NEXT(container, next)) {
2561
2562
2563 container = NULL;
2564 goto out;
2565 }
2566
2567out:
2568 vfio_put_address_space(space);
2569 return container;
2570}
2571
2572bool vfio_eeh_as_ok(AddressSpace *as)
2573{
2574 VFIOContainer *container = vfio_eeh_as_container(as);
2575
2576 return (container != NULL) && vfio_eeh_container_ok(container);
2577}
2578
2579int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
2580{
2581 VFIOContainer *container = vfio_eeh_as_container(as);
2582
2583 if (!container) {
2584 return -ENODEV;
2585 }
2586 return vfio_eeh_container_op(container, op);
2587}
2588