// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi/Intel
 * IOMMU, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this.  We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory.  We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>
#include <linux/mdev.h>
#include <linux/notifier.h>
#include <linux/dma-iommu.h>
#include <linux/irqdomain.h>

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");

static bool disable_hugepages;
module_param_named(disable_hugepages,
		   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
		 "Disable VFIO IOMMU support for IOMMU hugepages.");

static unsigned int dma_entry_limit __read_mostly = U16_MAX;
module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
MODULE_PARM_DESC(dma_entry_limit,
		 "Maximum number of user DMA mappings per container (65535).");
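
/*
 * Example (illustrative, not part of the driver): the two bool parameters
 * above are registered with S_IRUGO | S_IWUSR and dma_entry_limit with 0644,
 * so besides being set at load time,
 *
 *	modprobe vfio_iommu_type1 allow_unsafe_interrupts=1
 *
 * they can also be changed at runtime through sysfs, e.g.:
 *
 *	echo 1 > /sys/module/vfio_iommu_type1/parameters/disable_hugepages
 *
 * dma_entry_limit bounds how many VFIO_IOMMU_MAP_DMA entries a single
 * container may hold; it seeds dma_avail in struct vfio_iommu below.
 */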
63
64struct vfio_iommu {
65 struct list_head domain_list;
66 struct list_head iova_list;
67 struct vfio_domain *external_domain;
68 struct mutex lock;
69 struct rb_root dma_list;
70 struct blocking_notifier_head notifier;
71 unsigned int dma_avail;
72 uint64_t pgsize_bitmap;
73 bool v2;
74 bool nesting;
75 bool dirty_page_tracking;
76 bool pinned_page_dirty_scope;
77};
78
79struct vfio_domain {
80 struct iommu_domain *domain;
81 struct list_head next;
82 struct list_head group_list;
83 int prot;
84 bool fgsp;
85};
86
87struct vfio_dma {
88 struct rb_node node;
89 dma_addr_t iova;
90 unsigned long vaddr;
91 size_t size;
92 int prot;
93 bool iommu_mapped;
94 bool lock_cap;
95 struct task_struct *task;
96 struct rb_root pfn_list;
97 unsigned long *bitmap;
98};
99
100struct vfio_group {
101 struct iommu_group *iommu_group;
102 struct list_head next;
103 bool mdev_group;
104 bool pinned_page_dirty_scope;
105};
106
107struct vfio_iova {
108 struct list_head list;
109 dma_addr_t start;
110 dma_addr_t end;
111};
112
/*
 * Guest RAM pinning working set or DMA target
 */
116struct vfio_pfn {
117 struct rb_node node;
118 dma_addr_t iova;
119 unsigned long pfn;
120 unsigned int ref_count;
121};
122
123struct vfio_regions {
124 struct list_head list;
125 dma_addr_t iova;
126 phys_addr_t phys;
127 size_t len;
128};
129
130#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
131 (!list_empty(&iommu->domain_list))
132
133#define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
134
/*
 * Input argument of number of bits to bitmap_set() is unsigned integer, which
 * further casts to signed integer for unaligned multi-bit operation,
 * __bitmap_set().
 * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
 * that is 2^28 (256MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
 * size.
 */
143#define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX)
144#define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
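
/*
 * Worked example (illustrative): with a 4KB IOMMU page size, a 1GB mapping
 * covers 262144 pages, so DIRTY_BITMAP_BYTES(262144) = ALIGN(262144, 64) / 8
 * = 32KB of bitmap.  DIRTY_BITMAP_PAGES_MAX (INT_MAX pages) therefore caps a
 * single bitmap at 256MB, i.e. an 8TB mapping at 4KB granularity.
 */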
145
146static int put_pfn(unsigned long pfn, int prot);
147
148static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
149 struct iommu_group *iommu_group);
150
151static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
152
/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
157static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
158 dma_addr_t start, size_t size)
159{
160 struct rb_node *node = iommu->dma_list.rb_node;
161
162 while (node) {
163 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
164
165 if (start + size <= dma->iova)
166 node = node->rb_left;
167 else if (start >= dma->iova + dma->size)
168 node = node->rb_right;
169 else
170 return dma;
171 }
172
173 return NULL;
174}
175
176static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
177{
178 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
179 struct vfio_dma *dma;
180
181 while (*link) {
182 parent = *link;
183 dma = rb_entry(parent, struct vfio_dma, node);
184
185 if (new->iova + new->size <= dma->iova)
186 link = &(*link)->rb_left;
187 else
188 link = &(*link)->rb_right;
189 }
190
191 rb_link_node(&new->node, parent, link);
192 rb_insert_color(&new->node, &iommu->dma_list);
193}
194
195static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
196{
197 rb_erase(&old->node, &iommu->dma_list);
198}
199
200
201static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
202{
203 uint64_t npages = dma->size / pgsize;
204
205 if (npages > DIRTY_BITMAP_PAGES_MAX)
206 return -EINVAL;

	/*
	 * Allocate an extra u64 beyond the bitmap proper so that
	 * update_user_bitmap() can shift the bitmap when the mapping does
	 * not start on a 64-bit boundary of the user-supplied buffer.
	 */
213 dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
214 GFP_KERNEL);
215 if (!dma->bitmap)
216 return -ENOMEM;
217
218 return 0;
219}
220
221static void vfio_dma_bitmap_free(struct vfio_dma *dma)
222{
223 kfree(dma->bitmap);
224 dma->bitmap = NULL;
225}
226
227static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
228{
229 struct rb_node *p;
230 unsigned long pgshift = __ffs(pgsize);
231
232 for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
233 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
234
235 bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
236 }
237}
238
239static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
240{
241 struct rb_node *n;
242
243 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
244 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
245 int ret;
246
247 ret = vfio_dma_bitmap_alloc(dma, pgsize);
248 if (ret) {
249 struct rb_node *p;
250
251 for (p = rb_prev(n); p; p = rb_prev(p)) {
				struct vfio_dma *dma = rb_entry(p,
						struct vfio_dma, node);
254
255 vfio_dma_bitmap_free(dma);
256 }
257 return ret;
258 }
259 vfio_dma_populate_bitmap(dma, pgsize);
260 }
261 return 0;
262}
263
264static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
265{
266 struct rb_node *n;
267
268 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
269 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
270
271 vfio_dma_bitmap_free(dma);
272 }
273}
274
/*
 * Helper Functions for host iova-pfn list
 */
278static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
279{
280 struct vfio_pfn *vpfn;
281 struct rb_node *node = dma->pfn_list.rb_node;
282
283 while (node) {
284 vpfn = rb_entry(node, struct vfio_pfn, node);
285
286 if (iova < vpfn->iova)
287 node = node->rb_left;
288 else if (iova > vpfn->iova)
289 node = node->rb_right;
290 else
291 return vpfn;
292 }
293 return NULL;
294}
295
296static void vfio_link_pfn(struct vfio_dma *dma,
297 struct vfio_pfn *new)
298{
299 struct rb_node **link, *parent = NULL;
300 struct vfio_pfn *vpfn;
301
302 link = &dma->pfn_list.rb_node;
303 while (*link) {
304 parent = *link;
305 vpfn = rb_entry(parent, struct vfio_pfn, node);
306
307 if (new->iova < vpfn->iova)
308 link = &(*link)->rb_left;
309 else
310 link = &(*link)->rb_right;
311 }
312
313 rb_link_node(&new->node, parent, link);
314 rb_insert_color(&new->node, &dma->pfn_list);
315}
316
317static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
318{
319 rb_erase(&old->node, &dma->pfn_list);
320}
321
322static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
323 unsigned long pfn)
324{
325 struct vfio_pfn *vpfn;
326
327 vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
328 if (!vpfn)
329 return -ENOMEM;
330
331 vpfn->iova = iova;
332 vpfn->pfn = pfn;
333 vpfn->ref_count = 1;
334 vfio_link_pfn(dma, vpfn);
335 return 0;
336}
337
338static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
339 struct vfio_pfn *vpfn)
340{
341 vfio_unlink_pfn(dma, vpfn);
342 kfree(vpfn);
343}
344
345static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
346 unsigned long iova)
347{
348 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
349
350 if (vpfn)
351 vpfn->ref_count++;
352 return vpfn;
353}
354
355static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
356{
357 int ret = 0;
358
359 vpfn->ref_count--;
360 if (!vpfn->ref_count) {
361 ret = put_pfn(vpfn->pfn, dma->prot);
362 vfio_remove_from_pfn_list(dma, vpfn);
363 }
364 return ret;
365}
366
367static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
368{
369 struct mm_struct *mm;
370 int ret;
371
372 if (!npage)
373 return 0;
374
375 mm = async ? get_task_mm(dma->task) : dma->task->mm;
376 if (!mm)
377 return -ESRCH;
378
379 ret = mmap_write_lock_killable(mm);
380 if (!ret) {
381 ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
382 dma->lock_cap);
383 mmap_write_unlock(mm);
384 }
385
386 if (async)
387 mmput(mm);
388
389 return ret;
390}
391
/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 * For compound pages, any driver that sets the reserved bit in head
 * page needs to set the reserved bit in all subpages to be safe.
 */
399static bool is_invalid_reserved_pfn(unsigned long pfn)
400{
401 if (pfn_valid(pfn))
402 return PageReserved(pfn_to_page(pfn));
403
404 return true;
405}
406
407static int put_pfn(unsigned long pfn, int prot)
408{
409 if (!is_invalid_reserved_pfn(pfn)) {
410 struct page *page = pfn_to_page(pfn);
411
412 unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
413 return 1;
414 }
415 return 0;
416}
417
418static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
419 unsigned long vaddr, unsigned long *pfn,
420 bool write_fault)
421{
422 int ret;
423
424 ret = follow_pfn(vma, vaddr, pfn);
425 if (ret) {
426 bool unlocked = false;
427
428 ret = fixup_user_fault(mm, vaddr,
429 FAULT_FLAG_REMOTE |
430 (write_fault ? FAULT_FLAG_WRITE : 0),
431 &unlocked);
432 if (unlocked)
433 return -EAGAIN;
434
435 if (ret)
436 return ret;
437
438 ret = follow_pfn(vma, vaddr, pfn);
439 }
440
441 return ret;
442}
443
444static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
445 int prot, unsigned long *pfn)
446{
447 struct page *page[1];
448 struct vm_area_struct *vma;
449 unsigned int flags = 0;
450 int ret;
451
452 if (prot & IOMMU_WRITE)
453 flags |= FOLL_WRITE;
454
455 mmap_read_lock(mm);
456 ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
457 page, NULL, NULL);
458 if (ret == 1) {
459 *pfn = page_to_pfn(page[0]);
460 ret = 0;
461 goto done;
462 }
463
464 vaddr = untagged_addr(vaddr);
465
466retry:
467 vma = find_vma_intersection(mm, vaddr, vaddr + 1);
468
469 if (vma && vma->vm_flags & VM_PFNMAP) {
470 ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
471 if (ret == -EAGAIN)
472 goto retry;
473
474 if (!ret && !is_invalid_reserved_pfn(*pfn))
475 ret = -EFAULT;
476 }
477done:
478 mmap_read_unlock(mm);
479 return ret;
480}
481
/*
 * Attempt to pin pages.  We really don't want to track all the pfns and
 * the iommu can only map chunks of consecutive pfns anyway, so get the
 * first page and all consecutive pages with the same locking.
 */
487static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
488 long npage, unsigned long *pfn_base,
489 unsigned long limit)
490{
491 unsigned long pfn = 0;
492 long ret, pinned = 0, lock_acct = 0;
493 bool rsvd;
494 dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
495
496
497 if (!current->mm)
498 return -ENODEV;
499
500 ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
501 if (ret)
502 return ret;
503
504 pinned++;
505 rsvd = is_invalid_reserved_pfn(*pfn_base);
506
	/*
	 * Reserved pages aren't counted against the user, externally pinned
	 * pages are already counted against the user.
	 */
511 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
512 if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
513 put_pfn(*pfn_base, dma->prot);
514 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
515 limit << PAGE_SHIFT);
516 return -ENOMEM;
517 }
518 lock_acct++;
519 }
520
521 if (unlikely(disable_hugepages))
522 goto out;
523
524
525 for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
526 pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
527 ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
528 if (ret)
529 break;
530
531 if (pfn != *pfn_base + pinned ||
532 rsvd != is_invalid_reserved_pfn(pfn)) {
533 put_pfn(pfn, dma->prot);
534 break;
535 }
536
537 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
538 if (!dma->lock_cap &&
539 current->mm->locked_vm + lock_acct + 1 > limit) {
540 put_pfn(pfn, dma->prot);
541 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
542 __func__, limit << PAGE_SHIFT);
543 ret = -ENOMEM;
544 goto unpin_out;
545 }
546 lock_acct++;
547 }
548 }
549
550out:
551 ret = vfio_lock_acct(dma, lock_acct, false);
552
553unpin_out:
554 if (ret) {
555 if (!rsvd) {
556 for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
557 put_pfn(pfn, dma->prot);
558 }
559
560 return ret;
561 }
562
563 return pinned;
564}
565
566static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
567 unsigned long pfn, long npage,
568 bool do_accounting)
569{
570 long unlocked = 0, locked = 0;
571 long i;
572
573 for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
574 if (put_pfn(pfn++, dma->prot)) {
575 unlocked++;
576 if (vfio_find_vpfn(dma, iova))
577 locked++;
578 }
579 }
580
581 if (do_accounting)
582 vfio_lock_acct(dma, locked - unlocked, true);
583
584 return unlocked;
585}
586
587static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
588 unsigned long *pfn_base, bool do_accounting)
589{
590 struct mm_struct *mm;
591 int ret;
592
593 mm = get_task_mm(dma->task);
594 if (!mm)
595 return -ENODEV;
596
597 ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
598 if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
599 ret = vfio_lock_acct(dma, 1, true);
600 if (ret) {
601 put_pfn(*pfn_base, dma->prot);
602 if (ret == -ENOMEM)
603 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
604 "(%ld) exceeded\n", __func__,
605 dma->task->comm, task_pid_nr(dma->task),
606 task_rlimit(dma->task, RLIMIT_MEMLOCK));
607 }
608 }
609
610 mmput(mm);
611 return ret;
612}
613
614static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
615 bool do_accounting)
616{
617 int unlocked;
618 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
619
620 if (!vpfn)
621 return 0;
622
623 unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
624
625 if (do_accounting)
626 vfio_lock_acct(dma, -unlocked, true);
627
628 return unlocked;
629}
630
631static int vfio_iommu_type1_pin_pages(void *iommu_data,
632 struct iommu_group *iommu_group,
633 unsigned long *user_pfn,
634 int npage, int prot,
635 unsigned long *phys_pfn)
636{
637 struct vfio_iommu *iommu = iommu_data;
638 struct vfio_group *group;
639 int i, j, ret;
640 unsigned long remote_vaddr;
641 struct vfio_dma *dma;
642 bool do_accounting;
643
644 if (!iommu || !user_pfn || !phys_pfn)
645 return -EINVAL;
646
647
648 if (!iommu->v2)
649 return -EACCES;
650
651 mutex_lock(&iommu->lock);
652
653
654 if (!iommu->notifier.head) {
655 ret = -EINVAL;
656 goto pin_done;
657 }
658
	/*
	 * If an IOMMU-backed domain exists in the container, the pages of
	 * every DMA mapping are already pinned and accounted against the
	 * user's locked memory, so skip accounting here to avoid counting
	 * the same pages twice.
	 */
664 do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
665
666 for (i = 0; i < npage; i++) {
667 dma_addr_t iova;
668 struct vfio_pfn *vpfn;
669
670 iova = user_pfn[i] << PAGE_SHIFT;
671 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
672 if (!dma) {
673 ret = -EINVAL;
674 goto pin_unwind;
675 }
676
677 if ((dma->prot & prot) != prot) {
678 ret = -EPERM;
679 goto pin_unwind;
680 }
681
682 vpfn = vfio_iova_get_vfio_pfn(dma, iova);
683 if (vpfn) {
684 phys_pfn[i] = vpfn->pfn;
685 continue;
686 }
687
688 remote_vaddr = dma->vaddr + (iova - dma->iova);
689 ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
690 do_accounting);
691 if (ret)
692 goto pin_unwind;
693
694 ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
695 if (ret) {
696 vfio_unpin_page_external(dma, iova, do_accounting);
697 goto pin_unwind;
698 }
699
700 if (iommu->dirty_page_tracking) {
701 unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
702
			/*
			 * Bitmap populated with the smallest supported page
			 * size
			 */
707 bitmap_set(dma->bitmap,
708 (iova - dma->iova) >> pgshift, 1);
709 }
710 }
711 ret = i;
712
713 group = vfio_iommu_find_iommu_group(iommu, iommu_group);
714 if (!group->pinned_page_dirty_scope) {
715 group->pinned_page_dirty_scope = true;
716 update_pinned_page_dirty_scope(iommu);
717 }
718
719 goto pin_done;
720
721pin_unwind:
722 phys_pfn[i] = 0;
723 for (j = 0; j < i; j++) {
724 dma_addr_t iova;
725
726 iova = user_pfn[j] << PAGE_SHIFT;
727 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
728 vfio_unpin_page_external(dma, iova, do_accounting);
729 phys_pfn[j] = 0;
730 }
731pin_done:
732 mutex_unlock(&iommu->lock);
733 return ret;
734}
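
/*
 * Illustrative sketch (not part of this file): mdev vendor drivers reach the
 * pin/unpin callbacks above through the vfio core wrappers, roughly:
 *
 *	unsigned long user_pfn = gpa >> PAGE_SHIFT, phys_pfn;
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), &user_pfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &phys_pfn);
 *	...
 *	vfio_unpin_pages(mdev_dev(mdev), &user_pfn, 1);
 *
 * The snippet assumes the vfio_pin_pages()/vfio_unpin_pages() prototypes from
 * include/linux/vfio.h matching this version of the driver; gpa and mdev are
 * hypothetical variables owned by the vendor driver.
 */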
735
736static int vfio_iommu_type1_unpin_pages(void *iommu_data,
737 unsigned long *user_pfn,
738 int npage)
739{
740 struct vfio_iommu *iommu = iommu_data;
741 bool do_accounting;
742 int i;
743
744 if (!iommu || !user_pfn)
745 return -EINVAL;
746
747
748 if (!iommu->v2)
749 return -EACCES;
750
751 mutex_lock(&iommu->lock);
752
753 do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
754 for (i = 0; i < npage; i++) {
755 struct vfio_dma *dma;
756 dma_addr_t iova;
757
758 iova = user_pfn[i] << PAGE_SHIFT;
759 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
760 if (!dma)
761 goto unpin_exit;
762 vfio_unpin_page_external(dma, iova, do_accounting);
763 }
764
765unpin_exit:
766 mutex_unlock(&iommu->lock);
767 return i > npage ? npage : (i > 0 ? i : -EINVAL);
768}
769
770static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
771 struct list_head *regions,
772 struct iommu_iotlb_gather *iotlb_gather)
773{
774 long unlocked = 0;
775 struct vfio_regions *entry, *next;
776
777 iommu_tlb_sync(domain->domain, iotlb_gather);
778
779 list_for_each_entry_safe(entry, next, regions, list) {
780 unlocked += vfio_unpin_pages_remote(dma,
781 entry->iova,
782 entry->phys >> PAGE_SHIFT,
783 entry->len >> PAGE_SHIFT,
784 false);
785 list_del(&entry->list);
786 kfree(entry);
787 }
788
789 cond_resched();
790
791 return unlocked;
792}
793
/*
 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
 * track of these regions (currently using a list).
 *
 * This value specifies maximum number of regions for each IOTLB flush sync.
 */
801#define VFIO_IOMMU_TLB_SYNC_MAX 512
802
803static size_t unmap_unpin_fast(struct vfio_domain *domain,
804 struct vfio_dma *dma, dma_addr_t *iova,
805 size_t len, phys_addr_t phys, long *unlocked,
806 struct list_head *unmapped_list,
807 int *unmapped_cnt,
808 struct iommu_iotlb_gather *iotlb_gather)
809{
810 size_t unmapped = 0;
811 struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
812
813 if (entry) {
814 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
815 iotlb_gather);
816
817 if (!unmapped) {
818 kfree(entry);
819 } else {
820 entry->iova = *iova;
821 entry->phys = phys;
822 entry->len = unmapped;
823 list_add_tail(&entry->list, unmapped_list);
824
825 *iova += unmapped;
826 (*unmapped_cnt)++;
827 }
828 }
829
	/*
	 * Sync if the number of fast-unmap regions hits the limit
	 * or in case of errors.
	 */
834 if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
835 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
836 iotlb_gather);
837 *unmapped_cnt = 0;
838 }
839
840 return unmapped;
841}
842
843static size_t unmap_unpin_slow(struct vfio_domain *domain,
844 struct vfio_dma *dma, dma_addr_t *iova,
845 size_t len, phys_addr_t phys,
846 long *unlocked)
847{
848 size_t unmapped = iommu_unmap(domain->domain, *iova, len);
849
850 if (unmapped) {
851 *unlocked += vfio_unpin_pages_remote(dma, *iova,
852 phys >> PAGE_SHIFT,
853 unmapped >> PAGE_SHIFT,
854 false);
855 *iova += unmapped;
856 cond_resched();
857 }
858 return unmapped;
859}
860
861static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
862 bool do_accounting)
863{
864 dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
865 struct vfio_domain *domain, *d;
866 LIST_HEAD(unmapped_region_list);
867 struct iommu_iotlb_gather iotlb_gather;
868 int unmapped_region_cnt = 0;
869 long unlocked = 0;
870
871 if (!dma->size)
872 return 0;
873
874 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
875 return 0;
876
	/*
	 * We use the IOMMU to track the physical addresses, otherwise we'd
	 * need a much more complicated tracking system.  Unfortunately that
	 * means we need to use one of the iommu domains to figure out the
	 * pfns to unpin.  The rest need to be unmapped in advance so we have
	 * no iommu translations remaining when the pages are unpinned.
	 */
884 domain = d = list_first_entry(&iommu->domain_list,
885 struct vfio_domain, next);
886
887 list_for_each_entry_continue(d, &iommu->domain_list, next) {
888 iommu_unmap(d->domain, dma->iova, dma->size);
889 cond_resched();
890 }
891
892 iommu_iotlb_gather_init(&iotlb_gather);
893 while (iova < end) {
894 size_t unmapped, len;
895 phys_addr_t phys, next;
896
897 phys = iommu_iova_to_phys(domain->domain, iova);
898 if (WARN_ON(!phys)) {
899 iova += PAGE_SIZE;
900 continue;
901 }
902
		/*
		 * To optimize for fewer iommu_unmap() calls, each of which
		 * may require hardware cache flushing, try to find the
		 * largest contiguous physical memory chunk to unmap.
		 */
908 for (len = PAGE_SIZE;
909 !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
910 next = iommu_iova_to_phys(domain->domain, iova + len);
911 if (next != phys + len)
912 break;
913 }
914
		/*
		 * First, try to use fast unmap/unpin. In case of failure,
		 * switch to slow unmap/unpin path.
		 */
919 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
920 &unlocked, &unmapped_region_list,
921 &unmapped_region_cnt,
922 &iotlb_gather);
923 if (!unmapped) {
924 unmapped = unmap_unpin_slow(domain, dma, &iova, len,
925 phys, &unlocked);
926 if (WARN_ON(!unmapped))
927 break;
928 }
929 }
930
931 dma->iommu_mapped = false;
932
933 if (unmapped_region_cnt) {
934 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
935 &iotlb_gather);
936 }
937
938 if (do_accounting) {
939 vfio_lock_acct(dma, -unlocked, true);
940 return 0;
941 }
942 return unlocked;
943}
944
945static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
946{
947 vfio_unmap_unpin(iommu, dma, true);
948 vfio_unlink_dma(iommu, dma);
949 put_task_struct(dma->task);
950 vfio_dma_bitmap_free(dma);
951 kfree(dma);
952 iommu->dma_avail++;
953}
954
955static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
956{
957 struct vfio_domain *domain;
958
959 iommu->pgsize_bitmap = ULONG_MAX;
960
961 list_for_each_entry(domain, &iommu->domain_list, next)
962 iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
963
	/*
	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
	 * That way the user will be able to map/unmap buffers whose size/
	 * start address is aligned with PAGE_SIZE.  Pinning code uses that
	 * granularity while iommu driver can use the sub-PAGE_SIZE size
	 * directly.
	 */
972 if (iommu->pgsize_bitmap & ~PAGE_MASK) {
973 iommu->pgsize_bitmap &= PAGE_MASK;
974 iommu->pgsize_bitmap |= PAGE_SIZE;
975 }
976}
977
978static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
979 struct vfio_dma *dma, dma_addr_t base_iova,
980 size_t pgsize)
981{
982 unsigned long pgshift = __ffs(pgsize);
983 unsigned long nbits = dma->size >> pgshift;
984 unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
985 unsigned long copy_offset = bit_offset / BITS_PER_LONG;
986 unsigned long shift = bit_offset % BITS_PER_LONG;
987 unsigned long leftover;
988
	/*
	 * Mark all pages dirty if any IOMMU capable device is not able
	 * to report dirty pages and all pages are pinned and mapped.
	 */
993 if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped)
994 bitmap_set(dma->bitmap, 0, nbits);
995
996 if (shift) {
997 bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
998 nbits + shift);
999
1000 if (copy_from_user(&leftover,
1001 (void __user *)(bitmap + copy_offset),
1002 sizeof(leftover)))
1003 return -EFAULT;
1004
1005 bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1006 }
1007
1008 if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1009 DIRTY_BITMAP_BYTES(nbits + shift)))
1010 return -EFAULT;
1011
1012 return 0;
1013}
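
/*
 * Worked example (illustrative): with base_iova = 0, dma->iova = 0x5000 and
 * 4KB pages, bit_offset is 5, so copy_offset is 0 and shift is 5.  The local
 * bitmap is shifted left by 5 bits, OR'ed with the low 5 bits already present
 * in the user's first u64, and only then copied out, so neighbouring vfio_dma
 * ranges that share that u64 are not clobbered.
 */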
1014
1015static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1016 dma_addr_t iova, size_t size, size_t pgsize)
1017{
1018 struct vfio_dma *dma;
1019 struct rb_node *n;
1020 unsigned long pgshift = __ffs(pgsize);
1021 int ret;
1022
	/*
	 * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
	 * vfio_dma mappings may be clubbed by specifying large ranges, but
	 * there must not be any previous mappings bisected by the range.
	 * An error will be returned if these conditions are not met.
	 */
1029 dma = vfio_find_dma(iommu, iova, 1);
1030 if (dma && dma->iova != iova)
1031 return -EINVAL;
1032
1033 dma = vfio_find_dma(iommu, iova + size - 1, 0);
1034 if (dma && dma->iova + dma->size != iova + size)
1035 return -EINVAL;
1036
1037 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1038 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1039
1040 if (dma->iova < iova)
1041 continue;
1042
1043 if (dma->iova > iova + size - 1)
1044 break;
1045
1046 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1047 if (ret)
1048 return ret;
1049
		/*
		 * Re-populate bitmap to include all pinned pages which are
		 * considered as dirty but exclude pages which are unpinned
		 * and pages which are marked dirty by vfio_dma_rw().
		 */
1055 bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1056 vfio_dma_populate_bitmap(dma, pgsize);
1057 }
1058 return 0;
1059}
1060
1061static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1062{
1063 if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1064 (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1065 return -EINVAL;
1066
1067 return 0;
1068}
1069
1070static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1071 struct vfio_iommu_type1_dma_unmap *unmap,
1072 struct vfio_bitmap *bitmap)
1073{
1074 struct vfio_dma *dma, *dma_last = NULL;
1075 size_t unmapped = 0, pgsize;
1076 int ret = 0, retries = 0;
1077 unsigned long pgshift;
1078
1079 mutex_lock(&iommu->lock);
1080
1081 pgshift = __ffs(iommu->pgsize_bitmap);
1082 pgsize = (size_t)1 << pgshift;
1083
1084 if (unmap->iova & (pgsize - 1)) {
1085 ret = -EINVAL;
1086 goto unlock;
1087 }
1088
1089 if (!unmap->size || unmap->size & (pgsize - 1)) {
1090 ret = -EINVAL;
1091 goto unlock;
1092 }
1093
1094 if (unmap->iova + unmap->size - 1 < unmap->iova ||
1095 unmap->size > SIZE_MAX) {
1096 ret = -EINVAL;
1097 goto unlock;
1098 }
1099
1100
1101 if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1102 (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1103 ret = -EINVAL;
1104 goto unlock;
1105 }
1106
1107 WARN_ON((pgsize - 1) & PAGE_MASK);
1108again:
	/*
	 * In the original vfio type1 (v1) interface, user mappings were
	 * coalesced together so individual mapping granularity was not
	 * tracked and the user could attempt to unmap any range.  Depending
	 * on how contiguous physical memory was and which page sizes the
	 * IOMMU chose, such arbitrary unmaps may or may not have worked;
	 * only unmaps matching the original mapping granularity were
	 * guaranteed.  A range that could not be split simply reported a
	 * zero-sized unmap, and unmapping a hole where nothing was mapped
	 * reported success without doing anything.
	 *
	 * The v2 interface tightens this: an unmap must cover only complete
	 * vfio_dma structures.  The range may span multiple mappings and may
	 * include unmapped gaps, but it may not bisect an existing mapping,
	 * otherwise the ioctl fails with -EINVAL.  The checks below enforce
	 * these bounds before any mapping is torn down.
	 */
1140 if (iommu->v2) {
1141 dma = vfio_find_dma(iommu, unmap->iova, 1);
1142 if (dma && dma->iova != unmap->iova) {
1143 ret = -EINVAL;
1144 goto unlock;
1145 }
1146 dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
1147 if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
1148 ret = -EINVAL;
1149 goto unlock;
1150 }
1151 }
1152
1153 while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
1154 if (!iommu->v2 && unmap->iova > dma->iova)
1155 break;
1156
		/*
		 * Only a task sharing the mm that created the mapping is
		 * allowed to unmap it.
		 */
1160 if (dma->task->mm != current->mm)
1161 break;
1162
1163 if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1164 struct vfio_iommu_type1_dma_unmap nb_unmap;
1165
1166 if (dma_last == dma) {
1167 BUG_ON(++retries > 10);
1168 } else {
1169 dma_last = dma;
1170 retries = 0;
1171 }
1172
1173 nb_unmap.iova = dma->iova;
1174 nb_unmap.size = dma->size;
1175
			/*
			 * Notify anyone (mdev vendor drivers) to invalidate
			 * and unmap iovas within the range we're about to
			 * unmap.  Vendor drivers MUST unpin pages in response
			 * to an invalidation.
			 */
1182 mutex_unlock(&iommu->lock);
1183 blocking_notifier_call_chain(&iommu->notifier,
1184 VFIO_IOMMU_NOTIFY_DMA_UNMAP,
1185 &nb_unmap);
1186 mutex_lock(&iommu->lock);
1187 goto again;
1188 }
1189
1190 if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1191 ret = update_user_bitmap(bitmap->data, iommu, dma,
1192 unmap->iova, pgsize);
1193 if (ret)
1194 break;
1195 }
1196
1197 unmapped += dma->size;
1198 vfio_remove_dma(iommu, dma);
1199 }
1200
1201unlock:
1202 mutex_unlock(&iommu->lock);
1203
1204
1205 unmap->size = unmapped;
1206
1207 return ret;
1208}
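
/*
 * Illustrative userspace sketch (assumes the UAPI in <linux/vfio.h> and that
 * dirty tracking was started with VFIO_IOMMU_DIRTY_PAGES beforehand):
 * unmapping a range while collecting its dirty bitmap.
 *
 *	struct vfio_iommu_type1_dma_unmap *unmap;
 *	struct vfio_bitmap *bitmap;
 *
 *	unmap = calloc(1, sizeof(*unmap) + sizeof(*bitmap));
 *	unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
 *	unmap->flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
 *	unmap->iova = iova;
 *	unmap->size = size;
 *	bitmap = (struct vfio_bitmap *)&unmap->data;
 *	bitmap->pgsize = 4096;				// min IOMMU page size
 *	bitmap->size = ((size / 4096 + 63) / 64) * 8;	// bytes, u64 aligned
 *	bitmap->data = dirty;				// __u64 * of that size
 *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, unmap);
 *
 * container_fd, iova, size and dirty are placeholders supplied by the caller.
 */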
1209
1210static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1211 unsigned long pfn, long npage, int prot)
1212{
1213 struct vfio_domain *d;
1214 int ret;
1215
1216 list_for_each_entry(d, &iommu->domain_list, next) {
1217 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1218 npage << PAGE_SHIFT, prot | d->prot);
1219 if (ret)
1220 goto unwind;
1221
1222 cond_resched();
1223 }
1224
1225 return 0;
1226
1227unwind:
1228 list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1229 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1230 cond_resched();
1231 }
1232
1233 return ret;
1234}
1235
1236static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1237 size_t map_size)
1238{
1239 dma_addr_t iova = dma->iova;
1240 unsigned long vaddr = dma->vaddr;
1241 size_t size = map_size;
1242 long npage;
1243 unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1244 int ret = 0;
1245
1246 while (size) {
1247
1248 npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1249 size >> PAGE_SHIFT, &pfn, limit);
1250 if (npage <= 0) {
1251 WARN_ON(!npage);
1252 ret = (int)npage;
1253 break;
1254 }
1255
1256
1257 ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1258 dma->prot);
1259 if (ret) {
1260 vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1261 npage, true);
1262 break;
1263 }
1264
1265 size -= npage << PAGE_SHIFT;
1266 dma->size += npage << PAGE_SHIFT;
1267 }
1268
1269 dma->iommu_mapped = true;
1270
1271 if (ret)
1272 vfio_remove_dma(iommu, dma);
1273
1274 return ret;
1275}
1276
/*
 * Check dma map request is within a valid iova range
 */
1280static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1281 dma_addr_t start, dma_addr_t end)
1282{
1283 struct list_head *iova = &iommu->iova_list;
1284 struct vfio_iova *node;
1285
1286 list_for_each_entry(node, iova, list) {
1287 if (start >= node->start && end <= node->end)
1288 return true;
1289 }
1290
	/*
	 * Check for list_empty() as well since a container with
	 * a single mdev device will have an empty list.
	 */
1295 return list_empty(iova);
1296}
1297
1298static int vfio_dma_do_map(struct vfio_iommu *iommu,
1299 struct vfio_iommu_type1_dma_map *map)
1300{
1301 dma_addr_t iova = map->iova;
1302 unsigned long vaddr = map->vaddr;
1303 size_t size = map->size;
1304 int ret = 0, prot = 0;
1305 size_t pgsize;
1306 struct vfio_dma *dma;
1307
1308
1309 if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1310 return -EINVAL;
1311
1312
1313 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1314 prot |= IOMMU_WRITE;
1315 if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1316 prot |= IOMMU_READ;
1317
1318 mutex_lock(&iommu->lock);
1319
1320 pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1321
1322 WARN_ON((pgsize - 1) & PAGE_MASK);
1323
1324 if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) {
1325 ret = -EINVAL;
1326 goto out_unlock;
1327 }
1328
1329
1330 if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1331 ret = -EINVAL;
1332 goto out_unlock;
1333 }
1334
1335 if (vfio_find_dma(iommu, iova, size)) {
1336 ret = -EEXIST;
1337 goto out_unlock;
1338 }
1339
1340 if (!iommu->dma_avail) {
1341 ret = -ENOSPC;
1342 goto out_unlock;
1343 }
1344
1345 if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1346 ret = -EINVAL;
1347 goto out_unlock;
1348 }
1349
1350 dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1351 if (!dma) {
1352 ret = -ENOMEM;
1353 goto out_unlock;
1354 }
1355
1356 iommu->dma_avail--;
1357 dma->iova = iova;
1358 dma->vaddr = vaddr;
1359 dma->prot = prot;
1360
	/*
	 * We need to be able to both add to a task's locked memory and test
	 * against the locked memory limit, and we need to do both outside of
	 * this call path because pinning can be asynchronous via the external
	 * mdev interfaces.  RLIMIT_MEMLOCK needs a task_struct and locked_vm
	 * needs an mm_struct, but holding an indefinite mm reference is not
	 * recommended, so we only keep a reference to a task.  Using current
	 * directly is unsafe because QEMU reaches this path through vCPU
	 * threads that can exit, leaving a NULL mm behind; the group_leader
	 * is used instead since threads of the same group share the mm.
	 *
	 * CAP_IPC_LOCK is also recorded at map time: has_capability() uses
	 * real_cred, a copy-on-write field, so it cannot reliably be
	 * re-evaluated against group_leader later, and a task that drops the
	 * capability after mapping should not retain its effect anyway.
	 */
1386 get_task_struct(current->group_leader);
1387 dma->task = current->group_leader;
1388 dma->lock_cap = capable(CAP_IPC_LOCK);
1389
1390 dma->pfn_list = RB_ROOT;
1391
1392
1393 vfio_link_dma(iommu, dma);
1394
1395
1396 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1397 dma->size = size;
1398 else
1399 ret = vfio_pin_map_dma(iommu, dma, size);
1400
1401 if (!ret && iommu->dirty_page_tracking) {
1402 ret = vfio_dma_bitmap_alloc(dma, pgsize);
1403 if (ret)
1404 vfio_remove_dma(iommu, dma);
1405 }
1406
1407out_unlock:
1408 mutex_unlock(&iommu->lock);
1409 return ret;
1410}
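
/*
 * Illustrative userspace sketch (assumes <linux/vfio.h>): mapping 16MB of
 * anonymous memory at IOVA 0 through this path.
 *
 *	void *buf = mmap(NULL, 16 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova = 0,
 *		.size = 16 << 20,
 *	};
 *
 *	if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map))
 *		err(1, "VFIO_IOMMU_MAP_DMA");
 *
 * vaddr, iova and size must be aligned to the minimum IOMMU page size
 * reported in iova_pgsizes, must not wrap, must fall inside a valid iova
 * region and must not overlap an existing mapping, exactly as checked above.
 * container_fd is a placeholder for an fd set up with VFIO_SET_IOMMU.
 */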
1411
1412static int vfio_bus_type(struct device *dev, void *data)
1413{
1414 struct bus_type **bus = data;
1415
1416 if (*bus && *bus != dev->bus)
1417 return -EINVAL;
1418
1419 *bus = dev->bus;
1420
1421 return 0;
1422}
1423
1424static int vfio_iommu_replay(struct vfio_iommu *iommu,
1425 struct vfio_domain *domain)
1426{
1427 struct vfio_domain *d = NULL;
1428 struct rb_node *n;
1429 unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1430 int ret;
1431
1432
1433 if (!list_empty(&iommu->domain_list))
1434 d = list_first_entry(&iommu->domain_list,
1435 struct vfio_domain, next);
1436
1437 n = rb_first(&iommu->dma_list);
1438
1439 for (; n; n = rb_next(n)) {
1440 struct vfio_dma *dma;
1441 dma_addr_t iova;
1442
1443 dma = rb_entry(n, struct vfio_dma, node);
1444 iova = dma->iova;
1445
1446 while (iova < dma->iova + dma->size) {
1447 phys_addr_t phys;
1448 size_t size;
1449
1450 if (dma->iommu_mapped) {
1451 phys_addr_t p;
1452 dma_addr_t i;
1453
1454 if (WARN_ON(!d)) {
1455 ret = -EINVAL;
1456 goto unwind;
1457 }
1458
1459 phys = iommu_iova_to_phys(d->domain, iova);
1460
1461 if (WARN_ON(!phys)) {
1462 iova += PAGE_SIZE;
1463 continue;
1464 }
1465
1466 size = PAGE_SIZE;
1467 p = phys + size;
1468 i = iova + size;
1469 while (i < dma->iova + dma->size &&
1470 p == iommu_iova_to_phys(d->domain, i)) {
1471 size += PAGE_SIZE;
1472 p += PAGE_SIZE;
1473 i += PAGE_SIZE;
1474 }
1475 } else {
1476 unsigned long pfn;
1477 unsigned long vaddr = dma->vaddr +
1478 (iova - dma->iova);
1479 size_t n = dma->iova + dma->size - iova;
1480 long npage;
1481
1482 npage = vfio_pin_pages_remote(dma, vaddr,
1483 n >> PAGE_SHIFT,
1484 &pfn, limit);
1485 if (npage <= 0) {
1486 WARN_ON(!npage);
1487 ret = (int)npage;
1488 goto unwind;
1489 }
1490
1491 phys = pfn << PAGE_SHIFT;
1492 size = npage << PAGE_SHIFT;
1493 }
1494
1495 ret = iommu_map(domain->domain, iova, phys,
1496 size, dma->prot | domain->prot);
1497 if (ret) {
1498 if (!dma->iommu_mapped)
1499 vfio_unpin_pages_remote(dma, iova,
1500 phys >> PAGE_SHIFT,
1501 size >> PAGE_SHIFT,
1502 true);
1503 goto unwind;
1504 }
1505
1506 iova += size;
1507 }
1508 }
1509
1510
1511 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1512 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1513
1514 dma->iommu_mapped = true;
1515 }
1516
1517 return 0;
1518
1519unwind:
1520 for (; n; n = rb_prev(n)) {
1521 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1522 dma_addr_t iova;
1523
1524 if (dma->iommu_mapped) {
1525 iommu_unmap(domain->domain, dma->iova, dma->size);
1526 continue;
1527 }
1528
1529 iova = dma->iova;
1530 while (iova < dma->iova + dma->size) {
1531 phys_addr_t phys, p;
1532 size_t size;
1533 dma_addr_t i;
1534
1535 phys = iommu_iova_to_phys(domain->domain, iova);
1536 if (!phys) {
1537 iova += PAGE_SIZE;
1538 continue;
1539 }
1540
1541 size = PAGE_SIZE;
1542 p = phys + size;
1543 i = iova + size;
1544 while (i < dma->iova + dma->size &&
1545 p == iommu_iova_to_phys(domain->domain, i)) {
1546 size += PAGE_SIZE;
1547 p += PAGE_SIZE;
1548 i += PAGE_SIZE;
1549 }
1550
1551 iommu_unmap(domain->domain, iova, size);
1552 vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1553 size >> PAGE_SHIFT, true);
1554 }
1555 }
1556
1557 return ret;
1558}
1559
/*
 * We change our unmap behavior slightly depending on whether the IOMMU
 * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
 * for practically any contiguous power-of-two mapping we give it.  This means
 * we don't need to look for contiguous chunks ourselves to make unmapping
 * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
 * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
 * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
 * hugetlbfs is in use.
 */
1570static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1571{
1572 struct page *pages;
1573 int ret, order = get_order(PAGE_SIZE * 2);
1574
1575 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1576 if (!pages)
1577 return;
1578
1579 ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1580 IOMMU_READ | IOMMU_WRITE | domain->prot);
1581 if (!ret) {
1582 size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1583
1584 if (unmapped == PAGE_SIZE)
1585 iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1586 else
1587 domain->fgsp = true;
1588 }
1589
1590 __free_pages(pages, order);
1591}
1592
1593static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
1594 struct iommu_group *iommu_group)
1595{
1596 struct vfio_group *g;
1597
1598 list_for_each_entry(g, &domain->group_list, next) {
1599 if (g->iommu_group == iommu_group)
1600 return g;
1601 }
1602
1603 return NULL;
1604}
1605
1606static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1607 struct iommu_group *iommu_group)
1608{
1609 struct vfio_domain *domain;
1610 struct vfio_group *group = NULL;
1611
1612 list_for_each_entry(domain, &iommu->domain_list, next) {
1613 group = find_iommu_group(domain, iommu_group);
1614 if (group)
1615 return group;
1616 }
1617
1618 if (iommu->external_domain)
1619 group = find_iommu_group(iommu->external_domain, iommu_group);
1620
1621 return group;
1622}
1623
1624static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
1625{
1626 struct vfio_domain *domain;
1627 struct vfio_group *group;
1628
1629 list_for_each_entry(domain, &iommu->domain_list, next) {
1630 list_for_each_entry(group, &domain->group_list, next) {
1631 if (!group->pinned_page_dirty_scope) {
1632 iommu->pinned_page_dirty_scope = false;
1633 return;
1634 }
1635 }
1636 }
1637
1638 if (iommu->external_domain) {
1639 domain = iommu->external_domain;
1640 list_for_each_entry(group, &domain->group_list, next) {
1641 if (!group->pinned_page_dirty_scope) {
1642 iommu->pinned_page_dirty_scope = false;
1643 return;
1644 }
1645 }
1646 }
1647
1648 iommu->pinned_page_dirty_scope = true;
1649}
1650
1651static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1652 phys_addr_t *base)
1653{
1654 struct iommu_resv_region *region;
1655 bool ret = false;
1656
1657 list_for_each_entry(region, group_resv_regions, list) {
		/*
		 * The presence of any 'real' MSI regions should take
		 * precedence over the software-managed one if the
		 * IOMMU driver happens to advertise both types.
		 */
1663 if (region->type == IOMMU_RESV_MSI) {
1664 ret = false;
1665 break;
1666 }
1667
1668 if (region->type == IOMMU_RESV_SW_MSI) {
1669 *base = region->start;
1670 ret = true;
1671 }
1672 }
1673
1674 return ret;
1675}
1676
1677static struct device *vfio_mdev_get_iommu_device(struct device *dev)
1678{
1679 struct device *(*fn)(struct device *dev);
1680 struct device *iommu_device;
1681
1682 fn = symbol_get(mdev_get_iommu_device);
1683 if (fn) {
1684 iommu_device = fn(dev);
1685 symbol_put(mdev_get_iommu_device);
1686
1687 return iommu_device;
1688 }
1689
1690 return NULL;
1691}
1692
1693static int vfio_mdev_attach_domain(struct device *dev, void *data)
1694{
1695 struct iommu_domain *domain = data;
1696 struct device *iommu_device;
1697
1698 iommu_device = vfio_mdev_get_iommu_device(dev);
1699 if (iommu_device) {
1700 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1701 return iommu_aux_attach_device(domain, iommu_device);
1702 else
1703 return iommu_attach_device(domain, iommu_device);
1704 }
1705
1706 return -EINVAL;
1707}
1708
1709static int vfio_mdev_detach_domain(struct device *dev, void *data)
1710{
1711 struct iommu_domain *domain = data;
1712 struct device *iommu_device;
1713
1714 iommu_device = vfio_mdev_get_iommu_device(dev);
1715 if (iommu_device) {
1716 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1717 iommu_aux_detach_device(domain, iommu_device);
1718 else
1719 iommu_detach_device(domain, iommu_device);
1720 }
1721
1722 return 0;
1723}
1724
1725static int vfio_iommu_attach_group(struct vfio_domain *domain,
1726 struct vfio_group *group)
1727{
1728 if (group->mdev_group)
1729 return iommu_group_for_each_dev(group->iommu_group,
1730 domain->domain,
1731 vfio_mdev_attach_domain);
1732 else
1733 return iommu_attach_group(domain->domain, group->iommu_group);
1734}
1735
1736static void vfio_iommu_detach_group(struct vfio_domain *domain,
1737 struct vfio_group *group)
1738{
1739 if (group->mdev_group)
1740 iommu_group_for_each_dev(group->iommu_group, domain->domain,
1741 vfio_mdev_detach_domain);
1742 else
1743 iommu_detach_group(domain->domain, group->iommu_group);
1744}
1745
1746static bool vfio_bus_is_mdev(struct bus_type *bus)
1747{
1748 struct bus_type *mdev_bus;
1749 bool ret = false;
1750
1751 mdev_bus = symbol_get(mdev_bus_type);
1752 if (mdev_bus) {
1753 ret = (bus == mdev_bus);
1754 symbol_put(mdev_bus_type);
1755 }
1756
1757 return ret;
1758}
1759
1760static int vfio_mdev_iommu_device(struct device *dev, void *data)
1761{
1762 struct device **old = data, *new;
1763
1764 new = vfio_mdev_get_iommu_device(dev);
1765 if (!new || (*old && *old != new))
1766 return -EINVAL;
1767
1768 *old = new;
1769
1770 return 0;
1771}
1772
/*
 * This is a helper function to insert an address range to iova list.
 * The list is initially created with a single entry corresponding to
 * the IOMMU domain geometry to which the device group is attached.
 * The list aperture gets modified when a new domain is added to the
 * container if the new aperture doesn't conflict with the current one
 * or with any existing dma mappings. The list is also modified to
 * exclude any reserved regions associated with the device group.
 */
1782static int vfio_iommu_iova_insert(struct list_head *head,
1783 dma_addr_t start, dma_addr_t end)
1784{
1785 struct vfio_iova *region;
1786
1787 region = kmalloc(sizeof(*region), GFP_KERNEL);
1788 if (!region)
1789 return -ENOMEM;
1790
1791 INIT_LIST_HEAD(®ion->list);
1792 region->start = start;
1793 region->end = end;
1794
1795 list_add_tail(®ion->list, head);
1796 return 0;
1797}
1798
/*
 * Check the new iommu aperture conflicts with existing aper or with any
 * existing dma mappings.
 */
1803static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1804 dma_addr_t start, dma_addr_t end)
1805{
1806 struct vfio_iova *first, *last;
1807 struct list_head *iova = &iommu->iova_list;
1808
1809 if (list_empty(iova))
1810 return false;
1811
1812
1813 first = list_first_entry(iova, struct vfio_iova, list);
1814 last = list_last_entry(iova, struct vfio_iova, list);
1815 if (start > last->end || end < first->start)
1816 return true;
1817
1818
1819 if (start > first->start) {
1820 if (vfio_find_dma(iommu, first->start, start - first->start))
1821 return true;
1822 }
1823
1824
1825 if (end < last->end) {
1826 if (vfio_find_dma(iommu, end + 1, last->end - end))
1827 return true;
1828 }
1829
1830 return false;
1831}
1832
/*
 * Resize iommu iova aperture window. This is called only if the new
 * aperture has no conflict with existing aperture and dma mappings.
 */
1837static int vfio_iommu_aper_resize(struct list_head *iova,
1838 dma_addr_t start, dma_addr_t end)
1839{
1840 struct vfio_iova *node, *next;
1841
1842 if (list_empty(iova))
1843 return vfio_iommu_iova_insert(iova, start, end);
1844
1845
1846 list_for_each_entry_safe(node, next, iova, list) {
1847 if (start < node->start)
1848 break;
1849 if (start >= node->start && start < node->end) {
1850 node->start = start;
1851 break;
1852 }
1853
1854 list_del(&node->list);
1855 kfree(node);
1856 }
1857
1858
1859 list_for_each_entry_safe(node, next, iova, list) {
1860 if (end > node->end)
1861 continue;
1862 if (end > node->start && end <= node->end) {
1863 node->end = end;
1864 continue;
1865 }
1866
1867 list_del(&node->list);
1868 kfree(node);
1869 }
1870
1871 return 0;
1872}
1873
/*
 * Check reserved region conflicts with existing dma mappings
 */
1877static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
1878 struct list_head *resv_regions)
1879{
1880 struct iommu_resv_region *region;
1881
1882
1883 list_for_each_entry(region, resv_regions, list) {
1884 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
1885 continue;
1886
1887 if (vfio_find_dma(iommu, region->start, region->length))
1888 return true;
1889 }
1890
1891 return false;
1892}
1893
/*
 * Check iova region overlap with reserved regions and
 * exclude them from the iommu iova range
 */
1898static int vfio_iommu_resv_exclude(struct list_head *iova,
1899 struct list_head *resv_regions)
1900{
1901 struct iommu_resv_region *resv;
1902 struct vfio_iova *n, *next;
1903
1904 list_for_each_entry(resv, resv_regions, list) {
1905 phys_addr_t start, end;
1906
1907 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1908 continue;
1909
1910 start = resv->start;
1911 end = resv->start + resv->length - 1;
1912
1913 list_for_each_entry_safe(n, next, iova, list) {
1914 int ret = 0;
1915
1916
1917 if (start > n->end || end < n->start)
1918 continue;
1919
			/*
			 * If the current node only partially overlaps the
			 * reserved region, insert a new node for the piece
			 * to the left and/or the piece to the right of the
			 * region, then remove the original node.
			 */
1926 if (start > n->start)
1927 ret = vfio_iommu_iova_insert(&n->list, n->start,
1928 start - 1);
1929 if (!ret && end < n->end)
1930 ret = vfio_iommu_iova_insert(&n->list, end + 1,
1931 n->end);
1932 if (ret)
1933 return ret;
1934
1935 list_del(&n->list);
1936 kfree(n);
1937 }
1938 }
1939
1940 if (list_empty(iova))
1941 return -EINVAL;
1942
1943 return 0;
1944}
1945
1946static void vfio_iommu_resv_free(struct list_head *resv_regions)
1947{
1948 struct iommu_resv_region *n, *next;
1949
1950 list_for_each_entry_safe(n, next, resv_regions, list) {
1951 list_del(&n->list);
1952 kfree(n);
1953 }
1954}
1955
1956static void vfio_iommu_iova_free(struct list_head *iova)
1957{
1958 struct vfio_iova *n, *next;
1959
1960 list_for_each_entry_safe(n, next, iova, list) {
1961 list_del(&n->list);
1962 kfree(n);
1963 }
1964}
1965
1966static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
1967 struct list_head *iova_copy)
1968{
1969 struct list_head *iova = &iommu->iova_list;
1970 struct vfio_iova *n;
1971 int ret;
1972
1973 list_for_each_entry(n, iova, list) {
1974 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
1975 if (ret)
1976 goto out_free;
1977 }
1978
1979 return 0;
1980
1981out_free:
1982 vfio_iommu_iova_free(iova_copy);
1983 return ret;
1984}
1985
1986static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
1987 struct list_head *iova_copy)
1988{
1989 struct list_head *iova = &iommu->iova_list;
1990
1991 vfio_iommu_iova_free(iova);
1992
1993 list_splice_tail(iova_copy, iova);
1994}
1995static int vfio_iommu_type1_attach_group(void *iommu_data,
1996 struct iommu_group *iommu_group)
1997{
1998 struct vfio_iommu *iommu = iommu_data;
1999 struct vfio_group *group;
2000 struct vfio_domain *domain, *d;
2001 struct bus_type *bus = NULL;
2002 int ret;
2003 bool resv_msi, msi_remap;
2004 phys_addr_t resv_msi_base = 0;
2005 struct iommu_domain_geometry geo;
2006 LIST_HEAD(iova_copy);
2007 LIST_HEAD(group_resv_regions);
2008
2009 mutex_lock(&iommu->lock);
2010
2011 list_for_each_entry(d, &iommu->domain_list, next) {
2012 if (find_iommu_group(d, iommu_group)) {
2013 mutex_unlock(&iommu->lock);
2014 return -EINVAL;
2015 }
2016 }
2017
2018 if (iommu->external_domain) {
2019 if (find_iommu_group(iommu->external_domain, iommu_group)) {
2020 mutex_unlock(&iommu->lock);
2021 return -EINVAL;
2022 }
2023 }
2024
2025 group = kzalloc(sizeof(*group), GFP_KERNEL);
2026 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2027 if (!group || !domain) {
2028 ret = -ENOMEM;
2029 goto out_free;
2030 }
2031
2032 group->iommu_group = iommu_group;
2033
2034
2035 ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
2036 if (ret)
2037 goto out_free;
2038
2039 if (vfio_bus_is_mdev(bus)) {
2040 struct device *iommu_device = NULL;
2041
2042 group->mdev_group = true;
2043
2044
2045 ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
2046 vfio_mdev_iommu_device);
2047 if (ret || !iommu_device) {
2048 if (!iommu->external_domain) {
2049 INIT_LIST_HEAD(&domain->group_list);
2050 iommu->external_domain = domain;
2051 vfio_update_pgsize_bitmap(iommu);
2052 } else {
2053 kfree(domain);
2054 }
2055
2056 list_add(&group->next,
2057 &iommu->external_domain->group_list);
2058
			/*
			 * Non-iommu backed group cannot dirty memory
			 * directly, it can only use interfaces that provide
			 * dirty tracking.
			 * The iommu scope can only be promoted with the
			 * addition of a dirty tracking group.
			 */
2065 group->pinned_page_dirty_scope = true;
2066 if (!iommu->pinned_page_dirty_scope)
2067 update_pinned_page_dirty_scope(iommu);
2068 mutex_unlock(&iommu->lock);
2069
2070 return 0;
2071 }
2072
2073 bus = iommu_device->bus;
2074 }
2075
2076 domain->domain = iommu_domain_alloc(bus);
2077 if (!domain->domain) {
2078 ret = -EIO;
2079 goto out_free;
2080 }
2081
2082 if (iommu->nesting) {
2083 int attr = 1;
2084
2085 ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
2086 &attr);
2087 if (ret)
2088 goto out_domain;
2089 }
2090
2091 ret = vfio_iommu_attach_group(domain, group);
2092 if (ret)
2093 goto out_domain;
2094
2095
2096 iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
2097
2098 if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
2099 geo.aperture_end)) {
2100 ret = -EINVAL;
2101 goto out_detach;
2102 }
2103
2104 ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2105 if (ret)
2106 goto out_detach;
2107
2108 if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2109 ret = -EINVAL;
2110 goto out_detach;
2111 }
2112
	/*
	 * We don't want to work on the original iova list as the list
	 * gets modified and in case of failure we have to retain the
	 * original list. Get a copy here.
	 */
2118 ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2119 if (ret)
2120 goto out_detach;
2121
2122 ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
2123 geo.aperture_end);
2124 if (ret)
2125 goto out_detach;
2126
2127 ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2128 if (ret)
2129 goto out_detach;
2130
2131 resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2132
2133 INIT_LIST_HEAD(&domain->group_list);
2134 list_add(&group->next, &domain->group_list);
2135
2136 msi_remap = irq_domain_check_msi_remap() ||
2137 iommu_capable(bus, IOMMU_CAP_INTR_REMAP);
2138
2139 if (!allow_unsafe_interrupts && !msi_remap) {
2140 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2141 __func__);
2142 ret = -EPERM;
2143 goto out_detach;
2144 }
2145
2146 if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
2147 domain->prot |= IOMMU_CACHE;
2148
	/*
	 * Try to match an existing compatible domain.  We don't want to
	 * preclude an IOMMU driver supporting multiple bus_types and being
	 * able to include different bus_types in the same IOMMU domain, so
	 * we test whether the domains use the same iommu_ops rather than
	 * testing if they're on the same bus_type.
	 */
2156 list_for_each_entry(d, &iommu->domain_list, next) {
2157 if (d->domain->ops == domain->domain->ops &&
2158 d->prot == domain->prot) {
2159 vfio_iommu_detach_group(domain, group);
2160 if (!vfio_iommu_attach_group(d, group)) {
2161 list_add(&group->next, &d->group_list);
2162 iommu_domain_free(domain->domain);
2163 kfree(domain);
2164 goto done;
2165 }
2166
2167 ret = vfio_iommu_attach_group(domain, group);
2168 if (ret)
2169 goto out_domain;
2170 }
2171 }
2172
2173 vfio_test_domain_fgsp(domain);
2174
2175
2176 ret = vfio_iommu_replay(iommu, domain);
2177 if (ret)
2178 goto out_detach;
2179
2180 if (resv_msi) {
2181 ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2182 if (ret && ret != -ENODEV)
2183 goto out_detach;
2184 }
2185
2186 list_add(&domain->next, &iommu->domain_list);
2187 vfio_update_pgsize_bitmap(iommu);
2188done:
2189
2190 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2191
	/*
	 * An iommu backed group can dirty memory directly and therefore
	 * demotes the iommu scope until it declares itself dirty tracking
	 * capable via the page pinning interface.
	 */
2197 iommu->pinned_page_dirty_scope = false;
2198 mutex_unlock(&iommu->lock);
2199 vfio_iommu_resv_free(&group_resv_regions);
2200
2201 return 0;
2202
2203out_detach:
2204 vfio_iommu_detach_group(domain, group);
2205out_domain:
2206 iommu_domain_free(domain->domain);
2207 vfio_iommu_iova_free(&iova_copy);
2208 vfio_iommu_resv_free(&group_resv_regions);
2209out_free:
2210 kfree(domain);
2211 kfree(group);
2212 mutex_unlock(&iommu->lock);
2213 return ret;
2214}
2215
2216static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2217{
2218 struct rb_node *node;
2219
2220 while ((node = rb_first(&iommu->dma_list)))
2221 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2222}
2223
2224static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2225{
2226 struct rb_node *n, *p;
2227
2228 n = rb_first(&iommu->dma_list);
2229 for (; n; n = rb_next(n)) {
2230 struct vfio_dma *dma;
2231 long locked = 0, unlocked = 0;
2232
2233 dma = rb_entry(n, struct vfio_dma, node);
2234 unlocked += vfio_unmap_unpin(iommu, dma, false);
2235 p = rb_first(&dma->pfn_list);
2236 for (; p; p = rb_next(p)) {
2237 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2238 node);
2239
2240 if (!is_invalid_reserved_pfn(vpfn->pfn))
2241 locked++;
2242 }
2243 vfio_lock_acct(dma, locked - unlocked, true);
2244 }
2245}
2246
2247static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
2248{
2249 struct rb_node *n;
2250
2251 n = rb_first(&iommu->dma_list);
2252 for (; n; n = rb_next(n)) {
2253 struct vfio_dma *dma;
2254
2255 dma = rb_entry(n, struct vfio_dma, node);
2256
2257 if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
2258 break;
2259 }
2260
2261 WARN_ON(iommu->notifier.head);
2262}
2263
/*
 * Called when a domain is removed in detach. It is possible that
 * the removed domain decided the iova aperture window. Modify the
 * iova aperture with the smallest window among existing domains.
 */
2269static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2270 struct list_head *iova_copy)
2271{
2272 struct vfio_domain *domain;
2273 struct iommu_domain_geometry geo;
2274 struct vfio_iova *node;
2275 dma_addr_t start = 0;
2276 dma_addr_t end = (dma_addr_t)~0;
2277
2278 if (list_empty(iova_copy))
2279 return;
2280
2281 list_for_each_entry(domain, &iommu->domain_list, next) {
2282 iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
2283 &geo);
2284 if (geo.aperture_start > start)
2285 start = geo.aperture_start;
2286 if (geo.aperture_end < end)
2287 end = geo.aperture_end;
2288 }
2289
2290
2291 node = list_first_entry(iova_copy, struct vfio_iova, list);
2292 node->start = start;
2293 node = list_last_entry(iova_copy, struct vfio_iova, list);
2294 node->end = end;
2295}
2296
/*
 * Called when a group is detached. The reserved regions for that
 * group can be part of valid iova now. But since reserved regions
 * may be duplicated among groups, populate the iova valid regions
 * list again.
 */
2303static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2304 struct list_head *iova_copy)
2305{
2306 struct vfio_domain *d;
2307 struct vfio_group *g;
2308 struct vfio_iova *node;
2309 dma_addr_t start, end;
2310 LIST_HEAD(resv_regions);
2311 int ret;
2312
2313 if (list_empty(iova_copy))
2314 return -EINVAL;
2315
2316 list_for_each_entry(d, &iommu->domain_list, next) {
2317 list_for_each_entry(g, &d->group_list, next) {
2318 ret = iommu_get_group_resv_regions(g->iommu_group,
2319 &resv_regions);
2320 if (ret)
2321 goto done;
2322 }
2323 }
2324
2325 node = list_first_entry(iova_copy, struct vfio_iova, list);
2326 start = node->start;
2327 node = list_last_entry(iova_copy, struct vfio_iova, list);
2328 end = node->end;
2329
2330
2331 vfio_iommu_iova_free(iova_copy);
2332
2333 ret = vfio_iommu_aper_resize(iova_copy, start, end);
2334 if (ret)
2335 goto done;
2336
2337
2338 ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2339done:
2340 vfio_iommu_resv_free(&resv_regions);
2341 return ret;
2342}
2343
2344static void vfio_iommu_type1_detach_group(void *iommu_data,
2345 struct iommu_group *iommu_group)
2346{
2347 struct vfio_iommu *iommu = iommu_data;
2348 struct vfio_domain *domain;
2349 struct vfio_group *group;
2350 bool update_dirty_scope = false;
2351 LIST_HEAD(iova_copy);
2352
2353 mutex_lock(&iommu->lock);
2354
2355 if (iommu->external_domain) {
2356 group = find_iommu_group(iommu->external_domain, iommu_group);
2357 if (group) {
2358 update_dirty_scope = !group->pinned_page_dirty_scope;
2359 list_del(&group->next);
2360 kfree(group);
2361
2362 if (list_empty(&iommu->external_domain->group_list)) {
2363 vfio_sanity_check_pfn_list(iommu);
2364
2365 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
2366 vfio_iommu_unmap_unpin_all(iommu);
2367
2368 kfree(iommu->external_domain);
2369 iommu->external_domain = NULL;
2370 }
2371 goto detach_group_done;
2372 }
2373 }
2374
	/*
	 * Get a copy of iova list. This will be used to update
	 * and to replace the current one later. Please note that
	 * we will leave the original list as it is if update fails.
	 */
2380 vfio_iommu_iova_get_copy(iommu, &iova_copy);
2381
2382 list_for_each_entry(domain, &iommu->domain_list, next) {
2383 group = find_iommu_group(domain, iommu_group);
2384 if (!group)
2385 continue;
2386
2387 vfio_iommu_detach_group(domain, group);
2388 update_dirty_scope = !group->pinned_page_dirty_scope;
2389 list_del(&group->next);
2390 kfree(group);
2391
		/*
		 * Group ownership provides privilege, if the group list is
		 * empty, the domain goes away. If it's the last domain with
		 * iommu and external domain doesn't exist, then all the
		 * mappings go away too. If it's the last domain with iommu
		 * and external domain exists, update accounting.
		 */
2398 if (list_empty(&domain->group_list)) {
2399 if (list_is_singular(&iommu->domain_list)) {
2400 if (!iommu->external_domain)
2401 vfio_iommu_unmap_unpin_all(iommu);
2402 else
2403 vfio_iommu_unmap_unpin_reaccount(iommu);
2404 }
2405 iommu_domain_free(domain->domain);
2406 list_del(&domain->next);
2407 kfree(domain);
2408 vfio_iommu_aper_expand(iommu, &iova_copy);
2409 vfio_update_pgsize_bitmap(iommu);
2410 }
2411 break;
2412 }
2413
2414 if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2415 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2416 else
2417 vfio_iommu_iova_free(&iova_copy);
2418
2419detach_group_done:
	/*
	 * Removal of a group without dirty tracking may allow the iommu
	 * scope to be promoted.
	 */
2424 if (update_dirty_scope)
2425 update_pinned_page_dirty_scope(iommu);
2426 mutex_unlock(&iommu->lock);
2427}
2428
2429static void *vfio_iommu_type1_open(unsigned long arg)
2430{
2431 struct vfio_iommu *iommu;
2432
2433 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2434 if (!iommu)
2435 return ERR_PTR(-ENOMEM);
2436
2437 switch (arg) {
2438 case VFIO_TYPE1_IOMMU:
2439 break;
2440 case VFIO_TYPE1_NESTING_IOMMU:
2441 iommu->nesting = true;
2442 fallthrough;
2443 case VFIO_TYPE1v2_IOMMU:
2444 iommu->v2 = true;
2445 break;
2446 default:
2447 kfree(iommu);
2448 return ERR_PTR(-EINVAL);
2449 }
2450
2451 INIT_LIST_HEAD(&iommu->domain_list);
2452 INIT_LIST_HEAD(&iommu->iova_list);
2453 iommu->dma_list = RB_ROOT;
2454 iommu->dma_avail = dma_entry_limit;
2455 mutex_init(&iommu->lock);
2456 BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2457
2458 return iommu;
2459}
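
/*
 * Illustrative userspace sketch: this open() callback runs when a container
 * is bound to the type1 backend, typically:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);	// group number varies
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *
 * The VFIO_SET_IOMMU argument is what arrives here as 'arg' and selects the
 * v1, v2 or nesting behaviour of the container.
 */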
2460
2461static void vfio_release_domain(struct vfio_domain *domain, bool external)
2462{
2463 struct vfio_group *group, *group_tmp;
2464
2465 list_for_each_entry_safe(group, group_tmp,
2466 &domain->group_list, next) {
2467 if (!external)
2468 vfio_iommu_detach_group(domain, group);
2469 list_del(&group->next);
2470 kfree(group);
2471 }
2472
2473 if (!external)
2474 iommu_domain_free(domain->domain);
2475}
2476
2477static void vfio_iommu_type1_release(void *iommu_data)
2478{
2479 struct vfio_iommu *iommu = iommu_data;
2480 struct vfio_domain *domain, *domain_tmp;
2481
2482 if (iommu->external_domain) {
2483 vfio_release_domain(iommu->external_domain, true);
2484 vfio_sanity_check_pfn_list(iommu);
2485 kfree(iommu->external_domain);
2486 }
2487
2488 vfio_iommu_unmap_unpin_all(iommu);
2489
2490 list_for_each_entry_safe(domain, domain_tmp,
2491 &iommu->domain_list, next) {
2492 vfio_release_domain(domain, false);
2493 list_del(&domain->next);
2494 kfree(domain);
2495 }
2496
2497 vfio_iommu_iova_free(&iommu->iova_list);
2498
2499 kfree(iommu);
2500}
2501
2502static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2503{
2504 struct vfio_domain *domain;
2505 int ret = 1;
2506
2507 mutex_lock(&iommu->lock);
2508 list_for_each_entry(domain, &iommu->domain_list, next) {
2509 if (!(domain->prot & IOMMU_CACHE)) {
2510 ret = 0;
2511 break;
2512 }
2513 }
2514 mutex_unlock(&iommu->lock);
2515
2516 return ret;
2517}
2518
2519static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2520 unsigned long arg)
2521{
2522 switch (arg) {
2523 case VFIO_TYPE1_IOMMU:
2524 case VFIO_TYPE1v2_IOMMU:
2525 case VFIO_TYPE1_NESTING_IOMMU:
2526 return 1;
2527 case VFIO_DMA_CC_IOMMU:
2528 if (!iommu)
2529 return 0;
2530 return vfio_domains_have_iommu_cache(iommu);
2531 default:
2532 return 0;
2533 }
2534}
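
/*
 * Illustrative usage: VFIO_CHECK_EXTENSION is issued by userspace on the
 * container fd, e.g.:
 *
 *	if (!ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
 *		errx(1, "kernel lacks Type1 v2 IOMMU support");
 *
 * VFIO_DMA_CC_IOMMU only reports 1 once every attached domain enforces cache
 * coherency, as computed by vfio_domains_have_iommu_cache() above.
 */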
2535
2536static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2537 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2538 size_t size)
2539{
2540 struct vfio_info_cap_header *header;
2541 struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2542
2543 header = vfio_info_cap_add(caps, size,
2544 VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2545 if (IS_ERR(header))
2546 return PTR_ERR(header);
2547
2548 iova_cap = container_of(header,
2549 struct vfio_iommu_type1_info_cap_iova_range,
2550 header);
2551 iova_cap->nr_iovas = cap_iovas->nr_iovas;
2552 memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2553 cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2554 return 0;
2555}
2556
2557static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2558 struct vfio_info_cap *caps)
2559{
2560 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2561 struct vfio_iova *iova;
2562 size_t size;
2563 int iovas = 0, i = 0, ret;
2564
2565 list_for_each_entry(iova, &iommu->iova_list, list)
2566 iovas++;
2567
2568 if (!iovas) {
2569 /*
2570  * Return 0 as a container with a single mdev device
2571  * will have an empty list.
2572  */
2573 return 0;
2574 }
2575
2576 size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
2577
2578 cap_iovas = kzalloc(size, GFP_KERNEL);
2579 if (!cap_iovas)
2580 return -ENOMEM;
2581
2582 cap_iovas->nr_iovas = iovas;
2583
2584 list_for_each_entry(iova, &iommu->iova_list, list) {
2585 cap_iovas->iova_ranges[i].start = iova->start;
2586 cap_iovas->iova_ranges[i].end = iova->end;
2587 i++;
2588 }
2589
2590 ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2591
2592 kfree(cap_iovas);
2593 return ret;
2594}
2595
2596static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2597 struct vfio_info_cap *caps)
2598{
2599 struct vfio_iommu_type1_info_cap_migration cap_mig;
2600
2601 cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2602 cap_mig.header.version = 1;
2603
2604 cap_mig.flags = 0;
2605 /* support minimum pgsize */
2606 cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2607 cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2608
2609 return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2610}
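
 /*
  * Userspace sees these values through the capability chain returned by
  * VFIO_IOMMU_GET_INFO: a vfio_info_cap_header with id
  * VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION is followed by flags, pgsize_bitmap
  * (here only the smallest supported page size) and max_dirty_bitmap_size
  * in bytes.  See the GET_INFO sketch below this handler.
  */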
2611
2612static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2613 unsigned long arg)
2614{
2615 struct vfio_iommu_type1_info info;
2616 unsigned long minsz;
2617 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2618 unsigned long capsz;
2619 int ret;
2620
2621 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2622
2623 /* For backward compatibility, cannot require this */
2624 capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2625
2626 if (copy_from_user(&info, (void __user *)arg, minsz))
2627 return -EFAULT;
2628
2629 if (info.argsz < minsz)
2630 return -EINVAL;
2631
2632 if (info.argsz >= capsz) {
2633 minsz = capsz;
2634 info.cap_offset = 0;
2635 }
2636
2637 mutex_lock(&iommu->lock);
2638 info.flags = VFIO_IOMMU_INFO_PGSIZES;
2639
2640 info.iova_pgsizes = iommu->pgsize_bitmap;
2641
2642 ret = vfio_iommu_migration_build_caps(iommu, &caps);
2643
2644 if (!ret)
2645 ret = vfio_iommu_iova_build_caps(iommu, &caps);
2646
2647 mutex_unlock(&iommu->lock);
2648
2649 if (ret)
2650 return ret;
2651
2652 if (caps.size) {
2653 info.flags |= VFIO_IOMMU_INFO_CAPS;
2654
2655 if (info.argsz < sizeof(info) + caps.size) {
2656 info.argsz = sizeof(info) + caps.size;
2657 } else {
2658 vfio_info_cap_shift(&caps, sizeof(info));
2659 if (copy_to_user((void __user *)arg +
2660 sizeof(info), caps.buf,
2661 caps.size)) {
2662 kfree(caps.buf);
2663 return -EFAULT;
2664 }
2665 info.cap_offset = sizeof(info);
2666 }
2667
2668 kfree(caps.buf);
2669 }
2670
2671 return copy_to_user((void __user *)arg, &info, minsz) ?
2672 -EFAULT : 0;
2673}
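
 /*
  * Illustrative userspace sketch (not compiled as part of this file) of the
  * two-call pattern this ioctl expects; "container" and the heap buffer are
  * placeholders.
  *
  *	struct vfio_iommu_type1_info info = { .argsz = sizeof(info) };
  *
  *	ioctl(container, VFIO_IOMMU_GET_INFO, &info);
  *	if ((info.flags & VFIO_IOMMU_INFO_CAPS) && info.argsz > sizeof(info)) {
  *		struct vfio_iommu_type1_info *big = calloc(1, info.argsz);
  *
  *		big->argsz = info.argsz;
  *		ioctl(container, VFIO_IOMMU_GET_INFO, big);
  *		// the capability chain starts at (char *)big + big->cap_offset;
  *		// each entry is led by a struct vfio_info_cap_header whose
  *		// "next" field gives the offset of the following capability.
  *	}
  */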
2674
2675static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2676 unsigned long arg)
2677{
2678 struct vfio_iommu_type1_dma_map map;
2679 unsigned long minsz;
2680 uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
2681
2682 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2683
2684 if (copy_from_user(&map, (void __user *)arg, minsz))
2685 return -EFAULT;
2686
2687 if (map.argsz < minsz || map.flags & ~mask)
2688 return -EINVAL;
2689
2690 return vfio_dma_do_map(iommu, &map);
2691}
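
 /*
  * Illustrative sketch (not compiled as part of this file) of the ioctl this
  * handler backs.  "container", "buf", the IOVA and the length are
  * placeholders; the flags and struct layout are the standard type1 UAPI.
  *
  *	struct vfio_iommu_type1_dma_map map = {
  *		.argsz = sizeof(map),
  *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
  *		.vaddr = (__u64)(uintptr_t)buf,	// page-aligned user memory
  *		.iova  = 0x100000000,		// placeholder IOVA
  *		.size  = 2 * 1024 * 1024,	// placeholder length
  *	};
  *
  *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
  */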
2692
2693static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2694 unsigned long arg)
2695{
2696 struct vfio_iommu_type1_dma_unmap unmap;
2697 struct vfio_bitmap bitmap = { 0 };
2698 unsigned long minsz;
2699 int ret;
2700
2701 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2702
2703 if (copy_from_user(&unmap, (void __user *)arg, minsz))
2704 return -EFAULT;
2705
2706 if (unmap.argsz < minsz ||
2707 unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
2708 return -EINVAL;
2709
2710 if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2711 unsigned long pgshift;
2712
2713 if (unmap.argsz < (minsz + sizeof(bitmap)))
2714 return -EINVAL;
2715
2716 if (copy_from_user(&bitmap,
2717 (void __user *)(arg + minsz),
2718 sizeof(bitmap)))
2719 return -EFAULT;
2720
2721 if (!access_ok((void __user *)bitmap.data, bitmap.size))
2722 return -EINVAL;
2723
2724 pgshift = __ffs(bitmap.pgsize);
2725 ret = verify_bitmap_size(unmap.size >> pgshift,
2726 bitmap.size);
2727 if (ret)
2728 return ret;
2729 }
2730
2731 ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2732 if (ret)
2733 return ret;
2734
2735 return copy_to_user((void __user *)arg, &unmap, minsz) ?
2736 -EFAULT : 0;
2737}
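
 /*
  * Illustrative sketch (not compiled as part of this file): unmapping with
  * the optional dirty bitmap.  "container", the IOVA/size and the bitmap
  * buffer are placeholders; in the UAPI a struct vfio_bitmap directly
  * follows the unmap header when VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is set.
  *
  *	struct {
  *		struct vfio_iommu_type1_dma_unmap unmap;
  *		struct vfio_bitmap bitmap;
  *	} req = {
  *		.unmap = {
  *			.argsz = sizeof(req),
  *			.flags = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP,
  *			.iova  = 0x100000000,
  *			.size  = 2 * 1024 * 1024,
  *		},
  *		.bitmap = {
  *			.pgsize = 4096,
  *			.size   = bitmap_bytes,		// one bit per pgsize page
  *			.data   = (__u64 *)bitmap_buf,
  *		},
  *	};
  *
  *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &req);
  */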
2738
2739static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2740 unsigned long arg)
2741{
2742 struct vfio_iommu_type1_dirty_bitmap dirty;
2743 uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2744 VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2745 VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2746 unsigned long minsz;
2747 int ret = 0;
2748
2749 if (!iommu->v2)
2750 return -EACCES;
2751
2752 minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2753
2754 if (copy_from_user(&dirty, (void __user *)arg, minsz))
2755 return -EFAULT;
2756
2757 if (dirty.argsz < minsz || dirty.flags & ~mask)
2758 return -EINVAL;
2759
2760 /* only one flag should be set at a time */
2761 if (__ffs(dirty.flags) != __fls(dirty.flags))
2762 return -EINVAL;
2763
2764 if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2765 size_t pgsize;
2766
2767 mutex_lock(&iommu->lock);
2768 pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2769 if (!iommu->dirty_page_tracking) {
2770 ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2771 if (!ret)
2772 iommu->dirty_page_tracking = true;
2773 }
2774 mutex_unlock(&iommu->lock);
2775 return ret;
2776 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2777 mutex_lock(&iommu->lock);
2778 if (iommu->dirty_page_tracking) {
2779 iommu->dirty_page_tracking = false;
2780 vfio_dma_bitmap_free_all(iommu);
2781 }
2782 mutex_unlock(&iommu->lock);
2783 return 0;
2784 } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2785 struct vfio_iommu_type1_dirty_bitmap_get range;
2786 unsigned long pgshift;
2787 size_t data_size = dirty.argsz - minsz;
2788 size_t iommu_pgsize;
2789
2790 if (!data_size || data_size < sizeof(range))
2791 return -EINVAL;
2792
2793 if (copy_from_user(&range, (void __user *)(arg + minsz),
2794 sizeof(range)))
2795 return -EFAULT;
2796
2797 if (range.iova + range.size < range.iova)
2798 return -EINVAL;
2799 if (!access_ok((void __user *)range.bitmap.data,
2800 range.bitmap.size))
2801 return -EINVAL;
2802
2803 pgshift = __ffs(range.bitmap.pgsize);
2804 ret = verify_bitmap_size(range.size >> pgshift,
2805 range.bitmap.size);
2806 if (ret)
2807 return ret;
2808
2809 mutex_lock(&iommu->lock);
2810
2811 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2812
2813 /* allow only smallest supported pgsize */
2814 if (range.bitmap.pgsize != iommu_pgsize) {
2815 ret = -EINVAL;
2816 goto out_unlock;
2817 }
2818 if (range.iova & (iommu_pgsize - 1)) {
2819 ret = -EINVAL;
2820 goto out_unlock;
2821 }
2822 if (!range.size || range.size & (iommu_pgsize - 1)) {
2823 ret = -EINVAL;
2824 goto out_unlock;
2825 }
2826
2827 if (iommu->dirty_page_tracking)
2828 ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2829 iommu, range.iova,
2830 range.size,
2831 range.bitmap.pgsize);
2832 else
2833 ret = -EINVAL;
2834out_unlock:
2835 mutex_unlock(&iommu->lock);
2836
2837 return ret;
2838 }
2839
2840 return -EINVAL;
2841}
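
 /*
  * Illustrative sketch (not compiled as part of this file) of the intended
  * call sequence, e.g. from a VM live-migration flow; fds, IOVAs and buffer
  * sizes are placeholders.
  *
  *	struct vfio_iommu_type1_dirty_bitmap ctl = {
  *		.argsz = sizeof(ctl),
  *		.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START,
  *	};
  *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &ctl);	// start tracking
  *
  *	struct {
  *		struct vfio_iommu_type1_dirty_bitmap dirty;
  *		struct vfio_iommu_type1_dirty_bitmap_get get;
  *	} req = {
  *		.dirty = {
  *			.argsz = sizeof(req),
  *			.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP,
  *		},
  *		.get = {
  *			.iova = 0x100000000,
  *			.size = 2 * 1024 * 1024,
  *			.bitmap = {
  *				.pgsize = 4096,	// must equal the smallest IOMMU pgsize
  *				.size   = bitmap_bytes,
  *				.data   = (__u64 *)bitmap_buf,
  *			},
  *		},
  *	};
  *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &req);	// read the bitmap
  *
  *	ctl.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;
  *	ioctl(container, VFIO_IOMMU_DIRTY_PAGES, &ctl);	// stop tracking
  */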
2842
2843static long vfio_iommu_type1_ioctl(void *iommu_data,
2844 unsigned int cmd, unsigned long arg)
2845{
2846 struct vfio_iommu *iommu = iommu_data;
2847
2848 switch (cmd) {
2849 case VFIO_CHECK_EXTENSION:
2850 return vfio_iommu_type1_check_extension(iommu, arg);
2851 case VFIO_IOMMU_GET_INFO:
2852 return vfio_iommu_type1_get_info(iommu, arg);
2853 case VFIO_IOMMU_MAP_DMA:
2854 return vfio_iommu_type1_map_dma(iommu, arg);
2855 case VFIO_IOMMU_UNMAP_DMA:
2856 return vfio_iommu_type1_unmap_dma(iommu, arg);
2857 case VFIO_IOMMU_DIRTY_PAGES:
2858 return vfio_iommu_type1_dirty_pages(iommu, arg);
2859 default:
2860 return -ENOTTY;
2861 }
2862}
2863
2864static int vfio_iommu_type1_register_notifier(void *iommu_data,
2865 unsigned long *events,
2866 struct notifier_block *nb)
2867{
2868 struct vfio_iommu *iommu = iommu_data;
2869
2870 /* clear known events */
2871 *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
2872
2873 /* refuse to register if still events remaining */
2874 if (*events)
2875 return -EINVAL;
2876
2877 return blocking_notifier_chain_register(&iommu->notifier, nb);
2878}
2879
2880static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
2881 struct notifier_block *nb)
2882{
2883 struct vfio_iommu *iommu = iommu_data;
2884
2885 return blocking_notifier_chain_unregister(&iommu->notifier, nb);
2886}
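
 /*
  * Sketch of the consumer side, assuming the vfio_register_notifier()
  * helper exported by the vfio core (it is not defined in this file): an
  * mdev vendor driver asks for unmap notifications so it can drop its pins
  * before the mapping disappears.  Names below are placeholders.
  *
  *	static int my_dma_unmap_cb(struct notifier_block *nb,
  *				   unsigned long action, void *data)
  *	{
  *		struct vfio_iommu_type1_dma_unmap *unmap = data;
  *
  *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP)
  *			;	// unpin anything inside unmap->iova/unmap->size
  *		return NOTIFY_OK;
  *	}
  *
  *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
  *
  *	my_nb.notifier_call = my_dma_unmap_cb;
  *	vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &my_nb);
  */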
2887
2888static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
2889 dma_addr_t user_iova, void *data,
2890 size_t count, bool write,
2891 size_t *copied)
2892{
2893 struct mm_struct *mm;
2894 unsigned long vaddr;
2895 struct vfio_dma *dma;
2896 bool kthread = current->mm == NULL;
2897 size_t offset;
2898
2899 *copied = 0;
2900
2901 dma = vfio_find_dma(iommu, user_iova, 1);
2902 if (!dma)
2903 return -EINVAL;
2904
2905 if ((write && !(dma->prot & IOMMU_WRITE)) ||
2906 !(dma->prot & IOMMU_READ))
2907 return -EPERM;
2908
2909 mm = get_task_mm(dma->task);
2910
2911 if (!mm)
2912 return -EPERM;
2913
2914 if (kthread)
2915 kthread_use_mm(mm);
2916 else if (current->mm != mm)
2917 goto out;
2918
2919 offset = user_iova - dma->iova;
2920
2921 if (count > dma->size - offset)
2922 count = dma->size - offset;
2923
2924 vaddr = dma->vaddr + offset;
2925
2926 if (write) {
2927 *copied = copy_to_user((void __user *)vaddr, data,
2928 count) ? 0 : count;
2929 if (*copied && iommu->dirty_page_tracking) {
2930 unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
2931 /*
2932  * Bitmap populated with the smallest supported page
2933  * size.
2934  */
2935 bitmap_set(dma->bitmap, offset >> pgshift,
2936 *copied >> pgshift);
2937 }
2938 } else
2939 *copied = copy_from_user(data, (void __user *)vaddr,
2940 count) ? 0 : count;
2941 if (kthread)
2942 kthread_unuse_mm(mm);
2943out:
2944 mmput(mm);
2945 return *copied ? 0 : -EFAULT;
2946}
2947
2948static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
2949 void *data, size_t count, bool write)
2950{
2951 struct vfio_iommu *iommu = iommu_data;
2952 int ret = 0;
2953 size_t done;
2954
2955 mutex_lock(&iommu->lock);
2956 while (count > 0) {
2957 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
2958 count, write, &done);
2959 if (ret)
2960 break;
2961
2962 count -= done;
2963 data += done;
2964 user_iova += done;
2965 }
2966
2967 mutex_unlock(&iommu->lock);
2968 return ret;
2969}
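
 /*
  * Sketch only: this .dma_rw callback is reached through the vfio core's
  * vfio_dma_rw() helper rather than from userspace.  Assuming that helper,
  * a vendor driver emulating DMA on behalf of its device might do roughly
  * the following; "group", "iova", "buf" and "len" are placeholders.
  *
  *	// read the user memory the device would have DMA'd from
  *	vfio_dma_rw(group, iova, buf, len, false);
  *
  *	// write it back, which also marks the pages dirty when dirty
  *	// tracking is enabled (see the bitmap_set() in the chunk helper)
  *	vfio_dma_rw(group, iova, buf, len, true);
  */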
2970
2971static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
2972 .name = "vfio-iommu-type1",
2973 .owner = THIS_MODULE,
2974 .open = vfio_iommu_type1_open,
2975 .release = vfio_iommu_type1_release,
2976 .ioctl = vfio_iommu_type1_ioctl,
2977 .attach_group = vfio_iommu_type1_attach_group,
2978 .detach_group = vfio_iommu_type1_detach_group,
2979 .pin_pages = vfio_iommu_type1_pin_pages,
2980 .unpin_pages = vfio_iommu_type1_unpin_pages,
2981 .register_notifier = vfio_iommu_type1_register_notifier,
2982 .unregister_notifier = vfio_iommu_type1_unregister_notifier,
2983 .dma_rw = vfio_iommu_type1_dma_rw,
2984};
2985
2986static int __init vfio_iommu_type1_init(void)
2987{
2988 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
2989}
2990
2991static void __exit vfio_iommu_type1_cleanup(void)
2992{
2993 vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
2994}
2995
2996module_init(vfio_iommu_type1_init);
2997module_exit(vfio_iommu_type1_cleanup);
2998
2999MODULE_VERSION(DRIVER_VERSION);
3000MODULE_LICENSE("GPL v2");
3001MODULE_AUTHOR(DRIVER_AUTHOR);
3002MODULE_DESCRIPTION(DRIVER_DESC);
3003