// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/**
 * hmm_get_or_create - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 * Return: an HMM object, either by referencing the existing (per-process)
 *         object, or by creating a new one.
 *
 * This is not intended to be used directly by device drivers. If mm already
 * has an HMM struct then a reference is taken on it and it is returned.
 * Otherwise a new HMM struct is allocated, initialized, associated with the
 * mm and returned.
 */
static struct hmm *hmm_get_or_create(struct mm_struct *mm)
{
	struct hmm *hmm;

	lockdep_assert_held_write(&mm->mmap_sem);

	/* Abuse the page_table_lock to also protect mm->hmm. */
	spin_lock(&mm->page_table_lock);
	hmm = mm->hmm;
	if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref))
		goto out_unlock;
	spin_unlock(&mm->page_table_lock);

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	init_waitqueue_head(&hmm->wq);
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->ranges_lock);
	kref_init(&hmm->kref);
	hmm->notifiers = 0;
	hmm->mm = mm;

	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	mmgrab(hmm->mm);

	/*
	 * We hold the exclusive mmap_sem here so we know that mm->hmm is
	 * still NULL or 0 kref, and is safe to update.
	 */
	spin_lock(&mm->page_table_lock);
	mm->hmm = hmm;

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return hmm;
}

static void hmm_free_rcu(struct rcu_head *rcu)
{
	struct hmm *hmm = container_of(rcu, struct hmm, rcu);

	mmdrop(hmm->mm);
	kfree(hmm);
}

static void hmm_free(struct kref *kref)
{
	struct hmm *hmm = container_of(kref, struct hmm, kref);

	spin_lock(&hmm->mm->page_table_lock);
	if (hmm->mm->hmm == hmm)
		hmm->mm->hmm = NULL;
	spin_unlock(&hmm->mm->page_table_lock);

	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
	mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
}

static inline void hmm_put(struct hmm *hmm)
{
	kref_put(&hmm->kref, hmm_free);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
	struct hmm_mirror *mirror;

	/* Bail out if hmm is in the process of being freed */
	if (!kref_get_unless_zero(&hmm->kref))
		return;

	/*
	 * Since hmm_range_register() holds the mmget() lock hmm_release() is
	 * prevented as long as a range exists.
	 */
	WARN_ON(!list_empty_careful(&hmm->ranges));

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list) {
		/*
		 * Note: The driver is not allowed to trigger
		 * hmm_mirror_unregister() from this thread.
		 */
		if (mirror->ops->release)
			mirror->ops->release(mirror);
	}
	up_read(&hmm->mirrors_sem);

	hmm_put(hmm);
}

static void notifiers_decrement(struct hmm *hmm)
{
	unsigned long flags;

	spin_lock_irqsave(&hmm->ranges_lock, flags);
	hmm->notifiers--;
	if (!hmm->notifiers) {
		struct hmm_range *range;

		list_for_each_entry(range, &hmm->ranges, list) {
			if (range->valid)
				continue;
			range->valid = true;
		}
		wake_up_all(&hmm->wq);
	}
	spin_unlock_irqrestore(&hmm->ranges_lock, flags);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
			const struct mmu_notifier_range *nrange)
{
	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
	struct hmm_mirror *mirror;
	struct hmm_update update;
	struct hmm_range *range;
	unsigned long flags;
	int ret = 0;

	if (!kref_get_unless_zero(&hmm->kref))
		return 0;

	update.start = nrange->start;
	update.end = nrange->end;
	update.event = HMM_UPDATE_INVALIDATE;
	update.blockable = mmu_notifier_range_blockable(nrange);

	spin_lock_irqsave(&hmm->ranges_lock, flags);
	hmm->notifiers++;
	list_for_each_entry(range, &hmm->ranges, list) {
		if (update.end < range->start || update.start >= range->end)
			continue;

		range->valid = false;
	}
	spin_unlock_irqrestore(&hmm->ranges_lock, flags);

	if (mmu_notifier_range_blockable(nrange))
		down_read(&hmm->mirrors_sem);
	else if (!down_read_trylock(&hmm->mirrors_sem)) {
		ret = -EAGAIN;
		goto out;
	}

	list_for_each_entry(mirror, &hmm->mirrors, list) {
		int rc;

		rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
		if (rc) {
			if (WARN_ON(update.blockable || rc != -EAGAIN))
				continue;
			ret = -EAGAIN;
			break;
		}
	}
	up_read(&hmm->mirrors_sem);

out:
	if (ret)
		notifiers_decrement(hmm);
	hmm_put(hmm);
	return ret;
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
			const struct mmu_notifier_range *nrange)
{
	struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);

	if (!kref_get_unless_zero(&hmm->kref))
		return;

	notifiers_decrement(hmm);
	hmm_put(hmm);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.release		= hmm_release,
	.invalidate_range_start	= hmm_invalidate_range_start,
	.invalidate_range_end	= hmm_invalidate_range_end,
};
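
/*
 * For illustration, a minimal sketch of the mirror callback that the
 * notifier machinery above ends up invoking. Everything named my_* here is
 * hypothetical driver code, not part of HMM itself: the driver only has to
 * honor update->blockable and shoot down its device page tables for the
 * invalidated span.
 *
 *	static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *				const struct hmm_update *update)
 *	{
 *		struct my_device *mydev;
 *
 *		mydev = container_of(mirror, struct my_device, mirror);
 *		if (update->blockable)
 *			mutex_lock(&mydev->pt_lock);
 *		else if (!mutex_trylock(&mydev->pt_lock))
 *			return -EAGAIN;
 *
 *		// Invalidate device mappings covering [start, end).
 *		my_device_unmap(mydev, update->start, update->end);
 *		mutex_unlock(&mydev->pt_lock);
 *		return 0;
 *	}
 */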

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against. Must have a valid mmget_not_zero.
 * Return: 0 on success, -EINVAL if the mirror is not correctly set up,
 *         -ENOMEM if the HMM struct could not be allocated.
 *
 * To start mirroring a process address space, the device driver must
 * register an HMM mirror struct.
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	lockdep_assert_held_write(&mm->mmap_sem);

	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

	mirror->hmm = hmm_get_or_create(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	list_add(&mirror->list, &mirror->hmm->mirrors);
	up_write(&mirror->hmm->mirrors_sem);

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
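
/*
 * A minimal registration sketch, assuming a hypothetical my_device that
 * embeds a struct hmm_mirror and a my_mirror_ops providing the callbacks
 * HMM requires (release and sync_cpu_device_pagetables):
 *
 *	static int my_device_mirror_mm(struct my_device *mydev,
 *				       struct mm_struct *mm)
 *	{
 *		int ret;
 *
 *		mydev->mirror.ops = &my_mirror_ops;
 *		down_write(&mm->mmap_sem);
 *		ret = hmm_mirror_register(&mydev->mirror, mm);
 *		up_write(&mm->mmap_sem);
 *		return ret;
 *	}
 *
 * The caller is expected to hold a reference on the mm across the call and
 * to pair this with hmm_mirror_unregister() on teardown.
 */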

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	struct hmm *hmm = mirror->hmm;

	down_write(&hmm->mirrors_sem);
	list_del(&mirror->list);
	up_write(&hmm->mirrors_sem);
	hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
	struct hmm_range	*range;
	struct dev_pagemap	*pgmap;
	unsigned long		last;
	bool			fault;
	bool			block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY) {
		/* Note, handle_mm_fault did up_read(&mm->mmap_sem) */
		return -EAGAIN;
	}
	if (ret & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EBUSY;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Return: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns
 * true, or whenever there is no page directory covering the virtual
 * address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i, page_size;

	hmm_vma_walk->last = addr;
	page_size = hmm_range_page_size(range);
	i = (addr - range->start) >> range->page_shift;

	for (; addr < end; addr += page_size, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EBUSY)
				return ret;
		}
	}

	return (fault || write_fault) ? -EBUSY : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	if (!hmm_vma_walk->fault)
		return;

	/*
	 * We consider not only the individual per page request, but also the
	 * default flags requested for the range. The API can be used in two
	 * ways: one where the HMM user coalesces multiple page faults into
	 * one request and sets flags per pfn for those faults, and one where
	 * the HMM user wants to pre-fault a range with specific flags. For
	 * the latter it would be a waste to have the user pre-fill the pfn
	 * array with a default flags value.
	 */
	pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;

	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (!hmm_vma_walk->fault) {
		*fault = *write_fault = false;
		return;
	}

	*fault = *write_fault = false;
	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + pte_index(addr);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	hmm_vma_walk->last = end;
	return 0;
#else
	/* If THP is not enabled then we should never reach this code ! */
	return -EINVAL;
#endif
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: it refers to device memory
		 * that is not accessible by the CPU. Apart from migration
		 * entries, only device private entries are expected here.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
					    swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EBUSY;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	} else {
		cpu_flags = pte_to_hmm_pfn_flags(range, pte);
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
				   &fault, &write_fault);
	}

	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
					      hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap))
			return -EBUSY;
	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
		*pfn = range->values[HMM_PFN_SPECIAL];
		return -EFAULT;
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;
		unsigned long npages;
		uint64_t *pfns;

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(vma->vm_mm, pmdp);
			return -EBUSY;
		}
		return 0;
	} else if (!pmd_present(pmd))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again that it is a
		 * transparent huge or device mapping one and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		i = (addr - range->start) >> PAGE_SHIFT;
		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	/*
	 * We have handled all the valid cases above ie either none, migration,
	 * huge or transparent huge. At this point either it is a valid pmd
	 * entry pointing to a pte directory or it is a bad pmd that will not
	 * recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage the get_dev_pagemap() optimization
		 * which will not re-take a reference on a pgmap if we already
		 * have one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

static int hmm_vma_walk_pud(pud_t *pudp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start, next;
	pmd_t *pmdp;
	pud_t pud;
	int ret;

again:
	pud = READ_ONCE(*pudp);
	if (pud_none(pud))
		return hmm_vma_walk_hole(start, end, walk);

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud))
			return hmm_vma_walk_hole(start, end, walk);

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault)
			return hmm_vma_walk_hole_(addr, end, fault,
						  write_fault, walk);

		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		return 0;
	}

	split_huge_pud(walk->vma, pudp, addr);
	if (pud_none(*pudp))
		goto again;

	pmdp = pmd_offset(pudp, addr);
	do {
		next = pmd_addr_end(addr, end);
		ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
		if (ret)
			return ret;
	} while (pmdp++, addr = next, addr != end);

	return 0;
}

static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	unsigned long addr = start, i, pfn, mask, size, pfn_inc;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	struct hstate *h = hstate_vma(vma);
	uint64_t orig_pfn, cpu_flags;
	bool fault, write_fault;
	spinlock_t *ptl;
	pte_t entry;
	int ret = 0;

	size = 1UL << huge_page_shift(h);
	mask = size - 1;
	if (range->page_shift != PAGE_SHIFT) {
		/* Make sure we are looking at a full page. */
		if (start & mask)
			return -EINVAL;
		if (end < (start + size))
			return -EINVAL;
		pfn_inc = size >> PAGE_SHIFT;
	} else {
		pfn_inc = 1;
		size = PAGE_SIZE;
	}

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	i = (start - range->start) >> range->page_shift;
	orig_pfn = range->pfns[i];
	range->pfns[i] = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	fault = write_fault = false;
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		ret = -ENOENT;
		goto unlock;
	}

	pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
	for (; addr < end; addr += size, i++, pfn += pfn_inc)
		range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				 cpu_flags;
	hmm_vma_walk->last = end;

unlock:
	spin_unlock(ptl);

	if (ret == -ENOENT)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	return ret;
#else /* CONFIG_HUGETLB_PAGE */
	return -EINVAL;
#endif
}

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

/*
 * hmm_range_register() - start tracking change to CPU page table over a range
 * @range: range
 * @mirror: the mirror struct the range is registered against
 * @start: start virtual address (inclusive)
 * @end: end virtual address (exclusive)
 * @page_shift: expected page shift for the range
 * Return: 0 on success, -EINVAL on invalid arguments, -EFAULT if the address
 *         space is no longer valid
 *
 * Track updates to the CPU page table, see include/linux/hmm.h
 */
int hmm_range_register(struct hmm_range *range,
		       struct hmm_mirror *mirror,
		       unsigned long start,
		       unsigned long end,
		       unsigned page_shift)
{
	unsigned long mask = ((1UL << page_shift) - 1UL);
	struct hmm *hmm = mirror->hmm;
	unsigned long flags;

	range->valid = false;
	range->hmm = NULL;

	if ((start & mask) || (end & mask))
		return -EINVAL;
	if (start >= end)
		return -EINVAL;

	range->page_shift = page_shift;
	range->start = start;
	range->end = end;

	/* Prevent hmm_release() from running while the range is valid */
	if (!mmget_not_zero(hmm->mm))
		return -EFAULT;

	/* Initialize range to track CPU page table updates. */
	spin_lock_irqsave(&hmm->ranges_lock, flags);

	range->hmm = hmm;
	kref_get(&hmm->kref);
	list_add(&range->list, &hmm->ranges);

	/*
	 * If there are any concurrent notifiers we have to wait for them for
	 * the range to be valid (see hmm_range_wait_until_valid()).
	 */
	if (!hmm->notifiers)
		range->valid = true;
	spin_unlock_irqrestore(&hmm->ranges_lock, flags);

	return 0;
}
EXPORT_SYMBOL(hmm_range_register);

/*
 * hmm_range_unregister() - stop tracking change to CPU page table over a range
 * @range: range
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * hmm_range_register(). See include/linux/hmm.h for how to use it.
 */
void hmm_range_unregister(struct hmm_range *range)
{
	struct hmm *hmm = range->hmm;
	unsigned long flags;

	spin_lock_irqsave(&hmm->ranges_lock, flags);
	list_del_init(&range->list);
	spin_unlock_irqrestore(&hmm->ranges_lock, flags);

	/* Drop reference taken by hmm_range_register() */
	mmput(hmm->mm);
	hmm_put(hmm);

	/*
	 * The range is now invalid and the ref on the hmm is dropped, so
	 * poison the pointer. Leave other fields in place, for the caller's
	 * use.
	 */
	range->valid = false;
	memset(&range->hmm, POISON_INUSE, sizeof(range->hmm));
}
EXPORT_SYMBOL(hmm_range_unregister);

/*
 * hmm_range_snapshot() - snapshot CPU page table for a range
 * @range: range
 * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 *          permission (for instance asking for write and range is read only),
 *          -EBUSY if you need to retry, -EFAULT invalid (ie either no valid
 *          vma or it is illegal to access that range), number of valid pages
 *          in range->pfns[] (from range start address).
 *
 * This snapshots the CPU page table for a range of virtual addresses.
 * Snapshot validity is tracked by the range struct. See include/linux/hmm.h
 * for an example of how to use it.
 */
long hmm_range_snapshot(struct hmm_range *range)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	unsigned long start = range->start, end;
	struct hmm_vma_walk hmm_vma_walk;
	struct hmm *hmm = range->hmm;
	struct vm_area_struct *vma;
	struct mm_walk mm_walk;

	lockdep_assert_held(&hmm->mm->mmap_sem);
	do {
		/* If range is no longer valid force retry. */
		if (!range->valid)
			return -EBUSY;

		vma = find_vma(hmm->mm, start);
		if (vma == NULL || (vma->vm_flags & device_vma))
			return -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			if (huge_page_shift(hstate_vma(vma)) !=
				    range->page_shift &&
			    range->page_shift != PAGE_SHIFT)
				return -EINVAL;
		} else {
			if (range->page_shift != PAGE_SHIFT)
				return -EINVAL;
		}

		if (!(vma->vm_flags & VM_READ)) {
			/*
			 * If vma does not allow read access, then assume that
			 * it does not allow write access, either. HMM does
			 * not support architectures that allow write without
			 * read.
			 */
			hmm_pfns_clear(range, range->pfns,
				       range->start, range->end);
			return -EPERM;
		}

		range->vma = vma;
		hmm_vma_walk.pgmap = NULL;
		hmm_vma_walk.last = start;
		hmm_vma_walk.fault = false;
		hmm_vma_walk.range = range;
		mm_walk.private = &hmm_vma_walk;
		end = min(range->end, vma->vm_end);

		mm_walk.vma = vma;
		mm_walk.mm = vma->vm_mm;
		mm_walk.pte_entry = NULL;
		mm_walk.test_walk = NULL;
		mm_walk.hugetlb_entry = NULL;
		mm_walk.pud_entry = hmm_vma_walk_pud;
		mm_walk.pmd_entry = hmm_vma_walk_pmd;
		mm_walk.pte_hole = hmm_vma_walk_hole;
		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;

		walk_page_range(start, end, &mm_walk);
		start = end;
	} while (start < range->end);

	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_snapshot);
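
/*
 * A minimal usage sketch for hmm_range_snapshot(). The
 * hmm_range_wait_until_valid() helper comes from include/linux/hmm.h;
 * mydev and MY_TIMEOUT are hypothetical driver names:
 *
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_register(&range, &mydev->mirror, start, end,
 *				 PAGE_SHIFT);
 *	if (!ret) {
 *		ret = hmm_range_snapshot(&range);
 *		while (ret == -EBUSY) {
 *			up_read(&mm->mmap_sem);
 *			hmm_range_wait_until_valid(&range, MY_TIMEOUT);
 *			down_read(&mm->mmap_sem);
 *			ret = hmm_range_snapshot(&range);
 *		}
 *
 *		// Use range.pfns[] under a driver lock that also serializes
 *		// the mirror's sync_cpu_device_pagetables() callback,
 *		// re-checking range.valid before committing the snapshot
 *		// to device page tables.
 *		hmm_range_unregister(&range);
 *	}
 *	up_read(&mm->mmap_sem);
 */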

/*
 * hmm_range_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
 * Return: number of valid pages in range->pfns[] (from range start
 *          address). This may be zero. If the return value is negative,
 *          then one of the following values may be returned:
 *
 *           -EINVAL: invalid arguments or mm or virtual address are in an
 *                    invalid vma (for instance device file vma).
 *           -ENOMEM: Out of memory.
 *           -EPERM:  Invalid permission (for instance asking for write and
 *                    range is read only).
 *           -EBUSY:  If you need to retry (for instance if the range is
 *                    being invalidated and you should wait for invalidation
 *                    to finish).
 *           -EFAULT: Invalid (ie either no valid vma or it is illegal to
 *                    access that range).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, bool block)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	unsigned long start = range->start, end;
	struct hmm_vma_walk hmm_vma_walk;
	struct hmm *hmm = range->hmm;
	struct vm_area_struct *vma;
	struct mm_walk mm_walk;
	int ret;

	lockdep_assert_held(&hmm->mm->mmap_sem);

	do {
		/* If range is no longer valid force retry. */
		if (!range->valid)
			return -EBUSY;

		vma = find_vma(hmm->mm, start);
		if (vma == NULL || (vma->vm_flags & device_vma))
			return -EFAULT;

		if (is_vm_hugetlb_page(vma)) {
			if (huge_page_shift(hstate_vma(vma)) !=
			    range->page_shift &&
			    range->page_shift != PAGE_SHIFT)
				return -EINVAL;
		} else {
			if (range->page_shift != PAGE_SHIFT)
				return -EINVAL;
		}

		if (!(vma->vm_flags & VM_READ)) {
			/*
			 * If vma does not allow read access, then assume that
			 * it does not allow write access, either. HMM does
			 * not support architectures that allow write without
			 * read.
			 */
			hmm_pfns_clear(range, range->pfns,
				       range->start, range->end);
			return -EPERM;
		}

		range->vma = vma;
		hmm_vma_walk.pgmap = NULL;
		hmm_vma_walk.last = start;
		hmm_vma_walk.fault = true;
		hmm_vma_walk.block = block;
		hmm_vma_walk.range = range;
		mm_walk.private = &hmm_vma_walk;
		end = min(range->end, vma->vm_end);

		mm_walk.vma = vma;
		mm_walk.mm = vma->vm_mm;
		mm_walk.pte_entry = NULL;
		mm_walk.test_walk = NULL;
		mm_walk.hugetlb_entry = NULL;
		mm_walk.pud_entry = hmm_vma_walk_pud;
		mm_walk.pmd_entry = hmm_vma_walk_pmd;
		mm_walk.pte_hole = hmm_vma_walk_hole;
		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;

		do {
			ret = walk_page_range(start, end, &mm_walk);
			start = hmm_vma_walk.last;

			/* Keep trying while the range is valid. */
		} while (ret == -EBUSY && range->valid);

		if (ret) {
			unsigned long i;

			i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
			hmm_pfns_clear(range, &range->pfns[i],
				       hmm_vma_walk.last, range->end);
			return ret;
		}
		start = end;

	} while (start < range->end);

	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
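
/*
 * The usage pattern mirrors hmm_range_snapshot() above, with faulting
 * enabled. A minimal sketch (mydev, MY_TIMEOUT and the driver lock are
 * hypothetical; hmm_range_wait_until_valid() is from include/linux/hmm.h):
 *
 *	long ret;
 *
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_register(&range, &mydev->mirror, start, end,
 *				 PAGE_SHIFT);
 *	if (ret) {
 *		up_read(&mm->mmap_sem);
 *		return ret;
 *	}
 *
 *	do {
 *		ret = hmm_range_fault(&range, true);
 *		if (ret == -EBUSY) {
 *			up_read(&mm->mmap_sem);
 *			hmm_range_wait_until_valid(&range, MY_TIMEOUT);
 *			down_read(&mm->mmap_sem);
 *		}
 *	} while (ret == -EBUSY);
 *
 *	// On success, take the driver lock that serializes the mirror
 *	// callbacks, check range.valid one last time, and only then
 *	// program the device page tables from range.pfns[].
 *	hmm_range_unregister(&range);
 *	up_read(&mm->mmap_sem);
 */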

/*
 * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
 * @range: range being faulted
 * @device: device to map page to
 * @daddrs: dma address of mapped pages
 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
 * Return: number of pages mapped on success, -EAGAIN if mmap_sem has been
 *          dropped and you need to try again, some other error value otherwise
 *
 * Note same usage pattern as hmm_range_fault().
 */
long hmm_range_dma_map(struct hmm_range *range,
		       struct device *device,
		       dma_addr_t *daddrs,
		       bool block)
{
	unsigned long i, npages, mapped;
	long ret;

	ret = hmm_range_fault(range, block);
	if (ret <= 0)
		return ret ? ret : -EBUSY;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0, mapped = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		/*
		 * The DMA API provides no portable "invalid address" value,
		 * so use 0 as the not-mapped marker; the unmap paths below
		 * skip entries that were never mapped.
		 */
		daddrs[i] = 0;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* Check if range is being invalidated */
		if (!range->valid) {
			ret = -EBUSY;
			goto unmap;
		}

		/* If it is read and write then map bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
		if (dma_mapping_error(device, daddrs[i])) {
			ret = -EFAULT;
			goto unmap;
		}

		mapped++;
	}

	return mapped;

unmap:
	for (npages = i, i = 0; (i < npages) && mapped; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		if (dma_mapping_error(device, daddrs[i]))
			continue;

		/* If it is read and write then it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
			dir = DMA_BIDIRECTIONAL;

		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		mapped--;
	}

	return ret;
}
EXPORT_SYMBOL(hmm_range_dma_map);

/*
 * hmm_range_dma_unmap() - unmap range of pages mapped with hmm_range_dma_map()
 * @range: range being unmapped
 * @vma: the vma against which the range (optional)
 * @device: device against which dma map was done
 * @daddrs: dma address of mapped pages
 * @dirty: dirty page if it had the write flag set
 * Return: number of pages unmapped on success, -EINVAL otherwise
 *
 * Note that the caller MUST abide by mmu notifier or use HMM mirror and
 * abide by the mmu notifier and not use the page while the device is still
 * using them (the device will crash or SIGBUS anyway).
 */
long hmm_range_dma_unmap(struct hmm_range *range,
			 struct vm_area_struct *vma,
			 struct device *device,
			 dma_addr_t *daddrs,
			 bool dirty)
{
	unsigned long i, npages;
	long cpages = 0;

	/* Sanity check. */
	if (range->end <= range->start)
		return -EINVAL;
	if (!daddrs)
		return -EINVAL;
	if (!range->pfns)
		return -EINVAL;

	npages = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < npages; ++i) {
		enum dma_data_direction dir = DMA_TO_DEVICE;
		struct page *page;

		page = hmm_device_entry_to_page(range, range->pfns[i]);
		if (page == NULL)
			continue;

		/* If it is read and write then it was mapped bi-directional. */
		if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
			dir = DMA_BIDIRECTIONAL;

			/*
			 * The caller is serialized against mmu notifier
			 * invalidation (see function description), so the
			 * page cannot be freed under us and it is safe to
			 * dirty it here.
			 */
			if (dirty)
				set_page_dirty(page);
		}

		/* Unmap and clear pfns/dma address */
		dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
		range->pfns[i] = range->values[HMM_PFN_NONE];
		daddrs[i] = 0;
		cpages++;
	}

	return cpages;
}
EXPORT_SYMBOL(hmm_range_dma_unmap);
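
/*
 * A minimal pairing sketch for the two helpers above, assuming a
 * hypothetical my_device with a daddrs array sized for the range:
 *
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_dma_map(&range, mydev->dev, mydev->daddrs, true);
 *	up_read(&mm->mmap_sem);
 *	if (ret < 0)
 *		return ret;
 *
 *	// Program the device with mydev->daddrs[0..ret) while honoring
 *	// the mirror invalidation callbacks; once the device is done:
 *	hmm_range_dma_unmap(&range, NULL, mydev->dev, mydev->daddrs,
 *			    mydev->wrote_to_pages);
 *
 * Passing dirty == true is only legal when write faults were requested,
 * since only then can pages be mapped DMA_BIDIRECTIONAL and written by
 * the device.
 */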