/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
/*
 * Static key guarding device memory support. It is enabled the first time
 * device memory is hotplugged through hmm_devmem_add() or
 * hmm_devmem_add_resource().
 */
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */


#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - per mm_struct HMM state
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting the ranges list
 * @sequence: counter of CPU page table updates seen via mmu notifiers
 * @ranges: list of ranges being snapshotted
 * @mirrors: list of mirrors of this mm
 * @mmu_notifier: mmu notifier used to track updates to the CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct	*mm;
	spinlock_t		lock;
	atomic_t		sequence;
	struct list_head	ranges;
	struct list_head	mirrors;
	struct mmu_notifier	mmu_notifier;
	struct rw_semaphore	mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates
 * an HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;

	/*
	 * The hmm struct can only be freed once the mm_struct goes away,
	 * hence we should always have pre-allocated a new hmm struct
	 * above.
	 */
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	atomic_set(&hmm->sequence, 0);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;

	/*
	 * We should only get here while holding the mmap_sem in write mode,
	 * ie on registration of the first mirror through
	 * hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup) {
		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
		kfree(hmm);
	}

	return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}

static void hmm_invalidate_range(struct hmm *hmm,
				 enum hmm_update_type action,
				 unsigned long start,
				 unsigned long end)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (end < range->start || start >= range->end)
			continue;

		range->valid = false;
		addr = max(start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list)
		mirror->ops->sync_cpu_device_pagetables(mirror, action,
							start, end);
	up_read(&hmm->mirrors_sem);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct hmm_mirror *mirror;
	struct hmm *hmm = mm->hmm;

	down_write(&hmm->mirrors_sem);
	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
					  list);
	while (mirror) {
		list_del_init(&mirror->list);
		if (mirror->ops->release) {
			/*
			 * Drop mirrors_sem so the release callback can wait
			 * on any pending work that might itself trigger a
			 * mirror unregister.
			 */
			up_write(&hmm->mirrors_sem);
			mirror->ops->release(mirror);
			down_write(&hmm->mirrors_sem);
		}
		mirror = list_first_entry_or_null(&hmm->mirrors,
						  struct hmm_mirror, list);
	}
	up_write(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.release		= hmm_release,
	.invalidate_range_start	= hmm_invalidate_range_start,
	.invalidate_range_end	= hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

again:
	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	if (mirror->hmm->mm == NULL) {
		/*
		 * A racing hmm_mirror_unregister() is about to destroy the
		 * hmm struct. Try again to allocate a new one.
		 */
		up_write(&mirror->hmm->mirrors_sem);
		mirror->hmm = NULL;
		goto again;
	} else {
		list_add(&mirror->list, &mirror->hmm->mirrors);
		up_write(&mirror->hmm->mirrors_sem);
	}

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	bool should_unregister = false;
	struct mm_struct *mm;
	struct hmm *hmm;

	if (mirror->hmm == NULL)
		return;

	hmm = mirror->hmm;
	down_write(&hmm->mirrors_sem);
	list_del_init(&mirror->list);
	should_unregister = list_empty(&hmm->mirrors);
	mirror->hmm = NULL;
	mm = hmm->mm;
	hmm->mm = NULL;
	up_write(&hmm->mirrors_sem);

	if (!should_unregister || mm == NULL)
		return;

	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
	kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

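/*
 * struct hmm_vma_walk - private data passed through the page table walk
 * @range: the hmm_range being snapshotted or faulted
 * @last: last address that was successfully processed
 * @fault: true when the walk is allowed to fault in missing pages
 * @block: true when the walk is allowed to block on a fault
 */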
struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
	bool			fault;
	bool			block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	int r;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	r = handle_mm_fault(vma, addr, flags);
	if (r & VM_FAULT_RETRY)
		return -EBUSY;
	if (r & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EAGAIN;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EAGAIN after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns
 * true, or whenever there is no page directory covering the virtual address
 * range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return (fault || write_fault) ? -EAGAIN : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	*fault = *write_fault = false;
	if (!hmm_vma_walk->fault)
		return;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (!hmm_vma_walk->fault) {
		*fault = *write_fault = false;
		return;
	}

	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*fault) || (*write_fault))
			return;
	}
}

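/*
 * pte_hole callback of the walk: the range has no valid pmd or pte, so either
 * record each page as HMM_PFN_NONE or fault it in when requested.
 */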
static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

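/*
 * Fill the pfn array for a huge (transparent or devmap) pmd mapping, or fall
 * back to the hole handler when a fault is required.
 */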
static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + pte_index(addr);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

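/*
 * Fill one pfn array entry from a pte. Swap, migration and device private
 * entries get special treatment; anything else that is not present is either
 * faulted in or reported as an error.
 */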
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);

	if (pte_none(pte)) {
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry: handle migration and device
		 * private entries, and report anything else as an error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EAGAIN;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	if (fault || write_fault)
		goto fault;

	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;

	i = (addr - range->start) >> PAGE_SHIFT;

again:
	if (pmd_none(*pmdp))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
		pmd_t pmd;

		/*
		 * No need to take the pmd lock here: even if some other
		 * thread is splitting the huge pmd we will get that event
		 * through the mmu_notifier callback.
		 *
		 * So just read the pmd value, check again that it is a
		 * transparent huge or device mapping, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	if (pmd_bad(*pmdp))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap the pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

static void hmm_pfns_special(struct hmm_range *range)
{
	unsigned long addr = range->start, i = 0;

	for (; addr < range->end; addr += PAGE_SIZE, i++)
		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual
 * addresses
 * @range: range being snapshotted
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 *          vma permission, 0 success
 *
 * This snapshots the CPU page table and fills the pfns array with an entry
 * describing each page of the range. It does not fault in missing pages.
 *
 * It must be called with the mmap_sem of the process held. Once the device
 * driver is done with the snapshot it must call hmm_vma_range_done() to check
 * whether a concurrent CPU page table update invalidated it.
 */
int hmm_vma_get_pfns(struct hmm_range *range)
{
	struct vm_area_struct *vma = range->vma;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	/* Caller must have registered a mirror, via hmm_mirror_register() ! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table updates */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(range->start, range->end, &mm_walk);
	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);

/*
 * hmm_vma_range_done() - stop tracking changes to CPU page table over a range
 * @range: range being tracked
 * Returns: false if the range data has been invalidated, true otherwise
 *
 * The range struct is used to track CPU page table updates after a call to
 * hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using, or wants to invalidate, the range it must call this function to stop
 * tracking CPU page table updates.
 *
 * The device driver must serialize this against its own device page table
 * updates. A typical pattern (the device_* calls below are driver
 * pseudo-code, not kernel APIs):
 *
 * again:
 *   hmm_vma_get_pfns(range); // or hmm_vma_fault(range, ...)
 *   trans = device_build_page_table_update_transaction(range->pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_commit_transaction(trans);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(range->vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);

/*
 * hmm_vma_fault() - try to fault some addresses in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop
 *         mmap_sem)
 * Returns: 0 on success, error otherwise (-EAGAIN means mmap_sem has been
 *          dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * Expected use pattern (the driver_* calls below are driver pseudo-code):
 *
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address the device wants to fault, initialize the
 *   // hmm_range struct ...
 *   ret = hmm_vma_fault(range, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(range);
 *     // You might want to rate limit or yield to allow mm to make progress
 *     goto retry;
 *   case 0:
 *     break;
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take the driver lock that serializes device page table updates
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct.
 */
int hmm_vma_fault(struct hmm_range *range, bool block)
{
	struct vm_area_struct *vma = range->vma;
	unsigned long start = range->start;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -ENOMEM;
	}
	/* Caller must have registered a mirror using hmm_mirror_register() */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table updates */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = true;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, range->end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
			       range->end);
		hmm_vma_range_done(range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);

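/*
 * The percpu_ref helpers below tie the lifetime of the device memory pages to
 * the device through devm actions: the ref is killed and waited on before the
 * pages are torn down, and exited once it is no longer needed.
 */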
static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_exit(ref);
	devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}

static void hmm_devmem_ref_kill(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_kill(ref);
	wait_for_completion(&devmem->completion);
	devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	devmem->ops->free(devmem, page);
}

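/*
 * Radix tree of hmm_devmem structs, indexed by physical address section
 * number, used to detect collisions between device memory regions.
 */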
static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
	resource_size_t key, align_start, align_size;

	align_start = resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);

	mutex_lock(&hmm_devmem_lock);
	for (key = resource->start;
	     key <= resource->end;
	     key += PA_SECTION_SIZE)
		radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
	mutex_unlock(&hmm_devmem_lock);
}

static void hmm_devmem_release(struct device *dev, void *data)
{
	struct hmm_devmem *devmem = data;
	struct resource *resource = devmem->resource;
	unsigned long start_pfn, npages;
	struct zone *zone;
	struct page *page;

	if (percpu_ref_tryget_live(&devmem->ref)) {
		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
		percpu_ref_put(&devmem->ref);
	}

	/* pages are dead and unused, undo the arch mapping */
	start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
	npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

	page = pfn_to_page(start_pfn);
	zone = page_zone(page);

	mem_hotplug_begin();
	if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
		__remove_pages(zone, start_pfn, npages, NULL);
	else
		arch_remove_memory(start_pfn << PAGE_SHIFT,
				   npages << PAGE_SHIFT, NULL);
	mem_hotplug_done();

	hmm_devmem_radix_release(resource);
}

static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
	resource_size_t key, align_start, align_size, align_end;
	struct device *device = devmem->device;
	int ret, nid, is_ram;
	unsigned long pfn;

	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(devmem->resource->start +
			   resource_size(devmem->resource),
			   PA_SECTION_SIZE) - align_start;

	is_ram = region_intersects(align_start, align_size,
				   IORESOURCE_SYSTEM_RAM,
				   IORES_DESC_NONE);
	if (is_ram == REGION_MIXED) {
		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
			  __func__, devmem->resource);
		return -ENXIO;
	}
	if (is_ram == REGION_INTERSECTS)
		return -ENXIO;

	if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
		devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	else
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_fault = hmm_devmem_fault;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.dev = devmem->device;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;

	mutex_lock(&hmm_devmem_lock);
	align_end = align_start + align_size - 1;
	for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
		struct hmm_devmem *dup;

		dup = radix_tree_lookup(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT);
		if (dup) {
			dev_err(device, "%s: collides with mapping for %s\n",
				__func__, dev_name(dup->device));
			mutex_unlock(&hmm_devmem_lock);
			ret = -EBUSY;
			goto error;
		}
		ret = radix_tree_insert(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT,
					devmem);
		if (ret) {
			dev_err(device, "%s: failed: %d\n", __func__, ret);
			mutex_unlock(&hmm_devmem_lock);
			goto error_radix;
		}
	}
	mutex_unlock(&hmm_devmem_lock);

	nid = dev_to_node(device);
	if (nid < 0)
		nid = numa_mem_id();

	mem_hotplug_begin();
	/*
	 * For device private memory we call add_pages() as we only need to
	 * allocate and initialize struct page for the device memory. More-
	 * over the device memory is un-accessible, thus we do not want to
	 * create a linear mapping for the memory like arch_add_memory()
	 * would do.
	 *
	 * For device public memory, which is accessible by the CPU, we do
	 * want the linear mapping and thus use arch_add_memory().
	 */
	if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
		ret = arch_add_memory(nid, align_start, align_size, NULL,
				      false);
	else
		ret = add_pages(nid, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, NULL, false);
	if (ret) {
		mem_hotplug_done();
		goto error_add_memory;
	}
	move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
			       align_start >> PAGE_SHIFT,
			       align_size >> PAGE_SHIFT, NULL);
	mem_hotplug_done();

	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->pgmap = &devmem->pagemap;
	}
	return 0;

error_add_memory:
	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
	hmm_devmem_radix_release(devmem->resource);
error:
	return ret;
}

static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
{
	struct hmm_devmem *devmem = data;

	return devmem->resource == match_data;
}

static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
{
	devres_release(devmem->device, &hmm_devmem_release,
		       &hmm_devmem_match, devmem->resource);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 * Returns: pointer to the new hmm_devmem struct, ERR_PTR otherwise
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory,
 * allocating struct pages for it and registering the device driver specific
 * callbacks through which the core mm reports events on those pages (fault
 * and free).
 *
 * The memory is managed through devres and is torn down when the device goes
 * away, or explicitly with hmm_devmem_remove().
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	int ret;

	static_branch_enable(&device_private_key);

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * FIXME add a new helper to quickly walk resource tree and find free
	 * range
	 *
	 * FIXME what about ioport_resource resource ?
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource) {
			ret = -ENOMEM;
			goto error_no_resource;
		}
		break;
	}
	if (!devmem->resource) {
		ret = -ERANGE;
		goto error_no_resource;
	}

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_pages;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_pages:
	devm_release_mem_region(device, devmem->resource->start,
				resource_size(devmem->resource));
error_no_resource:
error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add);

struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	static_branch_enable(&device_private_key);

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_devm_add_action;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add_resource);

/*
 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
 *
 * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
 *
 * This will hot-unplug memory that was hotplugged by hmm_devmem_add() on
 * behalf of the device driver. It will free struct page and remove the
 * resource that reserved the physical address range for this device memory.
 */
void hmm_devmem_remove(struct hmm_devmem *devmem)
{
	resource_size_t start, size;
	struct device *device;
	bool cdm = false;

	if (!devmem)
		return;

	device = devmem->device;
	start = devmem->resource->start;
	size = resource_size(devmem->resource);

	cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
	hmm_devmem_pages_remove(devmem);

	if (!cdm)
		devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);

/*
 * A device driver that wants to handle multiple devices' memory through a
 * single fake device can use hmm_device to do so. This is purely a helper
 * and it is not needed to make use of any HMM functionality.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

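/*
 * hmm_device_new() - allocate and initialize a dummy device in the
 * "hmm_device" class so a driver can hang its device memory off of it. The
 * caller owns a reference and must drop it with hmm_device_put().
 */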
struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

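/*
 * Reserve a char device region and create the "hmm_device" class used by
 * hmm_device_new(); runs at device_initcall time.
 */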
static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */