/*
 * Copyright 2013 Red Hat Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/*
 * struct hmm - HMM per mm struct
 *
 * @mm: mm struct this HMM struct is bound to
 * @lock: lock protecting ranges list
 * @sequence: we track updates to the CPU page table with a sequence number
 * @ranges: list of range being snapshotted
 * @mirrors: list of mirrors for this mm
 * @mmu_notifier: mmu notifier to track updates to CPU page table
 * @mirrors_sem: read/write semaphore protecting the mirrors list
 */
struct hmm {
	struct mm_struct	*mm;
	spinlock_t		lock;
	atomic_t		sequence;
	struct list_head	ranges;
	struct list_head	mirrors;
	struct mmu_notifier	mmu_notifier;
	struct rw_semaphore	mirrors_sem;
};

/*
 * hmm_register - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 *
 * This is not intended to be used directly by device drivers. It allocates an
 * HMM struct if mm does not have one, and initializes it.
 */
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;

	/*
	 * The hmm struct can only be freed once the mm_struct goes away,
	 * hence we should always have pre-allocated a new hmm struct
	 * above.
	 */
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	atomic_set(&hmm->sequence, 0);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;

	/*
	 * We should only get here if we hold the mmap_sem in write mode, ie.
	 * on registration of the first mirror through hmm_mirror_register().
	 */
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup) {
		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
		kfree(hmm);
	}

	return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}

static void hmm_invalidate_range(struct hmm *hmm,
				 enum hmm_update_type action,
				 unsigned long start,
				 unsigned long end)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (end < range->start || start >= range->end)
			continue;

		range->valid = false;
		addr = max(start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list)
		mirror->ops->sync_cpu_device_pagetables(mirror, action,
							start, end);
	up_read(&hmm->mirrors_sem);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	struct hmm_mirror *mirror;
	struct hmm *hmm = mm->hmm;

	down_write(&hmm->mirrors_sem);
	mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
					  list);
	while (mirror) {
		list_del_init(&mirror->list);
		if (mirror->ops->release) {
			/*
			 * Drop mirrors_sem so the release callback can wait
			 * on any pending work that might itself trigger the
			 * mmu_notifier callback and thus would deadlock with
			 * us.
			 */
			up_write(&hmm->mirrors_sem);
			mirror->ops->release(mirror);
			down_write(&hmm->mirrors_sem);
		}
		mirror = list_first_entry_or_null(&hmm->mirrors,
						  struct hmm_mirror, list);
	}
	up_write(&hmm->mirrors_sem);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end,
				      bool blockable)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	atomic_inc(&hmm->sequence);

	return 0;
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.release		= hmm_release,
	.invalidate_range_start	= hmm_invalidate_range_start,
	.invalidate_range_end	= hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 *
 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	/* Sanity check */
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

again:
	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	if (mirror->hmm->mm == NULL) {
		/*
		 * A racing hmm_mirror_unregister() is about to destroy the hmm
		 * struct. Try again to allocate a new one.
		 */
		up_write(&mirror->hmm->mirrors_sem);
		mirror->hmm = NULL;
		goto again;
	} else {
		list_add(&mirror->list, &mirror->hmm->mirrors);
		up_write(&mirror->hmm->mirrors_sem);
	}

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
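
/*
 * Illustrative sketch, not part of this file: a driver typically embeds
 * struct hmm_mirror in its own per-process structure and registers it with
 * mmap_sem held in write mode. Every "mydev_" name below is hypothetical;
 * only hmm_mirror_register()/hmm_mirror_unregister() and the hmm_mirror_ops
 * callback signature come from the HMM API.
 *
 *   static void mydev_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *						  enum hmm_update_type update,
 *						  unsigned long start,
 *						  unsigned long end)
 *   {
 *	struct mydev_mm *mm = container_of(mirror, struct mydev_mm, mirror);
 *
 *	// Tear down the device page table for [start, end) so the next
 *	// device access faults and re-snapshots the CPU page table.
 *	mydev_invalidate_range(mm, start, end);
 *   }
 *
 *   static const struct hmm_mirror_ops mydev_mirror_ops = {
 *	.sync_cpu_device_pagetables = mydev_sync_cpu_device_pagetables,
 *   };
 *
 *   // At bind time, with current->mm->mmap_sem held for writing:
 *   mydev_mm->mirror.ops = &mydev_mirror_ops;
 *   ret = hmm_mirror_register(&mydev_mm->mirror, current->mm);
 *
 * The matching hmm_mirror_unregister() is called when the driver stops
 * mirroring the address space.
 */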

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	bool should_unregister = false;
	struct mm_struct *mm;
	struct hmm *hmm;

	if (mirror->hmm == NULL)
		return;

	hmm = mirror->hmm;
	down_write(&hmm->mirrors_sem);
	list_del_init(&mirror->list);
	should_unregister = list_empty(&hmm->mirrors);
	mirror->hmm = NULL;
	mm = hmm->mm;
	hmm->mm = NULL;
	up_write(&hmm->mirrors_sem);

	if (!should_unregister || mm == NULL)
		return;

	spin_lock(&mm->page_table_lock);
	if (mm->hmm == hmm)
		mm->hmm = NULL;
	spin_unlock(&mm->page_table_lock);

	mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
	kfree(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
	struct hmm_range	*range;
	unsigned long		last;
	bool			fault;
	bool			block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY)
		return -EBUSY;
	if (ret & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EAGAIN;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = range->values[HMM_PFN_ERROR];

	return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Returns: 0 on success, -EAGAIN after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
			      bool fault, bool write_fault,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = range->values[HMM_PFN_NONE];
		if (fault || write_fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, write_fault,
					       &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return (fault || write_fault) ? -EAGAIN : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;

	*fault = *write_fault = false;
	if (!hmm_vma_walk->fault)
		return;

	/* We aren't asked to do anything ... */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;
	/* If this is device memory then only fault if explicitly requested */
	if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
		/* Do we fault on device memory ? */
		if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
			*write_fault = pfns & range->flags[HMM_PFN_WRITE];
			*fault = true;
		}
		return;
	}

	/* If CPU page table is not valid then we need to fault */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
	/* Need to write fault ? */
	if ((pfns & range->flags[HMM_PFN_WRITE]) &&
	    !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
	}
}

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	if (!hmm_vma_walk->fault) {
		*fault = *write_fault = false;
		return;
	}

	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*fault) || (*write_fault))
			return;
	}
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	unsigned long i, npages;
	uint64_t *pfns;

	i = (addr - range->start) >> PAGE_SHIFT;
	npages = (end - addr) >> PAGE_SHIFT;
	pfns = &range->pfns[i];
	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
			     0, &fault, &write_fault);
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	if (pmd_protnone(pmd))
		return 0;
	return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long pfn, npages, i;
	bool fault, write_fault;
	uint64_t cpu_flags;

	npages = (end - addr) >> PAGE_SHIFT;
	cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	pfn = pmd_pfn(pmd) + pte_index(addr);
	for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
		pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
	hmm_vma_walk->last = end;
	return 0;
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	if (pte_none(pte) || !pte_present(pte))
		return 0;
	return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);

	if (pte_none(pte)) {
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry, ignore migration, use
		 * device and report anything else as error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EAGAIN;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	if (fault || write_fault)
		goto fault;

	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = range->pfns;
	unsigned long addr = start, i;
	pte_t *ptep;

	i = (addr - range->start) >> PAGE_SHIFT;

again:
	if (pmd_none(*pmdp))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
		pmd_t pmd;

		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * the mmu_notifier callback.
		 *
		 * So just read the pmd value and check again whether it is
		 * a transparent huge or device mapping, and compute the
		 * corresponding pfn values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
	}

	if (pmd_bad(*pmdp))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
	}
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}

static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = range->values[HMM_PFN_NONE];
}

static void hmm_pfns_special(struct hmm_range *range)
{
	unsigned long addr = range->start, i = 0;

	for (; addr < range->end; addr += PAGE_SIZE, i++)
		range->pfns[i] = range->values[HMM_PFN_SPECIAL];
}

/*
 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
 * @range: range being snapshotted
 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
 *          vma permission, 0 success
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by range struct. See hmm_vma_range_done() for further
 * information.
 *
 * The range struct is initialized here. It tracks the CPU page table, but only
 * if the function returns success (0), in which case the caller must then call
 * hmm_vma_range_done() to stop CPU page table update tracking on this range.
 *
 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
 * MEMORY CORRUPTION ! YOU ARE BETTER OFF CRASHING YOUR COMPUTER THAN CORRUPTING
 * MEMORY !
 */
int hmm_vma_get_pfns(struct hmm_range *range)
{
	struct vm_area_struct *vma = range->vma;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	/* Caller must have registered a mirror, via hmm_mirror_register() ! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
			vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access, either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(range->start, range->end, &mm_walk);
	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);

/*
 * hmm_vma_range_done() - stop tracking changes to CPU page table over a range
 * @range: range being tracked
 * Returns: false if range data has been invalidated, true otherwise
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
 * using the data, or wants to lock updates to the data it got from those
 * functions, it must call the hmm_vma_range_done() function, which will then
 * stop tracking CPU page table updates.
 *
 * Note that the device driver must still implement general CPU page table
 * update tracking either by using hmm_mirror (see hmm_mirror_register()) or
 * by using the mmu_notifier API directly.
 *
 * CPU page table update tracking done through hmm_range is only temporary and
 * to be used while trying to duplicate CPU page table contents for a range of
 * virtual addresses.
 *
 * There are two ways to use this :
 * again:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   trans = device_build_page_table_update_transaction(pfns);
 *   device_page_table_lock();
 *   if (!hmm_vma_range_done(range)) {
 *     device_page_table_unlock();
 *     goto again;
 *   }
 *   device_update_page_table(trans);
 *   device_page_table_unlock();
 *
 * Or:
 *   hmm_vma_get_pfns(range); or hmm_vma_fault(...);
 *   device_page_table_lock();
 *   hmm_vma_range_done(range);
 *   device_update_page_table(pfns);
 *   device_page_table_unlock();
 */
bool hmm_vma_range_done(struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(range->vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);

/*
 * hmm_vma_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and does not drop
 *         mmap_sem)
 * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem has been
 *          dropped)
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 *
 * Expected use pattern:
 * retry:
 *   down_read(&mm->mmap_sem);
 *   // Find vma and address device wants to fault, initialize hmm_pfn_t
 *   // array accordingly
 *   ret = hmm_vma_fault(range, block);
 *   switch (ret) {
 *   case -EAGAIN:
 *     hmm_vma_range_done(range);
 *     // You might want to rate limit or yield to allow any pending
 *     // invalidation to complete, check hmm_vma_range_done() return
 *     // value
 *     goto retry;
 *   case 0:
 *     break;
 *   case -ENOMEM:
 *   case -EINVAL:
 *   case -EPERM:
 *   default:
 *     // Handle error !
 *     up_read(&mm->mmap_sem);
 *     return;
 *   }
 *   // Take device driver lock that serializes device page table update
 *   driver_lock_device_page_table_update();
 *   hmm_vma_range_done(range);
 *   // Commit pfns we got from hmm_vma_fault()
 *   driver_unlock_device_page_table_update();
 *   up_read(&mm->mmap_sem);
 *
 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURNS SUCCESS (0)
 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
 */
int hmm_vma_fault(struct hmm_range *range, bool block)
{
	struct vm_area_struct *vma = range->vma;
	unsigned long start = range->start;
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -ENOMEM;
	}
	/* Caller must have registered a mirror, via hmm_mirror_register() ! */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
			vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access, either. Architectures that
		 * allow write without read access are not supported by HMM,
		 * because operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = true;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, range->end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
			       range->end);
		hmm_vma_range_done(range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
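
/*
 * Illustrative sketch, not part of this file: minimal initialization of a
 * struct hmm_range before calling hmm_vma_fault() or hmm_vma_get_pfns().
 * The flag/value encodings below are hypothetical; HMM only indexes into
 * whatever arrays the driver supplies (see struct hmm_range in
 * include/linux/hmm.h for the full field list, e.g. pfn_shift).
 *
 *   static const uint64_t mydev_flags[HMM_PFN_FLAG_MAX] = {
 *	[HMM_PFN_VALID]		 = 1UL << 0,
 *	[HMM_PFN_WRITE]		 = 1UL << 1,
 *	[HMM_PFN_DEVICE_PRIVATE] = 1UL << 2,
 *   };
 *   static const uint64_t mydev_values[HMM_PFN_VALUE_MAX] = {
 *	[HMM_PFN_ERROR]	  = 1UL << 63,
 *	[HMM_PFN_NONE]	  = 0,
 *	[HMM_PFN_SPECIAL] = 1UL << 62,
 *   };
 *
 *   range.vma    = vma;
 *   range.start  = addr & PAGE_MASK;
 *   range.end    = range.start + PAGE_SIZE;
 *   range.pfns   = pfns;	// one uint64_t per page in [start, end)
 *   range.flags  = mydev_flags;
 *   range.values = mydev_values;
 *   ret = hmm_vma_fault(&range, true);
 */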
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */


#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);

static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_exit(ref);
	devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}

static void hmm_devmem_ref_kill(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_kill(ref);
	wait_for_completion(&devmem->completion);
	devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	page->mapping = NULL;

	devmem->ops->free(devmem, page);
}

static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
	resource_size_t key;

	mutex_lock(&hmm_devmem_lock);
	for (key = resource->start;
	     key <= resource->end;
	     key += PA_SECTION_SIZE)
		radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
	mutex_unlock(&hmm_devmem_lock);
}

static void hmm_devmem_release(struct device *dev, void *data)
{
	struct hmm_devmem *devmem = data;
	struct resource *resource = devmem->resource;
	unsigned long start_pfn, npages;
	struct zone *zone;
	struct page *page;

	if (percpu_ref_tryget_live(&devmem->ref)) {
		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
		percpu_ref_put(&devmem->ref);
	}

	/* pages are dead and unused, undo the arch mapping */
	start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
	npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

	page = pfn_to_page(start_pfn);
	zone = page_zone(page);

	mem_hotplug_begin();
	if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
		__remove_pages(zone, start_pfn, npages, NULL);
	else
		arch_remove_memory(start_pfn << PAGE_SHIFT,
				   npages << PAGE_SHIFT, NULL);
	mem_hotplug_done();

	hmm_devmem_radix_release(resource);
}

static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
	resource_size_t key, align_start, align_size, align_end;
	struct device *device = devmem->device;
	int ret, nid, is_ram;
	unsigned long pfn;

	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(devmem->resource->start +
			   resource_size(devmem->resource),
			   PA_SECTION_SIZE) - align_start;

	is_ram = region_intersects(align_start, align_size,
				   IORESOURCE_SYSTEM_RAM,
				   IORES_DESC_NONE);
	if (is_ram == REGION_MIXED) {
		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
			  __func__, devmem->resource);
		return -ENXIO;
	}
	if (is_ram == REGION_INTERSECTS)
		return -ENXIO;

	if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
		devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	else
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_fault = hmm_devmem_fault;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.dev = devmem->device;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;

	mutex_lock(&hmm_devmem_lock);
	align_end = align_start + align_size - 1;
	for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
		struct hmm_devmem *dup;

		dup = radix_tree_lookup(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT);
		if (dup) {
			dev_err(device, "%s: collides with mapping for %s\n",
				__func__, dev_name(dup->device));
			mutex_unlock(&hmm_devmem_lock);
			ret = -EBUSY;
			goto error;
		}
		ret = radix_tree_insert(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT,
					devmem);
		if (ret) {
			dev_err(device, "%s: failed: %d\n", __func__, ret);
			mutex_unlock(&hmm_devmem_lock);
			goto error_radix;
		}
	}
	mutex_unlock(&hmm_devmem_lock);

	nid = dev_to_node(device);
	if (nid < 0)
		nid = numa_mem_id();

	mem_hotplug_begin();
	/*
	 * For device private memory we call add_pages() as we only need to
	 * allocate and initialize struct page for the device memory. More-
	 * over the device memory is inaccessible, thus we do not want to
	 * create a linear mapping for the memory like arch_add_memory()
	 * would do.
	 *
	 * For device public memory, which is accessible by the CPU, we do
	 * want the linear mapping and thus use arch_add_memory().
	 */
	if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
		ret = arch_add_memory(nid, align_start, align_size, NULL,
				      false);
	else
		ret = add_pages(nid, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, NULL, false);
	if (ret) {
		mem_hotplug_done();
		goto error_add_memory;
	}
	move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
			       align_start >> PAGE_SHIFT,
			       align_size >> PAGE_SHIFT, NULL);
	mem_hotplug_done();

	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->pgmap = &devmem->pagemap;
	}
	return 0;

error_add_memory:
	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
	hmm_devmem_radix_release(devmem->resource);
error:
	return ret;
}

static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
{
	struct hmm_devmem *devmem = data;

	return devmem->resource == match_data;
}

static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
{
	devres_release(devmem->device, &hmm_devmem_release,
		       &hmm_devmem_match, devmem->resource);
}

/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource too
 * @size: size in bytes of the device memory to add
 * Returns: a new hmm_devmem struct or negative error code (with ERR_PTR())
 *
 * This function first finds an empty range of physical address big enough to
 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
 * in turn allocates struct pages. It does not do anything beyond that; all
 * events affecting the memory will go through the various callbacks provided
 * by the hmm_devmem_ops struct.
 */
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	int ret;

	dev_pagemap_get_ops();

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;

	/*
	 * Search for an empty physical address range big enough to hold the
	 * new resource, walking down from the top of the iomem resource
	 * space in size-aligned steps.
	 */
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource) {
			ret = -ENOMEM;
			goto error_no_resource;
		}
		break;
	}
	if (!devmem->resource) {
		ret = -ERANGE;
		goto error_no_resource;
	}

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_pages;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_pages:
	devm_release_mem_region(device, devmem->resource->start,
				resource_size(devmem->resource));
error_no_resource:
error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add);
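
/*
 * Illustrative sketch, not part of this file: hotplugging 64MB of device
 * private memory at probe time. The "mydev_" callbacks are hypothetical;
 * their required signatures are given by struct hmm_devmem_ops.
 *
 *   static const struct hmm_devmem_ops mydev_devmem_ops = {
 *	.free  = mydev_devmem_free,
 *	.fault = mydev_devmem_fault,
 *   };
 *
 *   devmem = hmm_devmem_add(&mydev_devmem_ops, &pdev->dev, 64UL << 20);
 *   if (IS_ERR(devmem))
 *	return PTR_ERR(devmem);
 *   // Device pages now span devmem->pfn_first .. devmem->pfn_last - 1 and
 *   // are handed back to the driver through the free/fault callbacks.
 */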

struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	dev_pagemap_get_ops();

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_devm_add_action;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add_resource);

/*
 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
 *
 * @devmem: hmm_devmem struct used to track and manage the ZONE_DEVICE memory
 *
 * This will hot-remove memory that was hotplugged by hmm_devmem_add() on
 * behalf of the device driver. It will free struct page and remove the
 * resource that reserved the physical address range for this device memory.
 */
void hmm_devmem_remove(struct hmm_devmem *devmem)
{
	resource_size_t start, size;
	struct device *device;
	bool cdm = false;

	if (!devmem)
		return;

	device = devmem->device;
	start = devmem->resource->start;
	size = resource_size(devmem->resource);

	cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
	hmm_devmem_pages_remove(devmem);

	if (!cdm)
		devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);

/*
 * A device driver that wants to handle multiple devices memory through a
 * single fake device can use hmm_device to do so. This is purely a helper
 * and it is not needed to make it a standalone driver.
 */
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);
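
/*
 * Illustrative sketch, not part of this file: a driver that wants one fake
 * device to stand in for several devices' memory. "mydrv" is hypothetical
 * driver data.
 *
 *   hmm_device = hmm_device_new(mydrv);
 *   if (IS_ERR(hmm_device))
 *	return PTR_ERR(hmm_device);
 *   ...
 *   hmm_device_put(hmm_device);	// drops the ref; release frees it
 */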

static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) */