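/*
 * Heterogeneous Memory Management (HMM).
 *
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */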
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)

#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
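/*
 * Device memory support is gated by this static key; it is enabled on the
 * first call to hmm_devmem_add() or hmm_devmem_add_resource().
 */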
DEFINE_STATIC_KEY_FALSE(device_private_key);
EXPORT_SYMBOL(device_private_key);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */

#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
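
/*
 * struct hmm - per mm_struct HMM state
 *
 * @mm: mm_struct this HMM struct is bound to
 * @lock: protects the ranges list
 * @sequence: counter of CPU page table invalidations
 * @ranges: list of struct hmm_range currently being snapshotted or faulted
 * @mirrors: list of registered struct hmm_mirror
 * @mmu_notifier: notifier used to track CPU page table updates
 * @mirrors_sem: protects the mirrors list
 */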
struct hmm {
	struct mm_struct *mm;
	spinlock_t lock;
	atomic_t sequence;
	struct list_head ranges;
	struct list_head mirrors;
	struct mmu_notifier mmu_notifier;
	struct rw_semaphore mirrors_sem;
};
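
/*
 * hmm_register() - allocate and register HMM state for an mm on first use
 *
 * Returns the mm's struct hmm on success, or NULL if allocation or mmu
 * notifier registration fails. This is an HMM internal helper.
 */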
static struct hmm *hmm_register(struct mm_struct *mm)
{
	struct hmm *hmm = READ_ONCE(mm->hmm);
	bool cleanup = false;
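
	/*
	 * An hmm struct stays around for as long as its mm_struct (it is
	 * only freed from hmm_mm_destroy()), so an existing one can simply
	 * be reused.
	 */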
	if (hmm)
		return hmm;

	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
	if (!hmm)
		return NULL;
	INIT_LIST_HEAD(&hmm->mirrors);
	init_rwsem(&hmm->mirrors_sem);
	atomic_set(&hmm->sequence, 0);
	hmm->mmu_notifier.ops = NULL;
	INIT_LIST_HEAD(&hmm->ranges);
	spin_lock_init(&hmm->lock);
	hmm->mm = mm;
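
	/*
	 * Register the mmu notifier first, then publish the new hmm struct
	 * in mm->hmm under the page table lock. If another thread installed
	 * one in the meantime, undo the registration and free our copy.
	 */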
	hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
	if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
		kfree(hmm);
		return NULL;
	}

	spin_lock(&mm->page_table_lock);
	if (!mm->hmm)
		mm->hmm = hmm;
	else
		cleanup = true;
	spin_unlock(&mm->page_table_lock);

	if (cleanup) {
		mmu_notifier_unregister(&hmm->mmu_notifier, mm);
		kfree(hmm);
	}

	return mm->hmm;
}

void hmm_mm_destroy(struct mm_struct *mm)
{
	kfree(mm->hmm);
}

static void hmm_invalidate_range(struct hmm *hmm,
				 enum hmm_update_type action,
				 unsigned long start,
				 unsigned long end)
{
	struct hmm_mirror *mirror;
	struct hmm_range *range;

	spin_lock(&hmm->lock);
	list_for_each_entry(range, &hmm->ranges, list) {
		unsigned long addr, idx, npages;

		if (end < range->start || start >= range->end)
			continue;

		range->valid = false;
		addr = max(start, range->start);
		idx = (addr - range->start) >> PAGE_SHIFT;
		npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
		memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
	}
	spin_unlock(&hmm->lock);

	down_read(&hmm->mirrors_sem);
	list_for_each_entry(mirror, &hmm->mirrors, list)
		mirror->ops->sync_cpu_device_pagetables(mirror, action,
							start, end);
	up_read(&hmm->mirrors_sem);
}

static void hmm_invalidate_range_start(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	atomic_inc(&hmm->sequence);
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start,
				     unsigned long end)
{
	struct hmm *hmm = mm->hmm;

	VM_BUG_ON(!hmm);

	hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
	.invalidate_range_start = hmm_invalidate_range_start,
	.invalidate_range_end = hmm_invalidate_range_end,
};
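
/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 *
 * A device driver registers a mirror in order to mirror a process address
 * space; once registered, the mirror's sync_cpu_device_pagetables() callback
 * is invoked for every CPU page table invalidation on that mm.
 */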
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	if (!mm || !mirror || !mirror->ops)
		return -EINVAL;

	mirror->hmm = hmm_register(mm);
	if (!mirror->hmm)
		return -ENOMEM;

	down_write(&mirror->hmm->mirrors_sem);
	list_add(&mirror->list, &mirror->hmm->mirrors);
	up_write(&mirror->hmm->mirrors_sem);

	return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
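
/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stops mirroring the process address space by removing the mirror from its
 * hmm's mirror list.
 */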
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
	struct hmm *hmm = mirror->hmm;

	down_write(&hmm->mirrors_sem);
	list_del(&mirror->list);
	up_write(&hmm->mirrors_sem);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
	struct hmm_range *range;
	unsigned long last;
	bool fault;
	bool block;
	bool write;
};

static int hmm_vma_do_fault(struct mm_walk *walk,
			    unsigned long addr,
			    hmm_pfn_t *pfn)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct vm_area_struct *vma = walk->vma;
	int r;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
	r = handle_mm_fault(vma, addr, flags);
	if (r & VM_FAULT_RETRY)
		return -EBUSY;
	if (r & VM_FAULT_ERROR) {
		*pfn = HMM_PFN_ERROR;
		return -EFAULT;
	}

	return -EAGAIN;
}

static void hmm_pfns_special(hmm_pfn_t *pfns,
			     unsigned long addr,
			     unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = HMM_PFN_SPECIAL;
}

static int hmm_pfns_bad(unsigned long addr,
			unsigned long end,
			struct mm_walk *walk)
{
	struct hmm_range *range = walk->private;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++)
		pfns[i] = HMM_PFN_ERROR;

	return 0;
}

static void hmm_pfns_clear(hmm_pfn_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	for (; addr < end; addr += PAGE_SIZE, pfns++)
		*pfns = 0;
}

static int hmm_vma_walk_hole(unsigned long addr,
			     unsigned long end,
			     struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = HMM_PFN_EMPTY;
		if (hmm_vma_walk->fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_clear(unsigned long addr,
			      unsigned long end,
			      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long i;

	hmm_vma_walk->last = addr;
	i = (addr - range->start) >> PAGE_SHIFT;
	for (; addr < end; addr += PAGE_SIZE, i++) {
		pfns[i] = 0;
		if (hmm_vma_walk->fault) {
			int ret;

			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
			if (ret != -EAGAIN)
				return ret;
		}
	}

	return hmm_vma_walk->fault ? -EAGAIN : 0;
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	hmm_pfn_t *pfns = range->pfns;
	unsigned long addr = start, i;
	bool write_fault;
	hmm_pfn_t flag;
	pte_t *ptep;

	i = (addr - range->start) >> PAGE_SHIFT;
	flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0;
	write_fault = hmm_vma_walk->fault & hmm_vma_walk->write;

again:
	if (pmd_none(*pmdp))
		return hmm_vma_walk_hole(start, end, walk);

	if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB)
		return hmm_pfns_bad(start, end, walk);

	if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
		unsigned long pfn;
		pmd_t pmd;
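
		/*
		 * Re-read the pmd atomically: a transparent huge pmd can be
		 * split from under us, so check that it is still a huge or
		 * device mapping before using it and retry the walk
		 * otherwise.
		 */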
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;
		if (pmd_protnone(pmd))
			return hmm_vma_walk_clear(start, end, walk);

		if (write_fault && !pmd_write(pmd))
			return hmm_vma_walk_clear(start, end, walk);

		pfn = pmd_pfn(pmd) + pte_index(addr);
		flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
		for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
			pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag;
		return 0;
	}

	if (pmd_bad(*pmdp))
		return hmm_pfns_bad(start, end, walk);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
		pte_t pte = *ptep;

		pfns[i] = 0;

		if (pte_none(pte)) {
			pfns[i] = HMM_PFN_EMPTY;
			if (hmm_vma_walk->fault)
				goto fault;
			continue;
		}

		if (!pte_present(pte)) {
			swp_entry_t entry = pte_to_swp_entry(pte);

			if (!non_swap_entry(entry)) {
				if (hmm_vma_walk->fault)
					goto fault;
				continue;
			}
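
			/*
			 * Non swap entry: device private entries are reported
			 * with their device page frame, migration entries are
			 * waited on (when faulting), anything else is an
			 * error.
			 */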
			if (is_device_private_entry(entry)) {
				pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry));
				if (is_write_device_private_entry(entry)) {
					pfns[i] |= HMM_PFN_WRITE;
				} else if (write_fault)
					goto fault;
				pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE;
				pfns[i] |= flag;
			} else if (is_migration_entry(entry)) {
				if (hmm_vma_walk->fault) {
					pte_unmap(ptep);
					hmm_vma_walk->last = addr;
					migration_entry_wait(vma->vm_mm,
							     pmdp, addr);
					return -EAGAIN;
				}
				continue;
			} else {
				pfns[i] = HMM_PFN_ERROR;
			}
			continue;
		}

		if (write_fault && !pte_write(pte))
			goto fault;

		pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag;
		pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0;
		continue;

fault:
		pte_unmap(ptep);
		return hmm_vma_walk_clear(start, end, walk);
	}
	pte_unmap(ptep - 1);

	return 0;
}
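
/*
 * hmm_vma_get_pfns() - snapshot the CPU page table for a range of virtual
 * addresses
 *
 * @vma: virtual memory area containing the range
 * @range: range being snapshotted, also used to track CPU page table updates
 * @start: range start address (inclusive)
 * @end: range end address (exclusive)
 * @pfns: array of pfns, one entry per page in the range
 *
 * Returns 0 on success or a negative error code. The snapshot must be
 * validated with hmm_vma_range_done() before the pfns array is used, since a
 * concurrent CPU page table update can invalidate it at any time.
 */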
int hmm_vma_get_pfns(struct vm_area_struct *vma,
		     struct hmm_range *range,
		     unsigned long start,
		     unsigned long end,
		     hmm_pfn_t *pfns)
{
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;

	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(pfns, start, end);
		return -EINVAL;
	}

	if (start < vma->vm_start || start >= vma->vm_end)
		return -EINVAL;
	if (end < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm)
		return -ENOMEM;
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	range->start = start;
	range->pfns = pfns;
	range->end = end;
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	hmm_vma_walk.fault = false;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	walk_page_range(start, end, &mm_walk);
	return 0;
}
EXPORT_SYMBOL(hmm_vma_get_pfns);
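
/*
 * hmm_vma_range_done() - stop tracking a range and check snapshot validity
 *
 * @vma: virtual memory area containing the range
 * @range: range being tracked
 *
 * Removes the range from the list of tracked ranges and returns whether the
 * snapshot taken by hmm_vma_get_pfns() or hmm_vma_fault() is still valid,
 * i.e. whether no CPU page table invalidation touched it in the meantime.
 * If this returns false, the pfns array must not be used.
 */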
bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	struct hmm *hmm;

	if (range->end <= range->start) {
		BUG();
		return false;
	}

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		memset(range->pfns, 0, sizeof(*range->pfns) * npages);
		return false;
	}

	spin_lock(&hmm->lock);
	list_del_rcu(&range->list);
	spin_unlock(&hmm->lock);

	return range->valid;
}
EXPORT_SYMBOL(hmm_vma_range_done);
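
/*
 * hmm_vma_fault() - fault in a range of virtual addresses and snapshot it
 *
 * @vma: virtual memory area containing the range
 * @range: range being faulted, also used to track CPU page table updates
 * @start: range start address (inclusive)
 * @end: range end address (exclusive)
 * @pfns: array of pfns, one entry per page in the range
 * @write: whether pages must be faulted for write access
 * @block: whether it is allowed to block while faulting
 *
 * Returns 0 on success or a negative error code from the fault path. As with
 * hmm_vma_get_pfns(), the result must be validated with hmm_vma_range_done()
 * before the pfns array is used.
 */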
int hmm_vma_fault(struct vm_area_struct *vma,
		  struct hmm_range *range,
		  unsigned long start,
		  unsigned long end,
		  hmm_pfn_t *pfns,
		  bool write,
		  bool block)
{
	struct hmm_vma_walk hmm_vma_walk;
	struct mm_walk mm_walk;
	struct hmm *hmm;
	int ret;

	if (start < vma->vm_start || start >= vma->vm_end)
		return -EINVAL;
	if (end < vma->vm_start || end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(pfns, start, end);
		return -ENOMEM;
	}

	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	range->start = start;
	range->pfns = pfns;
	range->end = end;
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) {
		hmm_pfns_special(pfns, start, end);
		return 0;
	}

	hmm_vma_walk.fault = true;
	hmm_vma_walk.write = write;
	hmm_vma_walk.block = block;
	hmm_vma_walk.range = range;
	mm_walk.private = &hmm_vma_walk;
	hmm_vma_walk.last = range->start;

	mm_walk.vma = vma;
	mm_walk.mm = vma->vm_mm;
	mm_walk.pte_entry = NULL;
	mm_walk.test_walk = NULL;
	mm_walk.hugetlb_entry = NULL;
	mm_walk.pmd_entry = hmm_vma_walk_pmd;
	mm_walk.pte_hole = hmm_vma_walk_hole;

	do {
		ret = walk_page_range(start, end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end);
		hmm_vma_range_done(vma, range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (!page)
		return NULL;
	lock_page(page);
	return page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);

static void hmm_devmem_ref_release(struct percpu_ref *ref)
{
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	complete(&devmem->completion);
}

static void hmm_devmem_ref_exit(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_exit(ref);
	devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}

static void hmm_devmem_ref_kill(void *data)
{
	struct percpu_ref *ref = data;
	struct hmm_devmem *devmem;

	devmem = container_of(ref, struct hmm_devmem, ref);
	percpu_ref_kill(ref);
	wait_for_completion(&devmem->completion);
	devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}

static int hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *devmem = page->pgmap->data;

	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
}

static void hmm_devmem_free(struct page *page, void *data)
{
	struct hmm_devmem *devmem = data;

	devmem->ops->free(devmem, page);
}

static DEFINE_MUTEX(hmm_devmem_lock);
static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);

static void hmm_devmem_radix_release(struct resource *resource)
{
	resource_size_t key, align_start, align_size;

	align_start = resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);

	mutex_lock(&hmm_devmem_lock);
	for (key = resource->start;
	     key <= resource->end;
	     key += PA_SECTION_SIZE)
		radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
	mutex_unlock(&hmm_devmem_lock);
}

static void hmm_devmem_release(struct device *dev, void *data)
{
	struct hmm_devmem *devmem = data;
	struct resource *resource = devmem->resource;
	unsigned long start_pfn, npages;
	struct zone *zone;
	struct page *page;

	if (percpu_ref_tryget_live(&devmem->ref)) {
		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
		percpu_ref_put(&devmem->ref);
	}
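
	/* The device page mapping is dead; undo the memory hotplug. */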
	start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
	npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;

	page = pfn_to_page(start_pfn);
	zone = page_zone(page);

	mem_hotplug_begin();
	if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
		__remove_pages(zone, start_pfn, npages, NULL);
	else
		arch_remove_memory(start_pfn << PAGE_SHIFT,
				   npages << PAGE_SHIFT, NULL);
	mem_hotplug_done();

	hmm_devmem_radix_release(resource);
}

static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
{
	WARN_ON_ONCE(!rcu_read_lock_held());

	return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
}

static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
{
	resource_size_t key, align_start, align_size, align_end;
	struct device *device = devmem->device;
	int ret, nid, is_ram;
	unsigned long pfn;

	align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
	align_size = ALIGN(devmem->resource->start +
			   resource_size(devmem->resource),
			   PA_SECTION_SIZE) - align_start;

	is_ram = region_intersects(align_start, align_size,
				   IORESOURCE_SYSTEM_RAM,
				   IORES_DESC_NONE);
	if (is_ram == REGION_MIXED) {
		WARN_ONCE(1, "%s attempted on mixed region %pr\n",
			  __func__, devmem->resource);
		return -ENXIO;
	}
	if (is_ram == REGION_INTERSECTS)
		return -ENXIO;

	if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
		devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
	else
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

	devmem->pagemap.res = *devmem->resource;
	devmem->pagemap.page_fault = hmm_devmem_fault;
	devmem->pagemap.page_free = hmm_devmem_free;
	devmem->pagemap.dev = devmem->device;
	devmem->pagemap.ref = &devmem->ref;
	devmem->pagemap.data = devmem;

	mutex_lock(&hmm_devmem_lock);
	align_end = align_start + align_size - 1;
	for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
		struct hmm_devmem *dup;

		rcu_read_lock();
		dup = hmm_devmem_find(key);
		rcu_read_unlock();
		if (dup) {
			dev_err(device, "%s: collides with mapping for %s\n",
				__func__, dev_name(dup->device));
			mutex_unlock(&hmm_devmem_lock);
			ret = -EBUSY;
			goto error;
		}
		ret = radix_tree_insert(&hmm_devmem_radix,
					key >> PA_SECTION_SHIFT,
					devmem);
		if (ret) {
			dev_err(device, "%s: failed: %d\n", __func__, ret);
			mutex_unlock(&hmm_devmem_lock);
			goto error_radix;
		}
	}
	mutex_unlock(&hmm_devmem_lock);

	nid = dev_to_node(device);
	if (nid < 0)
		nid = numa_mem_id();

	mem_hotplug_begin();
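	/*
	 * Device public memory is addressable by the CPU and is hotplugged
	 * like ordinary memory through arch_add_memory(), while device
	 * private (unaddressable) memory only needs struct pages, hence
	 * add_pages().
	 */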
	if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
		ret = arch_add_memory(nid, align_start, align_size, NULL,
				      false);
	else
		ret = add_pages(nid, align_start >> PAGE_SHIFT,
				align_size >> PAGE_SHIFT, NULL, false);
	if (ret) {
		mem_hotplug_done();
		goto error_add_memory;
	}
	move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
			       align_start >> PAGE_SHIFT,
			       align_size >> PAGE_SHIFT, NULL);
	mem_hotplug_done();

	for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->pgmap = &devmem->pagemap;
	}
	return 0;

error_add_memory:
	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
error_radix:
	hmm_devmem_radix_release(devmem->resource);
error:
	return ret;
}

static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
{
	struct hmm_devmem *devmem = data;

	return devmem->resource == match_data;
}

static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
{
	devres_release(devmem->device, &hmm_devmem_release,
		       &hmm_devmem_match, devmem->resource);
}
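
/*
 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device private memory
 *
 * @ops: memory event device driver callback (see struct hmm_devmem_ops)
 * @device: device struct to bind the resource to
 * @size: size in bytes of the device memory to add
 *
 * Finds an unused range of the physical address space, claims it as a
 * device private resource and creates struct pages for it. Returns the new
 * hmm_devmem struct on success or an ERR_PTR() on failure.
 */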
struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
				  struct device *device,
				  unsigned long size)
{
	struct hmm_devmem *devmem;
	resource_size_t addr;
	int ret;

	static_branch_enable(&device_private_key);

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = NULL;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	size = ALIGN(size, PA_SECTION_SIZE);
	addr = min((unsigned long)iomem_resource.end,
		   (1UL << MAX_PHYSMEM_BITS) - 1);
	addr = addr - size + 1UL;
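
	/*
	 * Scan the physical address space from the top down for a range of
	 * size bytes that does not intersect any existing resource, and
	 * claim it for the device private memory.
	 */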
	for (; addr > size && addr >= iomem_resource.start; addr -= size) {
		ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
		if (ret != REGION_DISJOINT)
			continue;

		devmem->resource = devm_request_mem_region(device, addr, size,
							   dev_name(device));
		if (!devmem->resource) {
			ret = -ENOMEM;
			goto error_no_resource;
		}
		break;
	}
	if (!devmem->resource) {
		ret = -ERANGE;
		goto error_no_resource;
	}

	devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_pages;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_pages:
	devm_release_mem_region(device, devmem->resource->start,
				resource_size(devmem->resource));
error_no_resource:
error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add);

struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
					   struct device *device,
					   struct resource *res)
{
	struct hmm_devmem *devmem;
	int ret;

	if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
		return ERR_PTR(-EINVAL);

	static_branch_enable(&device_private_key);

	devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
				   GFP_KERNEL, dev_to_node(device));
	if (!devmem)
		return ERR_PTR(-ENOMEM);

	init_completion(&devmem->completion);
	devmem->pfn_first = -1UL;
	devmem->pfn_last = -1UL;
	devmem->resource = res;
	devmem->device = device;
	devmem->ops = ops;

	ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
			      0, GFP_KERNEL);
	if (ret)
		goto error_percpu_ref;

	ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
	if (ret)
		goto error_devm_add_action;

	devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
	devmem->pfn_last = devmem->pfn_first +
			   (resource_size(devmem->resource) >> PAGE_SHIFT);

	ret = hmm_devmem_pages_create(devmem);
	if (ret)
		goto error_devm_add_action;

	devres_add(device, devmem);

	ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
	if (ret) {
		hmm_devmem_remove(devmem);
		return ERR_PTR(ret);
	}

	return devmem;

error_devm_add_action:
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
error_percpu_ref:
	devres_free(devmem);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(hmm_devmem_add_resource);
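
/*
 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
 *
 * @devmem: hmm_devmem struct to remove
 *
 * Waits for all outstanding page references to be dropped, removes the
 * struct pages and, for device private memory, releases the backing
 * resource.
 */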
void hmm_devmem_remove(struct hmm_devmem *devmem)
{
	resource_size_t start, size;
	struct device *device;
	bool cdm = false;

	if (!devmem)
		return;

	device = devmem->device;
	start = devmem->resource->start;
	size = resource_size(devmem->resource);

	cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
	hmm_devmem_ref_kill(&devmem->ref);
	hmm_devmem_ref_exit(&devmem->ref);
	hmm_devmem_pages_remove(devmem);

	if (!cdm)
		devm_release_mem_region(device, start, size);
}
EXPORT_SYMBOL(hmm_devmem_remove);
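
/*
 * Helper for drivers that want to expose device memory through a character
 * device: hmm_device_new()/hmm_device_put() only manage a minor number and
 * an embedded struct device, nothing more.
 */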
#define HMM_DEVICE_MAX 256

static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
static DEFINE_SPINLOCK(hmm_device_lock);
static struct class *hmm_device_class;
static dev_t hmm_device_devt;

static void hmm_device_release(struct device *device)
{
	struct hmm_device *hmm_device;

	hmm_device = container_of(device, struct hmm_device, device);
	spin_lock(&hmm_device_lock);
	clear_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	kfree(hmm_device);
}

struct hmm_device *hmm_device_new(void *drvdata)
{
	struct hmm_device *hmm_device;

	hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
	if (!hmm_device)
		return ERR_PTR(-ENOMEM);

	spin_lock(&hmm_device_lock);
	hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
	if (hmm_device->minor >= HMM_DEVICE_MAX) {
		spin_unlock(&hmm_device_lock);
		kfree(hmm_device);
		return ERR_PTR(-EBUSY);
	}
	set_bit(hmm_device->minor, hmm_device_mask);
	spin_unlock(&hmm_device_lock);

	dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
	hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
					hmm_device->minor);
	hmm_device->device.release = hmm_device_release;
	dev_set_drvdata(&hmm_device->device, drvdata);
	hmm_device->device.class = hmm_device_class;
	device_initialize(&hmm_device->device);

	return hmm_device;
}
EXPORT_SYMBOL(hmm_device_new);

void hmm_device_put(struct hmm_device *hmm_device)
{
	put_device(&hmm_device->device);
}
EXPORT_SYMBOL(hmm_device_put);

static int __init hmm_init(void)
{
	int ret;

	ret = alloc_chrdev_region(&hmm_device_devt, 0,
				  HMM_DEVICE_MAX,
				  "hmm_device");
	if (ret)
		return ret;

	hmm_device_class = class_create(THIS_MODULE, "hmm_device");
	if (IS_ERR(hmm_device_class)) {
		unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
		return PTR_ERR(hmm_device_class);
	}
	return 0;
}

device_initcall(hmm_init);
#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */