// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
6#include <linux/list.h>
7#include <linux/init.h>
8#include <linux/mm.h>
9#include <linux/seq_file.h>
10#include <linux/sysctl.h>
11#include <linux/highmem.h>
12#include <linux/mmu_notifier.h>
13#include <linux/nodemask.h>
14#include <linux/pagemap.h>
15#include <linux/mempolicy.h>
16#include <linux/compiler.h>
17#include <linux/cpuset.h>
18#include <linux/mutex.h>
19#include <linux/memblock.h>
20#include <linux/sysfs.h>
21#include <linux/slab.h>
22#include <linux/sched/mm.h>
23#include <linux/mmdebug.h>
24#include <linux/sched/signal.h>
25#include <linux/rmap.h>
26#include <linux/string_helpers.h>
27#include <linux/swap.h>
28#include <linux/swapops.h>
29#include <linux/jhash.h>
30#include <linux/numa.h>
31#include <linux/llist.h>
32#include <linux/cma.h>
33#include <linux/migrate.h>
34#include <linux/nospec.h>
35#include <linux/delayacct.h>
36
37#include <asm/page.h>
38#include <asm/pgalloc.h>
39#include <asm/tlb.h>
40
41#include <linux/io.h>
42#include <linux/hugetlb.h>
43#include <linux/hugetlb_cgroup.h>
44#include <linux/node.h>
45#include <linux/page_owner.h>
46#include "internal.h"
47#include "hugetlb_vmemmap.h"
48
49int hugetlb_max_hstate __read_mostly;
50unsigned int default_hstate_idx;
51struct hstate hstates[HUGE_MAX_HSTATE];
52
53#ifdef CONFIG_CMA
54static struct cma *hugetlb_cma[MAX_NUMNODES];
55static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
56static bool hugetlb_cma_page(struct page *page, unsigned int order)
57{
58 return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
59 1 << order);
60}
61#else
62static bool hugetlb_cma_page(struct page *page, unsigned int order)
63{
64 return false;
65}
66#endif
67static unsigned long hugetlb_cma_size __initdata;

/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
73static unsigned int minimum_order __read_mostly = UINT_MAX;
74
75__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
78static struct hstate * __initdata parsed_hstate;
79static unsigned long __initdata default_hstate_max_huge_pages;
80static bool __initdata parsed_valid_hugepagesz = true;
81static bool __initdata parsed_default_hugepagesz;
82static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
88DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
94static int num_fault_mutexes;
95struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
98static int hugetlb_acct_memory(struct hstate *h, long delta);
99
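/*
 * A subpool is considered free when it holds no references, no pages are
 * charged against its maximum, and its minimum-size reserve (if any) is
 * fully replenished.
 */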
100static inline bool subpool_is_free(struct hugepage_subpool *spool)
101{
102 if (spool->count)
103 return false;
104 if (spool->max_hpages != -1)
105 return spool->used_hpages == 0;
106 if (spool->min_hpages != -1)
107 return spool->rsv_hpages == spool->min_hpages;
108
109 return true;
110}
111
112static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
113 unsigned long irq_flags)
114{
115 spin_unlock_irqrestore(&spool->lock, irq_flags);

 /*
  * If no pages are used and no other handles to the subpool remain,
  * give up any minimum-size reservations and free the subpool.
  */
120 if (subpool_is_free(spool)) {
121 if (spool->min_hpages != -1)
122 hugetlb_acct_memory(spool->hstate,
123 -spool->min_hpages);
124 kfree(spool);
125 }
126}
127
128struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
129 long min_hpages)
130{
131 struct hugepage_subpool *spool;
132
133 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
134 if (!spool)
135 return NULL;
136
137 spin_lock_init(&spool->lock);
138 spool->count = 1;
139 spool->max_hpages = max_hpages;
140 spool->hstate = h;
141 spool->min_hpages = min_hpages;
142
143 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
144 kfree(spool);
145 return NULL;
146 }
147 spool->rsv_hpages = min_hpages;
148
149 return spool;
150}
151
152void hugepage_put_subpool(struct hugepage_subpool *spool)
153{
154 unsigned long flags;
155
156 spin_lock_irqsave(&spool->lock, flags);
157 BUG_ON(!spool->count);
158 spool->count--;
159 unlock_or_release_subpool(spool, flags);
160}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the request.
 * Otherwise, return the number of pages by which the global pools must be
 * adjusted (upward).  The returned value may only differ from the passed
 * value (delta) when a subpool minimum size must be maintained.
 */
170static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
171 long delta)
172{
173 long ret = delta;
174
175 if (!spool)
176 return ret;
177
178 spin_lock_irq(&spool->lock);
179
180 if (spool->max_hpages != -1) {
181 if ((spool->used_hpages + delta) <= spool->max_hpages)
182 spool->used_hpages += delta;
183 else {
184 ret = -ENOMEM;
185 goto unlock_ret;
186 }
187 }

 /* minimum size accounting */
190 if (spool->min_hpages != -1 && spool->rsv_hpages) {
191 if (delta > spool->rsv_hpages) {
192
193
194
195
196 ret = delta - spool->rsv_hpages;
197 spool->rsv_hpages = 0;
198 } else {
199 ret = 0;
200 spool->rsv_hpages -= delta;
201 }
202 }
203
204unlock_ret:
205 spin_unlock_irq(&spool->lock);
206 return ret;
207}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only differ from the passed value (delta) when a
 * subpool minimum size must be maintained.
 */
215static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
216 long delta)
217{
218 long ret = delta;
219 unsigned long flags;
220
221 if (!spool)
222 return delta;
223
224 spin_lock_irqsave(&spool->lock, flags);
225
226 if (spool->max_hpages != -1)
227 spool->used_hpages -= delta;

 /* minimum size accounting */
230 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
231 if (spool->rsv_hpages + delta <= spool->min_hpages)
232 ret = 0;
233 else
234 ret = spool->rsv_hpages + delta - spool->min_hpages;
235
236 spool->rsv_hpages += delta;
237 if (spool->rsv_hpages > spool->min_hpages)
238 spool->rsv_hpages = spool->min_hpages;
239 }

 /*
  * If hugetlbfs_put_super couldn't free spool due to an outstanding
  * quota reference, free it now.
  */
245 unlock_or_release_subpool(spool, flags);
246
247 return ret;
248}
249
250static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
251{
252 return HUGETLBFS_SB(inode->i_sb)->spool;
253}
254
255static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
256{
257 return subpool_inode(file_inode(vma->vm_file));
258}

/*
 * Helper that removes a struct file_region from the cache and returns it
 * for use.
 */
263static struct file_region *
264get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
265{
266 struct file_region *nrg = NULL;
267
268 VM_BUG_ON(resv->region_cache_count <= 0);
269
270 resv->region_cache_count--;
271 nrg = list_first_entry(&resv->region_cache, struct file_region, link);
272 list_del(&nrg->link);
273
274 nrg->from = from;
275 nrg->to = to;
276
277 return nrg;
278}
279
280static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
281 struct file_region *rg)
282{
283#ifdef CONFIG_CGROUP_HUGETLB
284 nrg->reservation_counter = rg->reservation_counter;
285 nrg->css = rg->css;
286 if (rg->css)
287 css_get(rg->css);
288#endif
289}

/* Helper that records hugetlb_cgroup uncharge info. */
292static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
293 struct hstate *h,
294 struct resv_map *resv,
295 struct file_region *nrg)
296{
297#ifdef CONFIG_CGROUP_HUGETLB
298 if (h_cg) {
299 nrg->reservation_counter =
300 &h_cg->rsvd_hugepage[hstate_index(h)];
301 nrg->css = &h_cg->css;

  /*
   * The caller will hold exactly one h_cg->css reference for the
   * whole contiguous reservation region.  But this area might be
   * scattered when there are already some file_regions residing in
   * it.  As a result, many file_regions may share only one css
   * reference.  In order to ensure that one file_region holds
   * exactly one h_cg->css reference, do css_get for each
   * file_region and leave the reference held by the caller when
   * the region is removed.
   */
312 css_get(&h_cg->css);
313 if (!resv->pages_per_hpage)
314 resv->pages_per_hpage = pages_per_huge_page(h);
  /*
   * pages_per_hpage should be consistent for all entries in
   * a resv_map.
   */
318 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
319 } else {
320 nrg->reservation_counter = NULL;
321 nrg->css = NULL;
322 }
323#endif
324}
325
326static void put_uncharge_info(struct file_region *rg)
327{
328#ifdef CONFIG_CGROUP_HUGETLB
329 if (rg->css)
330 css_put(rg->css);
331#endif
332}
333
334static bool has_same_uncharge_info(struct file_region *rg,
335 struct file_region *org)
336{
337#ifdef CONFIG_CGROUP_HUGETLB
338 return rg->reservation_counter == org->reservation_counter &&
339 rg->css == org->css;
340
341#else
342 return true;
343#endif
344}
345
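/*
 * Merge a file_region with its neighbours in the reserve map when their
 * ranges are contiguous and they carry the same cgroup uncharge info.
 */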
346static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
347{
348 struct file_region *nrg = NULL, *prg = NULL;
349
350 prg = list_prev_entry(rg, link);
351 if (&prg->link != &resv->regions && prg->to == rg->from &&
352 has_same_uncharge_info(prg, rg)) {
353 prg->to = rg->to;
354
355 list_del(&rg->link);
356 put_uncharge_info(rg);
357 kfree(rg);
358
359 rg = prg;
360 }
361
362 nrg = list_next_entry(rg, link);
363 if (&nrg->link != &resv->regions && nrg->from == rg->to &&
364 has_same_uncharge_info(nrg, rg)) {
365 nrg->from = rg->from;
366
367 list_del(&rg->link);
368 put_uncharge_info(rg);
369 kfree(rg);
370 }
371}
372
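/*
 * Add a new file_region covering [from, to) after list position rg (taking
 * an entry from the region cache), or, when regions_needed is non-NULL, only
 * count the entry that would be needed.  Returns the number of pages in the
 * range.
 */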
373static inline long
374hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
375 long to, struct hstate *h, struct hugetlb_cgroup *cg,
376 long *regions_needed)
377{
378 struct file_region *nrg;
379
380 if (!regions_needed) {
381 nrg = get_file_region_entry_from_cache(map, from, to);
382 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
383 list_add(&nrg->link, rg);
384 coalesce_file_region(map, nrg);
385 } else
386 *regions_needed += 1;
387
388 return to - from;
389}

/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list.  regions_needed will
 * indicate the number of file_regions needed in the cache to carry out the
 * add for this range.
 */
399static long add_reservation_in_range(struct resv_map *resv, long f, long t,
400 struct hugetlb_cgroup *h_cg,
401 struct hstate *h, long *regions_needed)
402{
403 long add = 0;
404 struct list_head *head = &resv->regions;
405 long last_accounted_offset = f;
406 struct file_region *iter, *trg = NULL;
407 struct list_head *rg = NULL;
408
409 if (regions_needed)
410 *regions_needed = 0;
411
412
413
414
415
416 list_for_each_entry_safe(iter, trg, head, link) {
417
418 if (iter->from < f) {
419
420
421
422 if (iter->to > last_accounted_offset)
423 last_accounted_offset = iter->to;
424 continue;
425 }
426
427
428
429
430 if (iter->from >= t) {
431 rg = iter->link.prev;
432 break;
433 }
434
435
436
437
438 if (iter->from > last_accounted_offset)
439 add += hugetlb_resv_map_add(resv, iter->link.prev,
440 last_accounted_offset,
441 iter->from, h, h_cg,
442 regions_needed);
443
444 last_accounted_offset = iter->to;
445 }
446
447
448
449
450 if (!rg)
451 rg = head->prev;
452 if (last_accounted_offset < t)
453 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
454 t, h, h_cg, regions_needed);
455
456 return add;
457}

/*
 * Must be called with resv->lock held; the lock is dropped and re-acquired
 * around the allocations.
 */
461static int allocate_file_region_entries(struct resv_map *resv,
462 int regions_needed)
463 __must_hold(&resv->lock)
464{
465 struct list_head allocated_regions;
466 int to_allocate = 0, i = 0;
467 struct file_region *trg = NULL, *rg = NULL;
468
469 VM_BUG_ON(regions_needed < 0);
470
471 INIT_LIST_HEAD(&allocated_regions);
472
473
474
475
476
477
478
479
480
481
482 while (resv->region_cache_count <
483 (resv->adds_in_progress + regions_needed)) {
484 to_allocate = resv->adds_in_progress + regions_needed -
485 resv->region_cache_count;
486
487
488
489
490
491 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
492
493 spin_unlock(&resv->lock);
494 for (i = 0; i < to_allocate; i++) {
495 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
496 if (!trg)
497 goto out_of_memory;
498 list_add(&trg->link, &allocated_regions);
499 }
500
501 spin_lock(&resv->lock);
502
503 list_splice(&allocated_regions, &resv->region_cache);
504 resv->region_cache_count += to_allocate;
505 }
506
507 return 0;
508
509out_of_memory:
510 list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
511 list_del(&rg->link);
512 kfree(rg);
513 }
514 return -ENOMEM;
515}

/*
 * Add the huge page range represented by [f, t) to the reserve map.  Regions
 * will be taken from the cache to fill in this range.  Sufficient regions
 * should exist in the cache due to the previous call to region_chg with the
 * same range, but in some cases the cache will not have sufficient entries
 * due to races with other code doing region_add or region_del.  The extra
 * needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is
 * greater than or equal to zero.  If file_region entries needed to be
 * allocated for this operation and the allocation fails, -ENOMEM is returned.
 */
534static long region_add(struct resv_map *resv, long f, long t,
535 long in_regions_needed, struct hstate *h,
536 struct hugetlb_cgroup *h_cg)
537{
538 long add = 0, actual_regions_needed = 0;
539
540 spin_lock(&resv->lock);
541retry:
542
543
544 add_reservation_in_range(resv, f, t, NULL, NULL,
545 &actual_regions_needed);
546
547
548
549
550
551
552
553
554
555
556 if (actual_regions_needed > in_regions_needed &&
557 resv->region_cache_count <
558 resv->adds_in_progress +
559 (actual_regions_needed - in_regions_needed)) {
560
561
562
563 VM_BUG_ON(t - f <= 1);
564
565 if (allocate_file_region_entries(
566 resv, actual_regions_needed - in_regions_needed)) {
567 return -ENOMEM;
568 }
569
570 goto retry;
571 }
572
573 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
574
575 resv->adds_in_progress -= in_regions_needed;
576
577 spin_unlock(&resv->lock);
578 return add;
579}

/*
 * Examine the existing reserve map and determine how many huge pages in the
 * specified range [f, t) are NOT currently represented.  This routine is
 * called before a subsequent call to region_add that will actually modify
 * the reserve map to add the specified range [f, t).  region_chg does not
 * change the number of huge pages represented by the map.  A number of new
 * file_region structures is added to the cache as placeholders for the
 * subsequent region_add call to use; at least one entry is always added.
 *
 * out_regions_needed is the number of regions added to resv->adds_in_progress.
 * This value needs to be provided to a follow up call to region_add or
 * region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater than or
 * equal to zero.  -ENOMEM is returned if a new file_region structure or
 * cache entry is needed and can not be allocated.
 */
601static long region_chg(struct resv_map *resv, long f, long t,
602 long *out_regions_needed)
603{
604 long chg = 0;
605
606 spin_lock(&resv->lock);
607
608
609 chg = add_reservation_in_range(resv, f, t, NULL, NULL,
610 out_regions_needed);
611
612 if (*out_regions_needed == 0)
613 *out_regions_needed = 1;
614
615 if (allocate_file_region_entries(resv, *out_regions_needed))
616 return -ENOMEM;
617
618 resv->adds_in_progress += *out_regions_needed;
619
620 spin_unlock(&resv->lock);
621 return chg;
622}

/*
 * Abort the in progress add operation.  The adds_in_progress field of the
 * resv_map keeps track of the operations in progress between calls to
 * region_chg and region_add.  Operations are sometimes aborted after the
 * call to region_chg; in such cases, region_abort is called to decrement
 * the adds_in_progress counter.  regions_needed is the value returned by
 * the region_chg call.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this routine.
 * They are kept to make reading the calling code easier, as wherever there
 * is a region_chg there is a corresponding region_abort.
 */
637static void region_abort(struct resv_map *resv, long f, long t,
638 long regions_needed)
639{
640 spin_lock(&resv->lock);
641 VM_BUG_ON(!resv->region_cache_count);
642 resv->adds_in_progress -= regions_needed;
643 spin_unlock(&resv->lock);
644}

/*
 * Delete the specified range [f, t) from the reserve map.  If the t parameter
 * is LONG_MAX, this indicates that ALL regions after f should be deleted.
 * Locate the regions which intersect [f, t) and either trim, delete or split
 * the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.  In the
 * normal case, the return value is zero or more.  In the case where a region
 * must be split, a new region descriptor must be allocated.  If the
 * allocation fails, -ENOMEM will be returned.  NOTE: if t == LONG_MAX, a
 * region is never split, so callers passing t == LONG_MAX do not need to
 * check for -ENOMEM.
 */
660static long region_del(struct resv_map *resv, long f, long t)
661{
662 struct list_head *head = &resv->regions;
663 struct file_region *rg, *trg;
664 struct file_region *nrg = NULL;
665 long del = 0;
666
667retry:
668 spin_lock(&resv->lock);
669 list_for_each_entry_safe(rg, trg, head, link) {
670
671
672
673
674
675
676
677 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
678 continue;
679
680 if (rg->from >= t)
681 break;
682
683 if (f > rg->from && t < rg->to) {
684
685
686
687
688 if (!nrg &&
689 resv->region_cache_count > resv->adds_in_progress) {
690 nrg = list_first_entry(&resv->region_cache,
691 struct file_region,
692 link);
693 list_del(&nrg->link);
694 resv->region_cache_count--;
695 }
696
697 if (!nrg) {
698 spin_unlock(&resv->lock);
699 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
700 if (!nrg)
701 return -ENOMEM;
702 goto retry;
703 }
704
705 del += t - f;
706 hugetlb_cgroup_uncharge_file_region(
707 resv, rg, t - f, false);
708
709
710 nrg->from = t;
711 nrg->to = rg->to;
712
713 copy_hugetlb_cgroup_uncharge_info(nrg, rg);
714
715 INIT_LIST_HEAD(&nrg->link);
716
717
718 rg->to = f;
719
720 list_add(&nrg->link, &rg->link);
721 nrg = NULL;
722 break;
723 }
724
725 if (f <= rg->from && t >= rg->to) {
726 del += rg->to - rg->from;
727 hugetlb_cgroup_uncharge_file_region(resv, rg,
728 rg->to - rg->from, true);
729 list_del(&rg->link);
730 kfree(rg);
731 continue;
732 }
733
734 if (f <= rg->from) {
735 hugetlb_cgroup_uncharge_file_region(resv, rg,
736 t - rg->from, false);
737
738 del += t - rg->from;
739 rg->from = t;
740 } else {
741 hugetlb_cgroup_uncharge_file_region(resv, rg,
742 rg->to - f, false);
743
744 del += rg->to - f;
745 rg->to = f;
746 }
747 }
748
749 spin_unlock(&resv->lock);
750 kfree(nrg);
751 return del;
752}

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was free'ed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
763void hugetlb_fix_reserve_counts(struct inode *inode)
764{
765 struct hugepage_subpool *spool = subpool_inode(inode);
766 long rsv_adjust;
767 bool reserved = false;
768
769 rsv_adjust = hugepage_subpool_get_pages(spool, 1);
770 if (rsv_adjust > 0) {
771 struct hstate *h = hstate_inode(inode);
772
773 if (!hugetlb_acct_memory(h, 1))
774 reserved = true;
775 } else if (!rsv_adjust) {
776 reserved = true;
777 }
778
779 if (!reserved)
780 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
781}

/*
 * Count and return the number of huge pages in the reserve map that
 * intersect with the range [f, t).
 */
787static long region_count(struct resv_map *resv, long f, long t)
788{
789 struct list_head *head = &resv->regions;
790 struct file_region *rg;
791 long chg = 0;
792
793 spin_lock(&resv->lock);
794
795 list_for_each_entry(rg, head, link) {
796 long seg_from;
797 long seg_to;
798
799 if (rg->to <= f)
800 continue;
801 if (rg->from >= t)
802 break;
803
804 seg_from = max(rg->from, f);
805 seg_to = min(rg->to, t);
806
807 chg += seg_to - seg_from;
808 }
809 spin_unlock(&resv->lock);
810
811 return chg;
812}

/*
 * Convert the address within this vma to the page offset within the mapping,
 * in units of huge pages.
 */
818static pgoff_t vma_hugecache_offset(struct hstate *h,
819 struct vm_area_struct *vma, unsigned long address)
820{
821 return ((address - vma->vm_start) >> huge_page_shift(h)) +
822 (vma->vm_pgoff >> huge_page_order(h));
823}
824
825pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
826 unsigned long address)
827{
828 return vma_hugecache_offset(hstate_vma(vma), vma, address);
829}
830EXPORT_SYMBOL_GPL(linear_hugepage_index);

/*
 * Return the size of the pages allocated when backing a VMA.  In the majority
 * of cases this will be the same size as used by the page table entries.
 */
836unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
837{
838 if (vma->vm_ops && vma->vm_ops->pagesize)
839 return vma->vm_ops->pagesize(vma);
840 return PAGE_SIZE;
841}
842EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA.  In the majority
 * of cases, the page size used by the kernel matches the MMU size.  On
 * architectures where it differs, an architecture-specific 'strong' version
 * of this symbol is required.
 */
850__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
851{
852 return vma_kernel_pagesize(vma);
853}

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
860#define HPAGE_RESV_OWNER (1UL << 0)
861#define HPAGE_RESV_UNMAPPED (1UL << 1)
862#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping.  Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve map is attached to the VMA as a private pointer and released
 * when the last reference to the VMA is dropped.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file pages
 * which have ever had a reservation assigned, and it persists even after the
 * page is instantiated.  A private mapping has a region map associated with
 * the original mmap which is attached to all VMAs which reference it; this
 * region map represents those offsets which have consumed reservation, i.e.
 * where pages have been instantiated.
 */
883static unsigned long get_vma_private_data(struct vm_area_struct *vma)
884{
885 return (unsigned long)vma->vm_private_data;
886}
887
888static void set_vma_private_data(struct vm_area_struct *vma,
889 unsigned long value)
890{
891 vma->vm_private_data = (void *)value;
892}
893
894static void
895resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
896 struct hugetlb_cgroup *h_cg,
897 struct hstate *h)
898{
899#ifdef CONFIG_CGROUP_HUGETLB
900 if (!h_cg || !h) {
901 resv_map->reservation_counter = NULL;
902 resv_map->pages_per_hpage = 0;
903 resv_map->css = NULL;
904 } else {
905 resv_map->reservation_counter =
906 &h_cg->rsvd_hugepage[hstate_index(h)];
907 resv_map->pages_per_hpage = pages_per_huge_page(h);
908 resv_map->css = &h_cg->css;
909 }
910#endif
911}
912
913struct resv_map *resv_map_alloc(void)
914{
915 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
916 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
917
918 if (!resv_map || !rg) {
919 kfree(resv_map);
920 kfree(rg);
921 return NULL;
922 }
923
924 kref_init(&resv_map->refs);
925 spin_lock_init(&resv_map->lock);
926 INIT_LIST_HEAD(&resv_map->regions);
927
928 resv_map->adds_in_progress = 0;
 /*
  * Initialize these to 0.  On shared mappings, 0's here indicate these
  * fields don't do cgroup accounting.  On private mappings, these will
  * be re-initialized to the proper values, to indicate that hugetlb
  * cgroup reservations are to be un-charged from here.
  */
935 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
936
937 INIT_LIST_HEAD(&resv_map->region_cache);
938 list_add(&rg->link, &resv_map->region_cache);
939 resv_map->region_cache_count = 1;
940
941 return resv_map;
942}
943
944void resv_map_release(struct kref *ref)
945{
946 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
947 struct list_head *head = &resv_map->region_cache;
948 struct file_region *rg, *trg;
949
950
951 region_del(resv_map, 0, LONG_MAX);
952
953
954 list_for_each_entry_safe(rg, trg, head, link) {
955 list_del(&rg->link);
956 kfree(rg);
957 }
958
959 VM_BUG_ON(resv_map->adds_in_progress);
960
961 kfree(resv_map);
962}
963
964static inline struct resv_map *inode_resv_map(struct inode *inode)
965{
 /*
  * At inode evict time, i_mapping may not point to the original
  * address space within the inode.  This original address space
  * contains the pointer to the resv_map.  So, always use the
  * address space embedded within the inode.
  * The VERY common case is inode->mapping == &inode->i_data, but
  * this may not be true for device special inodes.
  */
974 return (struct resv_map *)(&inode->i_data)->private_data;
975}
976
977static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
978{
979 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
980 if (vma->vm_flags & VM_MAYSHARE) {
981 struct address_space *mapping = vma->vm_file->f_mapping;
982 struct inode *inode = mapping->host;
983
984 return inode_resv_map(inode);
985
986 } else {
987 return (struct resv_map *)(get_vma_private_data(vma) &
988 ~HPAGE_RESV_MASK);
989 }
990}
991
992static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
993{
994 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
995 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
996
997 set_vma_private_data(vma, (get_vma_private_data(vma) &
998 HPAGE_RESV_MASK) | (unsigned long)map);
999}
1000
1001static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
1002{
1003 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1004 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1005
1006 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
1007}
1008
1009static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
1010{
1011 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1012
1013 return (get_vma_private_data(vma) & flag) != 0;
1014}
1015
1016
1017void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
1018{
1019 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1020 if (!(vma->vm_flags & VM_MAYSHARE))
1021 vma->vm_private_data = (void *)0;
1022}

/*
 * Reset and decrement one ref on hugepage private reservation.
 * Called with mm->mmap_lock writer semaphore held.
 * This function should only be used by move_vma() and operate on
 * same sized vmas.  It should never come here with the last ref on
 * the reservation.
 */
1031void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1032{
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045 struct resv_map *reservations = vma_resv_map(vma);
1046
1047 if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1048 resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
1049 kref_put(&reservations->refs, resv_map_release);
1050 }
1051
1052 reset_vma_resv_huge_pages(vma);
1053}

/* Returns true if the VMA has associated reserve pages */
1056static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
1057{
1058 if (vma->vm_flags & VM_NORESERVE) {
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
1069 return true;
1070 else
1071 return false;
1072 }
1073
1074
1075 if (vma->vm_flags & VM_MAYSHARE) {
1076
1077
1078
1079
1080
1081
1082
1083 if (chg)
1084 return false;
1085 else
1086 return true;
1087 }
1088
1089
1090
1091
1092
1093 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109 if (chg)
1110 return false;
1111 else
1112 return true;
1113 }
1114
1115 return false;
1116}
1117
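/* Return an unused huge page to the free list of its node. */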
1118static void enqueue_huge_page(struct hstate *h, struct page *page)
1119{
1120 int nid = page_to_nid(page);
1121
1122 lockdep_assert_held(&hugetlb_lock);
1123 VM_BUG_ON_PAGE(page_count(page), page);
1124
1125 list_move(&page->lru, &h->hugepage_freelists[nid]);
1126 h->free_huge_pages++;
1127 h->free_huge_pages_node[nid]++;
1128 SetHPageFreed(page);
1129}
1130
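/*
 * Take a free huge page from the given node's free list, skipping poisoned
 * pages and, when the caller is long-term pinning (PF_MEMALLOC_PIN), pages
 * that may not be pinned.
 */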
1131static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
1132{
1133 struct page *page;
1134 bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1135
1136 lockdep_assert_held(&hugetlb_lock);
1137 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
1138 if (pin && !is_pinnable_page(page))
1139 continue;
1140
1141 if (PageHWPoison(page))
1142 continue;
1143
1144 list_move(&page->lru, &h->hugepage_activelist);
1145 set_page_refcounted(page);
1146 ClearHPageFreed(page);
1147 h->free_huge_pages--;
1148 h->free_huge_pages_node[nid]--;
1149 return page;
1150 }
1151
1152 return NULL;
1153}
1154
1155static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
1156 nodemask_t *nmask)
1157{
1158 unsigned int cpuset_mems_cookie;
1159 struct zonelist *zonelist;
1160 struct zone *zone;
1161 struct zoneref *z;
1162 int node = NUMA_NO_NODE;
1163
1164 zonelist = node_zonelist(nid, gfp_mask);
1165
1166retry_cpuset:
1167 cpuset_mems_cookie = read_mems_allowed_begin();
1168 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1169 struct page *page;
1170
1171 if (!cpuset_zone_allowed(zone, gfp_mask))
1172 continue;
1173
1174
1175
1176
1177 if (zone_to_nid(zone) == node)
1178 continue;
1179 node = zone_to_nid(zone);
1180
1181 page = dequeue_huge_page_node_exact(h, node);
1182 if (page)
1183 return page;
1184 }
1185 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1186 goto retry_cpuset;
1187
1188 return NULL;
1189}
1190
1191static struct page *dequeue_huge_page_vma(struct hstate *h,
1192 struct vm_area_struct *vma,
1193 unsigned long address, int avoid_reserve,
1194 long chg)
1195{
1196 struct page *page = NULL;
1197 struct mempolicy *mpol;
1198 gfp_t gfp_mask;
1199 nodemask_t *nodemask;
1200 int nid;
1201
 /*
  * A child process with MAP_PRIVATE mappings created by their parent
  * has no page reserves.  This check ensures that reservations are
  * not "stolen".  The child may still get SIGKILLed.
  */
1207 if (!vma_has_reserves(vma, chg) &&
1208 h->free_huge_pages - h->resv_huge_pages == 0)
1209 goto err;
1210
1211
1212 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
1213 goto err;
1214
1215 gfp_mask = htlb_alloc_mask(h);
1216 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1217
1218 if (mpol_is_preferred_many(mpol)) {
1219 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1220
1221
1222 nodemask = NULL;
1223 }
1224
1225 if (!page)
1226 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1227
1228 if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
1229 SetHPageRestoreReserve(page);
1230 h->resv_huge_pages--;
1231 }
1232
1233 mpol_cond_put(mpol);
1234 return page;
1235
1236err:
1237 return NULL;
1238}

/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_nid_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
1247static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1248{
1249 nid = next_node_in(nid, *nodes_allowed);
1250 VM_BUG_ON(nid >= MAX_NUMNODES);
1251
1252 return nid;
1253}
1254
1255static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1256{
1257 if (!node_isset(nid, *nodes_allowed))
1258 nid = next_node_allowed(nid, nodes_allowed);
1259 return nid;
1260}

/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
1268static int hstate_next_node_to_alloc(struct hstate *h,
1269 nodemask_t *nodes_allowed)
1270{
1271 int nid;
1272
1273 VM_BUG_ON(!nodes_allowed);
1274
1275 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
1276 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1277
1278 return nid;
1279}

/*
 * helper for remove_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
1287static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1288{
1289 int nid;
1290
1291 VM_BUG_ON(!nodes_allowed);
1292
1293 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1294 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1295
1296 return nid;
1297}
1298
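/*
 * Iterate over the allowed nodes in round-robin order, starting from the
 * hstate's saved next-to-alloc (or next-to-free) node and visiting each
 * allowed node at most once.
 */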
1299#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
1300 for (nr_nodes = nodes_weight(*mask); \
1301 nr_nodes > 0 && \
1302 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
1303 nr_nodes--)
1304
1305#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
1306 for (nr_nodes = nodes_weight(*mask); \
1307 nr_nodes > 0 && \
1308 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
1309 nr_nodes--)
1310
1311
1312static void __destroy_compound_gigantic_page(struct page *page,
1313 unsigned int order, bool demote)
1314{
1315 int i;
1316 int nr_pages = 1 << order;
1317 struct page *p = page + 1;
1318
1319 atomic_set(compound_mapcount_ptr(page), 0);
1320 atomic_set(compound_pincount_ptr(page), 0);
1321
1322 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1323 p->mapping = NULL;
1324 clear_compound_head(p);
1325 if (!demote)
1326 set_page_refcounted(p);
1327 }
1328
1329 set_compound_order(page, 0);
1330#ifdef CONFIG_64BIT
1331 page[1].compound_nr = 0;
1332#endif
1333 __ClearPageHead(page);
1334}
1335
1336static void destroy_compound_hugetlb_page_for_demote(struct page *page,
1337 unsigned int order)
1338{
1339 __destroy_compound_gigantic_page(page, order, true);
1340}
1341
1342#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1343static void destroy_compound_gigantic_page(struct page *page,
1344 unsigned int order)
1345{
1346 __destroy_compound_gigantic_page(page, order, false);
1347}
1348
1349static void free_gigantic_page(struct page *page, unsigned int order)
1350{
1351
1352
1353
1354
1355#ifdef CONFIG_CMA
1356 if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
1357 return;
1358#endif
1359
1360 free_contig_range(page_to_pfn(page), 1 << order);
1361}
1362
1363#ifdef CONFIG_CONTIG_ALLOC
1364static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1365 int nid, nodemask_t *nodemask)
1366{
1367 unsigned long nr_pages = pages_per_huge_page(h);
1368 if (nid == NUMA_NO_NODE)
1369 nid = numa_mem_id();
1370
1371#ifdef CONFIG_CMA
1372 {
1373 struct page *page;
1374 int node;
1375
1376 if (hugetlb_cma[nid]) {
1377 page = cma_alloc(hugetlb_cma[nid], nr_pages,
1378 huge_page_order(h), true);
1379 if (page)
1380 return page;
1381 }
1382
1383 if (!(gfp_mask & __GFP_THISNODE)) {
1384 for_each_node_mask(node, *nodemask) {
1385 if (node == nid || !hugetlb_cma[node])
1386 continue;
1387
1388 page = cma_alloc(hugetlb_cma[node], nr_pages,
1389 huge_page_order(h), true);
1390 if (page)
1391 return page;
1392 }
1393 }
1394 }
1395#endif
1396
1397 return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1398}
1399
1400#else
1401static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1402 int nid, nodemask_t *nodemask)
1403{
1404 return NULL;
1405}
1406#endif
1407
1408#else
1409static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1410 int nid, nodemask_t *nodemask)
1411{
1412 return NULL;
1413}
1414static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1415static inline void destroy_compound_gigantic_page(struct page *page,
1416 unsigned int order) { }
1417#endif

/*
 * Remove hugetlb page from lists, and update dtor so that the page appears
 * as just a compound page.
 *
 * A reference is held on the page, except in the case of demote.
 *
 * Must be called with hugetlb lock held.
 */
1427static void __remove_hugetlb_page(struct hstate *h, struct page *page,
1428 bool adjust_surplus,
1429 bool demote)
1430{
1431 int nid = page_to_nid(page);
1432
1433 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1434 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
1435
1436 lockdep_assert_held(&hugetlb_lock);
1437 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1438 return;
1439
1440 list_del(&page->lru);
1441
1442 if (HPageFreed(page)) {
1443 h->free_huge_pages--;
1444 h->free_huge_pages_node[nid]--;
1445 }
1446 if (adjust_surplus) {
1447 h->surplus_huge_pages--;
1448 h->surplus_huge_pages_node[nid]--;
1449 }

 /*
  * Very subtle
  *
  * For non-gigantic pages set the destructor to the normal compound
  * page dtor.  This is needed in case someone takes an additional
  * temporary ref to the page, and freeing is expected to happen
  * immediately.
  *
  * For gigantic pages set the destructor to the null dtor.  This
  * destructor will never be called.  Before freeing the gigantic
  * page, destroy_compound_gigantic_page will turn the compound page
  * into a simple group of pages.  After this the destructor does not
  * apply.
  *
  * This handles the case where more than one ref is held when and
  * after update_and_free_page is called.
  *
  * In the case of demote we do not ref count the page as it will soon
  * be turned into a page of smaller size.
  */
1471 if (!demote)
1472 set_page_refcounted(page);
1473 if (hstate_is_gigantic(h))
1474 set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
1475 else
1476 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
1477
1478 h->nr_huge_pages--;
1479 h->nr_huge_pages_node[nid]--;
1480}
1481
1482static void remove_hugetlb_page(struct hstate *h, struct page *page,
1483 bool adjust_surplus)
1484{
1485 __remove_hugetlb_page(h, page, adjust_surplus, false);
1486}
1487
1488static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
1489 bool adjust_surplus)
1490{
1491 __remove_hugetlb_page(h, page, adjust_surplus, true);
1492}
1493
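/*
 * Put a page that could not be freed (for example because its vmemmap pages
 * could not be reallocated) back under hugetlb management: restore the
 * counters and destructor and, if no other references exist, return it to
 * the free list.
 */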
1494static void add_hugetlb_page(struct hstate *h, struct page *page,
1495 bool adjust_surplus)
1496{
1497 int zeroed;
1498 int nid = page_to_nid(page);
1499
1500 VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
1501
1502 lockdep_assert_held(&hugetlb_lock);
1503
1504 INIT_LIST_HEAD(&page->lru);
1505 h->nr_huge_pages++;
1506 h->nr_huge_pages_node[nid]++;
1507
1508 if (adjust_surplus) {
1509 h->surplus_huge_pages++;
1510 h->surplus_huge_pages_node[nid]++;
1511 }
1512
1513 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1514 set_page_private(page, 0);
1515 SetHPageVmemmapOptimized(page);
1516
1517
1518
1519
1520
1521
1522 zeroed = put_page_testzero(page);
1523 if (!zeroed)
1524
1525
1526
1527
1528
1529
1530 return;
1531
1532 arch_clear_hugepage_flags(page);
1533 enqueue_huge_page(h, page);
1534}
1535
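/*
 * Free a huge page back to the low-level allocators after restoring its
 * vmemmap pages and clearing hugetlb-specific page flags.  If the vmemmap
 * cannot be restored, the page is put back into the pool as a surplus page.
 */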
1536static void __update_and_free_page(struct hstate *h, struct page *page)
1537{
1538 int i;
1539 struct page *subpage = page;
1540
1541 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1542 return;
1543
1544 if (hugetlb_vmemmap_alloc(h, page)) {
1545 spin_lock_irq(&hugetlb_lock);
  /*
   * If we cannot allocate vmemmap pages, just refuse to free the
   * page and put the page back on the hugetlb free list, treating
   * it as a surplus page.
   */
1551 add_hugetlb_page(h, page, true);
1552 spin_unlock_irq(&hugetlb_lock);
1553 return;
1554 }
1555
1556 for (i = 0; i < pages_per_huge_page(h);
1557 i++, subpage = mem_map_next(subpage, page, i)) {
1558 subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1559 1 << PG_referenced | 1 << PG_dirty |
1560 1 << PG_active | 1 << PG_private |
1561 1 << PG_writeback);
1562 }
1563
1564
1565
1566
1567
1568 if (hstate_is_gigantic(h) ||
1569 hugetlb_cma_page(page, huge_page_order(h))) {
1570 destroy_compound_gigantic_page(page, huge_page_order(h));
1571 free_gigantic_page(page, huge_page_order(h));
1572 } else {
1573 __free_pages(page, huge_page_order(h));
1574 }
1575}

/*
 * update_and_free_page() can be called under any context, so we cannot use
 * GFP_KERNEL to allocate vmemmap pages.  However, we can defer the actual
 * freeing to a workqueue to avoid having to use GFP_ATOMIC to allocate the
 * vmemmap pages.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to be
 * freed and frees them one-by-one.  As the page->mapping pointer is going to
 * be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
 * of a lockless linked list of huge pages to be freed.
 */
1588static LLIST_HEAD(hpage_freelist);
1589
1590static void free_hpage_workfn(struct work_struct *work)
1591{
1592 struct llist_node *node;
1593
1594 node = llist_del_all(&hpage_freelist);
1595
1596 while (node) {
1597 struct page *page;
1598 struct hstate *h;
1599
1600 page = container_of((struct address_space **)node,
1601 struct page, mapping);
1602 node = node->next;
1603 page->mapping = NULL;
1604
1605
1606
1607
1608
1609
1610 h = size_to_hstate(page_size(page));
1611
1612 __update_and_free_page(h, page);
1613
1614 cond_resched();
1615 }
1616}
1617static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1618
1619static inline void flush_free_hpage_work(struct hstate *h)
1620{
1621 if (hugetlb_optimize_vmemmap_pages(h))
1622 flush_work(&free_hpage_work);
1623}
1624
1625static void update_and_free_page(struct hstate *h, struct page *page,
1626 bool atomic)
1627{
1628 if (!HPageVmemmapOptimized(page) || !atomic) {
1629 __update_and_free_page(h, page);
1630 return;
1631 }
1632
1633
1634
1635
1636
1637
1638
1639
1640 if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
1641 schedule_work(&free_hpage_work);
1642}
1643
1644static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
1645{
1646 struct page *page, *t_page;
1647
1648 list_for_each_entry_safe(page, t_page, list, lru) {
1649 update_and_free_page(h, page, false);
1650 cond_resched();
1651 }
1652}
1653
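/* Return the hstate backing huge pages of the given size, or NULL. */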
1654struct hstate *size_to_hstate(unsigned long size)
1655{
1656 struct hstate *h;
1657
1658 for_each_hstate(h) {
1659 if (huge_page_size(h) == size)
1660 return h;
1661 }
1662 return NULL;
1663}
1664
1665void free_huge_page(struct page *page)
1666{
 /*
  * Can't pass hstate in here because this is called from the
  * compound page destructor.
  */
1671 struct hstate *h = page_hstate(page);
1672 int nid = page_to_nid(page);
1673 struct hugepage_subpool *spool = hugetlb_page_subpool(page);
1674 bool restore_reserve;
1675 unsigned long flags;
1676
1677 VM_BUG_ON_PAGE(page_count(page), page);
1678 VM_BUG_ON_PAGE(page_mapcount(page), page);
1679
1680 hugetlb_set_page_subpool(page, NULL);
1681 if (PageAnon(page))
1682 __ClearPageAnonExclusive(page);
1683 page->mapping = NULL;
1684 restore_reserve = HPageRestoreReserve(page);
1685 ClearHPageRestoreReserve(page);
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695 if (!restore_reserve) {
1696
1697
1698
1699
1700
1701
1702 if (hugepage_subpool_put_pages(spool, 1) == 0)
1703 restore_reserve = true;
1704 }
1705
1706 spin_lock_irqsave(&hugetlb_lock, flags);
1707 ClearHPageMigratable(page);
1708 hugetlb_cgroup_uncharge_page(hstate_index(h),
1709 pages_per_huge_page(h), page);
1710 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
1711 pages_per_huge_page(h), page);
1712 if (restore_reserve)
1713 h->resv_huge_pages++;
1714
1715 if (HPageTemporary(page)) {
1716 remove_hugetlb_page(h, page, false);
1717 spin_unlock_irqrestore(&hugetlb_lock, flags);
1718 update_and_free_page(h, page, true);
1719 } else if (h->surplus_huge_pages_node[nid]) {
1720
1721 remove_hugetlb_page(h, page, true);
1722 spin_unlock_irqrestore(&hugetlb_lock, flags);
1723 update_and_free_page(h, page, true);
1724 } else {
1725 arch_clear_hugepage_flags(page);
1726 enqueue_huge_page(h, page);
1727 spin_unlock_irqrestore(&hugetlb_lock, flags);
1728 }
1729}

/*
 * Must be called with the hugetlb lock held.
 */
1734static void __prep_account_new_huge_page(struct hstate *h, int nid)
1735{
1736 lockdep_assert_held(&hugetlb_lock);
1737 h->nr_huge_pages++;
1738 h->nr_huge_pages_node[nid]++;
1739}
1740
1741static void __prep_new_huge_page(struct hstate *h, struct page *page)
1742{
1743 hugetlb_vmemmap_free(h, page);
1744 INIT_LIST_HEAD(&page->lru);
1745 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1746 hugetlb_set_page_subpool(page, NULL);
1747 set_hugetlb_cgroup(page, NULL);
1748 set_hugetlb_cgroup_rsvd(page, NULL);
1749}
1750
1751static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1752{
1753 __prep_new_huge_page(h, page);
1754 spin_lock_irq(&hugetlb_lock);
1755 __prep_account_new_huge_page(h, nid);
1756 spin_unlock_irq(&hugetlb_lock);
1757}
1758
1759static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
1760 bool demote)
1761{
1762 int i, j;
1763 int nr_pages = 1 << order;
1764 struct page *p = page + 1;
1765
1766
1767 set_compound_order(page, order);
1768 __ClearPageReserved(page);
1769 __SetPageHead(page);
1770 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783 __ClearPageReserved(p);
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801 if (!demote) {
1802 if (!page_ref_freeze(p, 1)) {
1803 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
1804 goto out_error;
1805 }
1806 } else {
1807 VM_BUG_ON_PAGE(page_count(p), p);
1808 }
1809 set_compound_head(p, page);
1810 }
1811 atomic_set(compound_mapcount_ptr(page), -1);
1812 atomic_set(compound_pincount_ptr(page), 0);
1813 return true;
1814
1815out_error:
1816
1817 p = page + 1;
1818 for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
1819 clear_compound_head(p);
1820 set_page_refcounted(p);
1821 }
1822
1823 for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
1824 __ClearPageReserved(p);
1825 set_compound_order(page, 0);
1826#ifdef CONFIG_64BIT
1827 page[1].compound_nr = 0;
1828#endif
1829 __ClearPageHead(page);
1830 return false;
1831}
1832
1833static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
1834{
1835 return __prep_compound_gigantic_page(page, order, false);
1836}
1837
1838static bool prep_compound_gigantic_page_for_demote(struct page *page,
1839 unsigned int order)
1840{
1841 return __prep_compound_gigantic_page(page, order, true);
1842}

/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
1849int PageHuge(struct page *page)
1850{
1851 if (!PageCompound(page))
1852 return 0;
1853
1854 page = compound_head(page);
1855 return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
1856}
1857EXPORT_SYMBOL_GPL(PageHuge);

/*
 * PageHeadHuge() only returns true for hugetlbfs head pages, but not for
 * normal or transparent huge pages.
 */
1863int PageHeadHuge(struct page *page_head)
1864{
1865 if (!PageHead(page_head))
1866 return 0;
1867
1868 return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
1869}
1870EXPORT_SYMBOL_GPL(PageHeadHuge);

/*
 * Find and lock address space (mapping) in write mode.
 *
 * Upon entry, the page is locked which means that page_mapping() is
 * stable.  Due to locking order, we can only trylock_write.  If we can
 * not get the lock, simply return NULL to the caller.
 */
1879struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
1880{
1881 struct address_space *mapping = page_mapping(hpage);
1882
1883 if (!mapping)
1884 return mapping;
1885
1886 if (i_mmap_trylock_write(mapping))
1887 return mapping;
1888
1889 return NULL;
1890}
1891
1892pgoff_t hugetlb_basepage_index(struct page *page)
1893{
1894 struct page *page_head = compound_head(page);
1895 pgoff_t index = page_index(page_head);
1896 unsigned long compound_idx;
1897
1898 if (compound_order(page_head) >= MAX_ORDER)
1899 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
1900 else
1901 compound_idx = page - page_head;
1902
1903 return (index << compound_order(page_head)) + compound_idx;
1904}
1905
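/*
 * Allocate an order huge_page_order(h) page directly from the buddy
 * allocator.  The node_alloc_noretry mask tracks nodes where a previous
 * "try hard" allocation failed, so we back off on those nodes.
 */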
1906static struct page *alloc_buddy_huge_page(struct hstate *h,
1907 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1908 nodemask_t *node_alloc_noretry)
1909{
1910 int order = huge_page_order(h);
1911 struct page *page;
1912 bool alloc_try_hard = true;

 /*
  * By default we always try hard to allocate the page with
  * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating pages in
  * a loop (to adjust global huge page counts) and previous allocation
  * failed, do not continue to try hard on the same node.  Use the
  * node_alloc_noretry bitmap to manage this state information.
  */
1921 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1922 alloc_try_hard = false;
1923 gfp_mask |= __GFP_COMP|__GFP_NOWARN;
1924 if (alloc_try_hard)
1925 gfp_mask |= __GFP_RETRY_MAYFAIL;
1926 if (nid == NUMA_NO_NODE)
1927 nid = numa_mem_id();
1928 page = __alloc_pages(gfp_mask, order, nid, nmask);
1929 if (page)
1930 __count_vm_event(HTLB_BUDDY_PGALLOC);
1931 else
1932 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1933
1934
1935
1936
1937
1938
1939 if (node_alloc_noretry && page && !alloc_try_hard)
1940 node_clear(nid, *node_alloc_noretry);
1941
1942
1943
1944
1945
1946
1947 if (node_alloc_noretry && !page && alloc_try_hard)
1948 node_set(nid, *node_alloc_noretry);
1949
1950 return page;
1951}

/*
 * Common helper to allocate a fresh hugetlb page.  All specific allocators
 * should use this function to get new hugetlb pages.
 */
1957static struct page *alloc_fresh_huge_page(struct hstate *h,
1958 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1959 nodemask_t *node_alloc_noretry)
1960{
1961 struct page *page;
1962 bool retry = false;
1963
1964retry:
1965 if (hstate_is_gigantic(h))
1966 page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
1967 else
1968 page = alloc_buddy_huge_page(h, gfp_mask,
1969 nid, nmask, node_alloc_noretry);
1970 if (!page)
1971 return NULL;
1972
1973 if (hstate_is_gigantic(h)) {
1974 if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
1975
1976
1977
1978
1979 free_gigantic_page(page, huge_page_order(h));
1980 if (!retry) {
1981 retry = true;
1982 goto retry;
1983 }
1984 return NULL;
1985 }
1986 }
1987 prep_new_huge_page(h, page, page_to_nid(page));
1988
1989 return page;
1990}

/*
 * Allocates a fresh page to the hugetlb allocator pool in a node-interleaved
 * manner.
 */
1996static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
1997 nodemask_t *node_alloc_noretry)
1998{
1999 struct page *page;
2000 int nr_nodes, node;
2001 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2002
2003 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
2004 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
2005 node_alloc_noretry);
2006 if (page)
2007 break;
2008 }
2009
2010 if (!page)
2011 return 0;
2012
2013 put_page(page);
2014
2015 return 1;
2016}

/*
 * Remove a huge page from the pool, from the next node to free.  Attempt to
 * keep persistent huge pages more or less balanced over allowed nodes.
 * This routine only 'removes' the hugetlb page.  The caller must make
 * an additional call to free the page to the low level allocators.
 * Called with hugetlb_lock locked.
 */
2025static struct page *remove_pool_huge_page(struct hstate *h,
2026 nodemask_t *nodes_allowed,
2027 bool acct_surplus)
2028{
2029 int nr_nodes, node;
2030 struct page *page = NULL;
2031
2032 lockdep_assert_held(&hugetlb_lock);
2033 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2034
2035
2036
2037
2038 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2039 !list_empty(&h->hugepage_freelists[node])) {
2040 page = list_entry(h->hugepage_freelists[node].next,
2041 struct page, lru);
2042 remove_hugetlb_page(h, page, acct_surplus);
2043 break;
2044 }
2045 }
2046
2047 return page;
2048}

/*
 * Dissolve a given free hugepage into free buddy pages.  This function does
 * nothing for in-use hugepages and non-hugepages.
 * This function returns values like below:
 *
 *  -ENOMEM: failed to allocate vmemmap pages needed to free the hugepage
 *           when the system is under memory pressure and freeing of unused
 *           vmemmap pages associated with each hugetlb page is enabled.
 *  -EBUSY:  failed to dissolve a free hugepage or the hugepage is in-use
 *           (allocated or reserved.)
 *       0:  successfully dissolved a free hugepage or the page is not a
 *           hugepage (considered as already dissolved)
 */
2064int dissolve_free_huge_page(struct page *page)
2065{
2066 int rc = -EBUSY;
2067
2068retry:
2069
2070 if (!PageHuge(page))
2071 return 0;
2072
2073 spin_lock_irq(&hugetlb_lock);
2074 if (!PageHuge(page)) {
2075 rc = 0;
2076 goto out;
2077 }
2078
2079 if (!page_count(page)) {
2080 struct page *head = compound_head(page);
2081 struct hstate *h = page_hstate(head);
2082 if (h->free_huge_pages - h->resv_huge_pages == 0)
2083 goto out;
2084
2085
2086
2087
2088
2089 if (unlikely(!HPageFreed(head))) {
2090 spin_unlock_irq(&hugetlb_lock);
2091 cond_resched();
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101 goto retry;
2102 }
2103
2104 remove_hugetlb_page(h, head, false);
2105 h->max_huge_pages--;
2106 spin_unlock_irq(&hugetlb_lock);
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116 rc = hugetlb_vmemmap_alloc(h, head);
2117 if (!rc) {
2118
2119
2120
2121
2122
2123 if (PageHWPoison(head) && page != head) {
2124 SetPageHWPoison(page);
2125 ClearPageHWPoison(head);
2126 }
2127 update_and_free_page(h, head, false);
2128 } else {
2129 spin_lock_irq(&hugetlb_lock);
2130 add_hugetlb_page(h, head, false);
2131 h->max_huge_pages++;
2132 spin_unlock_irq(&hugetlb_lock);
2133 }
2134
2135 return rc;
2136 }
2137out:
2138 spin_unlock_irq(&hugetlb_lock);
2139 return rc;
2140}

/*
 * Dissolve free hugepages in a given pfn range.  Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_huge_page() returns with an error, all
 * free hugepages that are dissolved before that error are lost.
 */
2150int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
2151{
2152 unsigned long pfn;
2153 struct page *page;
2154 int rc = 0;
2155
2156 if (!hugepages_supported())
2157 return rc;
2158
2159 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
2160 page = pfn_to_page(pfn);
2161 rc = dissolve_free_huge_page(page);
2162 if (rc)
2163 break;
2164 }
2165
2166 return rc;
2167}

/*
 * Allocates a fresh surplus page from the page allocator.
 */
2172static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
2173 int nid, nodemask_t *nmask, bool zero_ref)
2174{
2175 struct page *page = NULL;
2176 bool retry = false;
2177
2178 if (hstate_is_gigantic(h))
2179 return NULL;
2180
2181 spin_lock_irq(&hugetlb_lock);
2182 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2183 goto out_unlock;
2184 spin_unlock_irq(&hugetlb_lock);
2185
2186retry:
2187 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2188 if (!page)
2189 return NULL;
2190
2191 spin_lock_irq(&hugetlb_lock);
2192
2193
2194
2195
2196
2197
2198
2199 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2200 SetHPageTemporary(page);
2201 spin_unlock_irq(&hugetlb_lock);
2202 put_page(page);
2203 return NULL;
2204 }
2205
2206 if (zero_ref) {
2207
2208
2209
2210
2211
2212
2213 SetHPageTemporary(page);
2214 if (!put_page_testzero(page)) {
2215
2216
2217
2218
2219 pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
2220 spin_unlock_irq(&hugetlb_lock);
2221 if (retry)
2222 return NULL;
2223
2224 retry = true;
2225 goto retry;
2226 }
2227 ClearHPageTemporary(page);
2228 }
2229
2230 h->surplus_huge_pages++;
2231 h->surplus_huge_pages_node[page_to_nid(page)]++;
2232
2233out_unlock:
2234 spin_unlock_irq(&hugetlb_lock);
2235
2236 return page;
2237}
2238
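/*
 * Allocate a fresh huge page for migration.  The page is marked temporary so
 * that, when freed, it is released back to the buddy allocator instead of the
 * hugetlb pool.
 */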
2239static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
2240 int nid, nodemask_t *nmask)
2241{
2242 struct page *page;
2243
2244 if (hstate_is_gigantic(h))
2245 return NULL;
2246
2247 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2248 if (!page)
2249 return NULL;
2250
2251
2252
2253
2254
2255 SetHPageTemporary(page);
2256
2257 return page;
2258}

/*
 * Use the VMA's mpolicy to allocate a huge page from the buddy.
 */
2263static
2264struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
2265 struct vm_area_struct *vma, unsigned long addr)
2266{
2267 struct page *page = NULL;
2268 struct mempolicy *mpol;
2269 gfp_t gfp_mask = htlb_alloc_mask(h);
2270 int nid;
2271 nodemask_t *nodemask;
2272
2273 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2274 if (mpol_is_preferred_many(mpol)) {
2275 gfp_t gfp = gfp_mask | __GFP_NOWARN;
2276
2277 gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2278 page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
2279
2280
2281 nodemask = NULL;
2282 }
2283
2284 if (!page)
2285 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
2286 mpol_cond_put(mpol);
2287 return page;
2288}

/* page migration callback function */
2291struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
2292 nodemask_t *nmask, gfp_t gfp_mask)
2293{
2294 spin_lock_irq(&hugetlb_lock);
2295 if (h->free_huge_pages - h->resv_huge_pages > 0) {
2296 struct page *page;
2297
2298 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
2299 if (page) {
2300 spin_unlock_irq(&hugetlb_lock);
2301 return page;
2302 }
2303 }
2304 spin_unlock_irq(&hugetlb_lock);
2305
2306 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
2307}

/* mempolicy aware migration callback */
2310struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
2311 unsigned long address)
2312{
2313 struct mempolicy *mpol;
2314 nodemask_t *nodemask;
2315 struct page *page;
2316 gfp_t gfp_mask;
2317 int node;
2318
2319 gfp_mask = htlb_alloc_mask(h);
2320 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
2321 page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
2322 mpol_cond_put(mpol);
2323
2324 return page;
2325}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
2331static int gather_surplus_pages(struct hstate *h, long delta)
2332 __must_hold(&hugetlb_lock)
2333{
2334 struct list_head surplus_list;
2335 struct page *page, *tmp;
2336 int ret;
2337 long i;
2338 long needed, allocated;
2339 bool alloc_ok = true;
2340
2341 lockdep_assert_held(&hugetlb_lock);
2342 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2343 if (needed <= 0) {
2344 h->resv_huge_pages += delta;
2345 return 0;
2346 }
2347
2348 allocated = 0;
2349 INIT_LIST_HEAD(&surplus_list);
2350
2351 ret = -ENOMEM;
2352retry:
2353 spin_unlock_irq(&hugetlb_lock);
2354 for (i = 0; i < needed; i++) {
2355 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
2356 NUMA_NO_NODE, NULL, true);
2357 if (!page) {
2358 alloc_ok = false;
2359 break;
2360 }
2361 list_add(&page->lru, &surplus_list);
2362 cond_resched();
2363 }
2364 allocated += i;
2365
2366
2367
2368
2369
2370 spin_lock_irq(&hugetlb_lock);
2371 needed = (h->resv_huge_pages + delta) -
2372 (h->free_huge_pages + allocated);
2373 if (needed > 0) {
2374 if (alloc_ok)
2375 goto retry;
2376
2377
2378
2379
2380
2381 goto free;
2382 }

 /*
  * The surplus_list now contains _at_least_ the number of extra pages
  * needed to accommodate the reservation.  Add the needed number
  * of pages to the hugetlb pool and free the extras back to the buddy
  * allocator.
  */
2391 needed += allocated;
2392 h->resv_huge_pages += delta;
2393 ret = 0;
2394
2395
2396 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
2397 if ((--needed) < 0)
2398 break;
2399
2400 enqueue_huge_page(h, page);
2401 }
2402free:
2403 spin_unlock_irq(&hugetlb_lock);
2404
2405
2406
2407
2408
2409 list_for_each_entry_safe(page, tmp, &surplus_list, lru)
2410 free_huge_page(page);
2411 spin_lock_irq(&hugetlb_lock);
2412
2413 return ret;
2414}

/*
 * This routine has two main purposes:
 * 1) Decrement the reservation count (resv_huge_pages) by the value passed
 *    in unused_resv_pages.  This corresponds to the prior adjustments made
 *    to the associated reservation map.
 * 2) Free any unused surplus pages that may have been allocated to satisfy
 *    the reservation.  As many as unused_resv_pages may be freed.
 */
2424static void return_unused_surplus_pages(struct hstate *h,
2425 unsigned long unused_resv_pages)
2426{
2427 unsigned long nr_pages;
2428 struct page *page;
2429 LIST_HEAD(page_list);
2430
2431 lockdep_assert_held(&hugetlb_lock);
2432
2433 h->resv_huge_pages -= unused_resv_pages;
2434
2435
2436 if (hstate_is_gigantic(h))
2437 goto out;
2438
2439
2440
2441
2442
2443 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453 while (nr_pages--) {
2454 page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
2455 if (!page)
2456 goto out;
2457
2458 list_add(&page->lru, &page_list);
2459 }
2460
2461out:
2462 spin_unlock_irq(&hugetlb_lock);
2463 update_and_free_pages_bulk(h, &page_list);
2464 spin_lock_irq(&hugetlb_lock);
2465}

/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
 *
 * vma_needs_reservation is called to determine if the huge page at addr
 * within the vma has an associated reservation.  If a reservation is
 * needed, the value 1 is returned.  The caller is then responsible for
 * managing the global reservation and subpool usage counts.  After
 * the huge page has been allocated, vma_commit_reservation is called
 * to add the page to the reservation map.  If the page allocation fails,
 * the reservation must be ended instead of committed; vma_end_reservation
 * is called in such cases.
 *
 * In the normal case, vma_commit_reservation returns the same value
 * as the preceding vma_needs_reservation call.  The only time this
 * is not the case is if a reserve map was changed between calls.  It
 * is the responsibility of the caller to notice the difference and
 * take appropriate action.
 *
 * vma_add_reservation is used in error paths where a reservation must
 * be restored when a newly allocated huge page must be freed.  It is
 * to be called after calling vma_needs_reservation to determine if a
 * reservation exists.
 *
 * vma_del_reservation is used in error paths where an entry in the reserve
 * map was created during huge page allocation and must be removed.  It is to
 * be called after calling vma_needs_reservation to determine if a
 * reservation exists.
 */
2497enum vma_resv_mode {
2498 VMA_NEEDS_RESV,
2499 VMA_COMMIT_RESV,
2500 VMA_END_RESV,
2501 VMA_ADD_RESV,
2502 VMA_DEL_RESV,
2503};
2504static long __vma_reservation_common(struct hstate *h,
2505 struct vm_area_struct *vma, unsigned long addr,
2506 enum vma_resv_mode mode)
2507{
2508 struct resv_map *resv;
2509 pgoff_t idx;
2510 long ret;
2511 long dummy_out_regions_needed;
2512
2513 resv = vma_resv_map(vma);
2514 if (!resv)
2515 return 1;
2516
2517 idx = vma_hugecache_offset(h, vma, addr);
2518 switch (mode) {
2519 case VMA_NEEDS_RESV:
2520 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2521
2522
2523
2524
2525 VM_BUG_ON(dummy_out_regions_needed != 1);
2526 break;
2527 case VMA_COMMIT_RESV:
2528 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2529
2530 VM_BUG_ON(ret < 0);
2531 break;
2532 case VMA_END_RESV:
2533 region_abort(resv, idx, idx + 1, 1);
2534 ret = 0;
2535 break;
2536 case VMA_ADD_RESV:
2537 if (vma->vm_flags & VM_MAYSHARE) {
2538 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2539
2540 VM_BUG_ON(ret < 0);
2541 } else {
2542 region_abort(resv, idx, idx + 1, 1);
2543 ret = region_del(resv, idx, idx + 1);
2544 }
2545 break;
2546 case VMA_DEL_RESV:
2547 if (vma->vm_flags & VM_MAYSHARE) {
2548 region_abort(resv, idx, idx + 1, 1);
2549 ret = region_del(resv, idx, idx + 1);
2550 } else {
2551 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2552
2553 VM_BUG_ON(ret < 0);
2554 }
2555 break;
2556 default:
2557 BUG();
2558 }
2559
2560 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2561 return ret;
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577 if (ret > 0)
2578 return 0;
2579 if (ret == 0)
2580 return 1;
2581 return ret;
2582}
2583
2584static long vma_needs_reservation(struct hstate *h,
2585 struct vm_area_struct *vma, unsigned long addr)
2586{
2587 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
2588}
2589
2590static long vma_commit_reservation(struct hstate *h,
2591 struct vm_area_struct *vma, unsigned long addr)
2592{
2593 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2594}
2595
2596static void vma_end_reservation(struct hstate *h,
2597 struct vm_area_struct *vma, unsigned long addr)
2598{
2599 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
2600}
2601
2602static long vma_add_reservation(struct hstate *h,
2603 struct vm_area_struct *vma, unsigned long addr)
2604{
2605 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2606}
2607
2608static long vma_del_reservation(struct hstate *h,
2609 struct vm_area_struct *vma, unsigned long addr)
2610{
2611 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2612}

/*
 * This routine is called to restore reservation information on error paths.
 * It should ONLY be called for pages allocated via alloc_huge_page(), and
 * the hugetlb mutex should remain held when calling this routine.
 *
 * It handles two specific cases:
 * 1) A reservation was in place and the page consumed the reservation.
 *    HPageRestoreReserve is set in the page.
 * 2) No reservation was in place for the page, so HPageRestoreReserve is
 *    not set.  However, alloc_huge_page always updates the reserve map.
 *
 * In case 1, free_huge_page later in the error path will increment the
 * global reserve count.  But, free_huge_page does not have enough context
 * to adjust the reservation map.  This case deals primarily with private
 * mappings.  Adjust the reserve map here to be consistent with global
 * reserve count adjustments to be made by free_huge_page.  Make sure the
 * reserve map indicates there is a reservation present.
 *
 * In case 2, simply undo reserve map modifications done by alloc_huge_page.
 */
2634void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
2635 unsigned long address, struct page *page)
2636{
2637 long rc = vma_needs_reservation(h, vma, address);
2638
2639 if (HPageRestoreReserve(page)) {
2640 if (unlikely(rc < 0))
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652 ClearHPageRestoreReserve(page);
2653 else if (rc)
2654 (void)vma_add_reservation(h, vma, address);
2655 else
2656 vma_end_reservation(h, vma, address);
2657 } else {
2658 if (!rc) {
2659
2660
2661
2662
2663
2664
2665
2666
2667 rc = vma_del_reservation(h, vma, address);
2668 if (rc < 0)
2669
2670
2671
2672
2673
2674
2675
2676
2677 SetHPageRestoreReserve(page);
2678 } else if (rc < 0) {
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689 if (!(vma->vm_flags & VM_MAYSHARE))
2690
2691
2692
2693
2694
2695
2696
2697
2698 SetHPageRestoreReserve(page);
2699 } else
2700
2701
2702
2703 vma_end_reservation(h, vma, address);
2704 }
2705}

/*
 * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
 * @h: struct hstate old page belongs to
 * @old_page: Old page to dissolve
 * @list: List to isolate the page in case we need to
 * Returns 0 on success, otherwise negated error.
 */
2714static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
2715 struct list_head *list)
2716{
2717 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2718 int nid = page_to_nid(old_page);
2719 bool alloc_retry = false;
2720 struct page *new_page;
2721 int ret = 0;
2722
2723
2724
2725
2726
2727
2728
2729
2730alloc_retry:
2731 new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
2732 if (!new_page)
2733 return -ENOMEM;

 /*
  * If all goes well, this page will be directly added to the free
  * list in the pool.  For this the ref count needs to be zero.
  * Attempt to drop it now, and retry once if needed.  It is VERY
  * unlikely there is another ref on the page.
  *
  * If someone else has a reference to the page, it will be freed
  * when they drop their reference.  Abuse the temporary page flag to
  * accomplish this.  Retry once if there is an inflated ref count.
  */
2744 SetHPageTemporary(new_page);
2745 if (!put_page_testzero(new_page)) {
2746 if (alloc_retry)
2747 return -EBUSY;
2748
2749 alloc_retry = true;
2750 goto alloc_retry;
2751 }
2752 ClearHPageTemporary(new_page);
2753
2754 __prep_new_huge_page(h, new_page);
2755
2756retry:
2757 spin_lock_irq(&hugetlb_lock);
2758 if (!PageHuge(old_page)) {
2759
2760
2761
2762 goto free_new;
2763 } else if (page_count(old_page)) {
2764
2765
2766
2767
2768 spin_unlock_irq(&hugetlb_lock);
2769 if (!isolate_huge_page(old_page, list))
2770 ret = -EBUSY;
2771 spin_lock_irq(&hugetlb_lock);
2772 goto free_new;
2773 } else if (!HPageFreed(old_page)) {
2774
2775
2776
2777
2778
2779 spin_unlock_irq(&hugetlb_lock);
2780 cond_resched();
2781 goto retry;
2782 } else {
2783
2784
2785
2786
2787
2788
2789
2790 remove_hugetlb_page(h, old_page, false);
2791
2792
2793
2794
2795
2796 __prep_account_new_huge_page(h, nid);
2797 enqueue_huge_page(h, new_page);
2798
2799
2800
2801
2802 spin_unlock_irq(&hugetlb_lock);
2803 update_and_free_page(h, old_page, false);
2804 }
2805
2806 return ret;
2807
2808free_new:
2809 spin_unlock_irq(&hugetlb_lock);
2810
2811 set_page_refcounted(new_page);
2812 update_and_free_page(h, new_page, false);
2813
2814 return ret;
2815}
2816
2817int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
2818{
2819 struct hstate *h;
2820 struct page *head;
2821 int ret = -EBUSY;
2822
2823
2824
2825
2826
2827
2828 spin_lock_irq(&hugetlb_lock);
2829 if (PageHuge(page)) {
2830 head = compound_head(page);
2831 h = page_hstate(head);
2832 } else {
2833 spin_unlock_irq(&hugetlb_lock);
2834 return 0;
2835 }
2836 spin_unlock_irq(&hugetlb_lock);
2837
2838
2839
2840
2841
2842
2843 if (hstate_is_gigantic(h))
2844 return -ENOMEM;
2845
2846 if (page_count(head) && isolate_huge_page(head, list))
2847 ret = 0;
2848 else if (!page_count(head))
2849 ret = alloc_and_dissolve_huge_page(h, head, list);
2850
2851 return ret;
2852}
2853
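/*
 * Allocate a huge page for the given VMA and address, consuming reservations
 * and charging the subpool and hugetlb cgroup as required.  Returns an
 * ERR_PTR on failure.
 */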
2854struct page *alloc_huge_page(struct vm_area_struct *vma,
2855 unsigned long addr, int avoid_reserve)
2856{
2857 struct hugepage_subpool *spool = subpool_vma(vma);
2858 struct hstate *h = hstate_vma(vma);
2859 struct page *page;
2860 long map_chg, map_commit;
2861 long gbl_chg;
2862 int ret, idx;
2863 struct hugetlb_cgroup *h_cg;
2864 bool deferred_reserve;
2865
2866 idx = hstate_index(h);
 /*
  * Examine the region/reserve map to determine if the process
  * has a reservation for the page to be allocated.  A return
  * code of zero indicates a reservation exists (no change).
  */
2872 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
2873 if (map_chg < 0)
2874 return ERR_PTR(-ENOMEM);

 /*
  * Processes that did not create the mapping will have no
  * reserves as indicated by the region/reserve map.  Check
  * that the allocation will not exceed the subpool limit.
  * Allocations for MAP_NORESERVE mappings also need to be
  * checked against any subpool limit.
  */
2883 if (map_chg || avoid_reserve) {
2884 gbl_chg = hugepage_subpool_get_pages(spool, 1);
2885 if (gbl_chg < 0) {
2886 vma_end_reservation(h, vma, addr);
2887 return ERR_PTR(-ENOSPC);
2888 }
2889
		/*
		 * Even though there was no entry in the region/reserve
		 * map, the subpool may hold reserves that could satisfy
		 * this allocation (gbl_chg == 0).  When avoid_reserve is
		 * set those must not be used, so force the page to come
		 * from the global free pool.
		 */
2898 if (avoid_reserve)
2899 gbl_chg = 1;
2900 }
2901
2902
2903
2904 deferred_reserve = map_chg || avoid_reserve;
2905 if (deferred_reserve) {
2906 ret = hugetlb_cgroup_charge_cgroup_rsvd(
2907 idx, pages_per_huge_page(h), &h_cg);
2908 if (ret)
2909 goto out_subpool_put;
2910 }
2911
2912 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
2913 if (ret)
2914 goto out_uncharge_cgroup_reservation;
2915
2916 spin_lock_irq(&hugetlb_lock);
2917
2918
2919
2920
2921
2922 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
2923 if (!page) {
2924 spin_unlock_irq(&hugetlb_lock);
2925 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
2926 if (!page)
2927 goto out_uncharge_cgroup;
2928 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
2929 SetHPageRestoreReserve(page);
2930 h->resv_huge_pages--;
2931 }
2932 spin_lock_irq(&hugetlb_lock);
2933 list_add(&page->lru, &h->hugepage_activelist);
2934
2935 }
2936 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
2937
2938
2939
2940 if (deferred_reserve) {
2941 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
2942 h_cg, page);
2943 }
2944
2945 spin_unlock_irq(&hugetlb_lock);
2946
2947 hugetlb_set_page_subpool(page, spool);
2948
2949 map_commit = vma_commit_reservation(h, vma, addr);
2950 if (unlikely(map_chg > map_commit)) {
		/*
		 * An entry was added to the reservation map between
		 * vma_needs_reservation and vma_commit_reservation,
		 * indicating a race with another task reserving this
		 * range.  Undo the extra subpool and reserve accounting
		 * taken above for this page.
		 */
2960 long rsv_adjust;
2961
2962 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
2963 hugetlb_acct_memory(h, -rsv_adjust);
2964 if (deferred_reserve)
2965 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
2966 pages_per_huge_page(h), page);
2967 }
2968 return page;
2969
2970out_uncharge_cgroup:
2971 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
2972out_uncharge_cgroup_reservation:
2973 if (deferred_reserve)
2974 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
2975 h_cg);
2976out_subpool_put:
2977 if (map_chg || avoid_reserve)
2978 hugepage_subpool_put_pages(spool, 1);
2979 vma_end_reservation(h, vma, addr);
2980 return ERR_PTR(-ENOSPC);
2981}
2982
2983int alloc_bootmem_huge_page(struct hstate *h, int nid)
2984 __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
2985int __alloc_bootmem_huge_page(struct hstate *h, int nid)
2986{
2987 struct huge_bootmem_page *m = NULL;
2988 int nr_nodes, node;
2989
2990
2991 if (nid != NUMA_NO_NODE) {
2992 m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
2993 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
2994 if (!m)
2995 return 0;
2996 goto found;
2997 }
2998
2999 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
3000 m = memblock_alloc_try_nid_raw(
3001 huge_page_size(h), huge_page_size(h),
3002 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
3003
3004
3005
3006
3007
3008 if (!m)
3009 return 0;
3010 goto found;
3011 }
3012
3013found:
3014
3015 INIT_LIST_HEAD(&m->list);
3016 list_add(&m->list, &huge_boot_pages);
3017 m->hstate = h;
3018 return 1;
3019}
3020
/*
 * Put bootmem allocated huge pages onto the standard lists once mem_map
 * is up.  Note: this only applies to gigantic pages, which are the only
 * ones allocated from bootmem.
 */
3025static void __init gather_bootmem_prealloc(void)
3026{
3027 struct huge_bootmem_page *m;
3028
3029 list_for_each_entry(m, &huge_boot_pages, list) {
3030 struct page *page = virt_to_page(m);
3031 struct hstate *h = m->hstate;
3032
3033 VM_BUG_ON(!hstate_is_gigantic(h));
3034 WARN_ON(page_count(page) != 1);
3035 if (prep_compound_gigantic_page(page, huge_page_order(h))) {
3036 WARN_ON(PageReserved(page));
3037 prep_new_huge_page(h, page, page_to_nid(page));
3038 put_page(page);
3039 } else {
3040
3041 free_gigantic_page(page, huge_page_order(h));
3042 }
3043
		/*
		 * The pages were "stolen" from memblock, so add them back
		 * to totalram_pages to keep memory accounting (free(1),
		 * CommitLimit, ...) sane.
		 */
3049 adjust_managed_page_count(page, pages_per_huge_page(h));
3050 cond_resched();
3051 }
3052}
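
/*
 * Boot-time allocation of huge pages for a specific node, as requested
 * with "hugepages=<node>:<count>".  On failure, the hstate totals are
 * trimmed to what was actually allocated and a warning is printed.
 */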
3053static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3054{
3055 unsigned long i;
3056 char buf[32];
3057
3058 for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3059 if (hstate_is_gigantic(h)) {
3060 if (!alloc_bootmem_huge_page(h, nid))
3061 break;
3062 } else {
3063 struct page *page;
3064 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3065
3066 page = alloc_fresh_huge_page(h, gfp_mask, nid,
3067 &node_states[N_MEMORY], NULL);
3068 if (!page)
3069 break;
3070 put_page(page);
3071 }
3072 cond_resched();
3073 }
3074 if (i == h->max_huge_pages_node[nid])
3075 return;
3076
3077 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3078 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
3079 h->max_huge_pages_node[nid], buf, nid, i);
3080 h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3081 h->max_huge_pages_node[nid] = i;
3082}
3083
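/*
 * Boot-time allocation of huge pages for an hstate.  Node-specific
 * requests are handled first; otherwise pages are spread across all
 * memory nodes, using node_alloc_noretry so that nodes where an
 * allocation already failed are not retried as hard.
 */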
3084static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
3085{
3086 unsigned long i;
3087 nodemask_t *node_alloc_noretry;
3088 bool node_specific_alloc = false;
3089
3090
3091 if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3092 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3093 return;
3094 }
3095
3096
3097 for_each_online_node(i) {
3098 if (h->max_huge_pages_node[i] > 0) {
3099 hugetlb_hstate_alloc_pages_onenode(h, i);
3100 node_specific_alloc = true;
3101 }
3102 }
3103
3104 if (node_specific_alloc)
3105 return;
3106
3107
3108 if (!hstate_is_gigantic(h)) {
3109
3110
3111
3112
3113
3114
3115 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
3116 GFP_KERNEL);
3117 } else {
3118
3119 node_alloc_noretry = NULL;
3120 }
3121
3122
3123 if (node_alloc_noretry)
3124 nodes_clear(*node_alloc_noretry);
3125
3126 for (i = 0; i < h->max_huge_pages; ++i) {
3127 if (hstate_is_gigantic(h)) {
3128 if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3129 break;
3130 } else if (!alloc_pool_huge_page(h,
3131 &node_states[N_MEMORY],
3132 node_alloc_noretry))
3133 break;
3134 cond_resched();
3135 }
3136 if (i < h->max_huge_pages) {
3137 char buf[32];
3138
3139 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3140 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
3141 h->max_huge_pages, buf, i);
3142 h->max_huge_pages = i;
3143 }
3144 kfree(node_alloc_noretry);
3145}
3146
3147static void __init hugetlb_init_hstates(void)
3148{
3149 struct hstate *h, *h2;
3150
3151 for_each_hstate(h) {
3152 if (minimum_order > huge_page_order(h))
3153 minimum_order = huge_page_order(h);
3154
3155
3156 if (!hstate_is_gigantic(h))
3157 hugetlb_hstate_alloc_pages(h);
3158
		/*
		 * Set the demote order for each hstate: the largest
		 * smaller hstate order.  Gigantic pages cannot be demoted
		 * if runtime freeing is unsupported, and when hugetlb_cma
		 * is in use, hstates at or below HUGETLB_PAGE_ORDER are
		 * skipped.
		 */
3167 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3168 continue;
3169 if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3170 continue;
3171 for_each_hstate(h2) {
3172 if (h2 == h)
3173 continue;
3174 if (h2->order < h->order &&
3175 h2->order > h->demote_order)
3176 h->demote_order = h2->order;
3177 }
3178 }
3179 VM_BUG_ON(minimum_order == UINT_MAX);
3180}
3181
3182static void __init report_hugepages(void)
3183{
3184 struct hstate *h;
3185
3186 for_each_hstate(h) {
3187 char buf[32];
3188
3189 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3190 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
3191 buf, h->free_huge_pages);
3192 }
3193}
3194
3195#ifdef CONFIG_HIGHMEM
3196static void try_to_free_low(struct hstate *h, unsigned long count,
3197 nodemask_t *nodes_allowed)
3198{
3199 int i;
3200 LIST_HEAD(page_list);
3201
3202 lockdep_assert_held(&hugetlb_lock);
3203 if (hstate_is_gigantic(h))
3204 return;
3205
3206
3207
3208
3209 for_each_node_mask(i, *nodes_allowed) {
3210 struct page *page, *next;
3211 struct list_head *freel = &h->hugepage_freelists[i];
3212 list_for_each_entry_safe(page, next, freel, lru) {
3213 if (count >= h->nr_huge_pages)
3214 goto out;
3215 if (PageHighMem(page))
3216 continue;
3217 remove_hugetlb_page(h, page, false);
3218 list_add(&page->lru, &page_list);
3219 }
3220 }
3221
3222out:
3223 spin_unlock_irq(&hugetlb_lock);
3224 update_and_free_pages_bulk(h, &page_list);
3225 spin_lock_irq(&hugetlb_lock);
3226}
3227#else
3228static inline void try_to_free_low(struct hstate *h, unsigned long count,
3229 nodemask_t *nodes_allowed)
3230{
3231}
3232#endif
3233
/*
 * Increment or decrement surplus_huge_pages, walking the allowed nodes
 * round-robin so per-node surpluses stay balanced.
 * Returns 1 if an adjustment was made, 0 otherwise.
 */
3239static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
3240 int delta)
3241{
3242 int nr_nodes, node;
3243
3244 lockdep_assert_held(&hugetlb_lock);
3245 VM_BUG_ON(delta != -1 && delta != 1);
3246
3247 if (delta < 0) {
3248 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
3249 if (h->surplus_huge_pages_node[node])
3250 goto found;
3251 }
3252 } else {
3253 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3254 if (h->surplus_huge_pages_node[node] <
3255 h->nr_huge_pages_node[node])
3256 goto found;
3257 }
3258 }
3259 return 0;
3260
3261found:
3262 h->surplus_huge_pages += delta;
3263 h->surplus_huge_pages_node[node] += delta;
3264 return 1;
3265}
3266
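/*
 * persistent_huge_pages is the number of huge pages in the pool that are
 * not surplus.  set_max_huge_pages() grows or shrinks the persistent
 * pool toward @count (optionally for a single node), holding the hstate
 * resize_lock across the operation.
 */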
3267#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3268static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
3269 nodemask_t *nodes_allowed)
3270{
3271 unsigned long min_count, ret;
3272 struct page *page;
3273 LIST_HEAD(page_list);
3274 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3275
3276
3277
3278
3279
3280
3281 if (node_alloc_noretry)
3282 nodes_clear(*node_alloc_noretry);
3283 else
3284 return -ENOMEM;
3285
3286
3287
3288
3289
3290 mutex_lock(&h->resize_lock);
3291 flush_free_hpage_work(h);
3292 spin_lock_irq(&hugetlb_lock);
3293
3294
3295
3296
3297
3298
3299
3300 if (nid != NUMA_NO_NODE) {
3301 unsigned long old_count = count;
3302
3303 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
3304
3305
3306
3307
3308
3309
3310 if (count < old_count)
3311 count = ULONG_MAX;
3312 }
3313
3314
3315
3316
3317
3318
3319
3320
3321 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3322 if (count > persistent_huge_pages(h)) {
3323 spin_unlock_irq(&hugetlb_lock);
3324 mutex_unlock(&h->resize_lock);
3325 NODEMASK_FREE(node_alloc_noretry);
3326 return -EINVAL;
3327 }
3328
3329 }
3330
	/*
	 * Increase the pool size.  First take surplus pages back into
	 * the persistent pool (cheap), then allocate fresh huge pages
	 * for whatever difference remains.  The allocation loop drops
	 * hugetlb_lock, so recheck the target each iteration and bail
	 * out if a signal is pending.
	 */
3342 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3343 if (!adjust_pool_surplus(h, nodes_allowed, -1))
3344 break;
3345 }
3346
3347 while (count > persistent_huge_pages(h)) {
3348
3349
3350
3351
3352
3353 spin_unlock_irq(&hugetlb_lock);
3354
3355
3356 cond_resched();
3357
3358 ret = alloc_pool_huge_page(h, nodes_allowed,
3359 node_alloc_noretry);
3360 spin_lock_irq(&hugetlb_lock);
3361 if (!ret)
3362 goto out;
3363
3364
3365 if (signal_pending(current))
3366 goto out;
3367 }
3368
	/*
	 * Decrease the pool size.  Free pages are returned to the buddy
	 * allocator, but never below the number of in-use plus reserved
	 * pages (min_count).  Any remaining excess over the requested
	 * count is converted to surplus pages, which go away as their
	 * users free them.
	 */
3384 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3385 min_count = max(count, min_count);
3386 try_to_free_low(h, min_count, nodes_allowed);
3387
3388
3389
3390
3391 while (min_count < persistent_huge_pages(h)) {
3392 page = remove_pool_huge_page(h, nodes_allowed, 0);
3393 if (!page)
3394 break;
3395
3396 list_add(&page->lru, &page_list);
3397 }
3398
3399 spin_unlock_irq(&hugetlb_lock);
3400 update_and_free_pages_bulk(h, &page_list);
3401 flush_free_hpage_work(h);
3402 spin_lock_irq(&hugetlb_lock);
3403
3404 while (count < persistent_huge_pages(h)) {
3405 if (!adjust_pool_surplus(h, nodes_allowed, 1))
3406 break;
3407 }
3408out:
3409 h->max_huge_pages = persistent_huge_pages(h);
3410 spin_unlock_irq(&hugetlb_lock);
3411 mutex_unlock(&h->resize_lock);
3412
3413 NODEMASK_FREE(node_alloc_noretry);
3414
3415 return 0;
3416}
3417
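/*
 * Demote one free huge page of hstate @h into pages of h->demote_order,
 * moving the resulting pages to the target hstate's pool.  Called with
 * hugetlb_lock held; the lock is dropped while the vmemmap is restored
 * and the smaller compound pages are prepared.
 */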
3418static int demote_free_huge_page(struct hstate *h, struct page *page)
3419{
3420 int i, nid = page_to_nid(page);
3421 struct hstate *target_hstate;
3422 int rc = 0;
3423
3424 target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
3425
3426 remove_hugetlb_page_for_demote(h, page, false);
3427 spin_unlock_irq(&hugetlb_lock);
3428
3429 rc = hugetlb_vmemmap_alloc(h, page);
3430 if (rc) {
3431
3432 spin_lock_irq(&hugetlb_lock);
3433 set_page_refcounted(page);
3434 add_hugetlb_page(h, page, false);
3435 return rc;
3436 }
3437
3438
3439
3440
3441
3442 destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452 mutex_lock(&target_hstate->resize_lock);
3453 for (i = 0; i < pages_per_huge_page(h);
3454 i += pages_per_huge_page(target_hstate)) {
3455 if (hstate_is_gigantic(target_hstate))
3456 prep_compound_gigantic_page_for_demote(page + i,
3457 target_hstate->order);
3458 else
3459 prep_compound_page(page + i, target_hstate->order);
3460 set_page_private(page + i, 0);
3461 set_page_refcounted(page + i);
3462 prep_new_huge_page(target_hstate, page + i, nid);
3463 put_page(page + i);
3464 }
3465 mutex_unlock(&target_hstate->resize_lock);
3466
3467 spin_lock_irq(&hugetlb_lock);
3468
3469
3470
3471
3472
3473 h->max_huge_pages--;
3474 target_hstate->max_huge_pages += pages_per_huge_page(h);
3475
3476 return rc;
3477}
3478
3479static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
3480 __must_hold(&hugetlb_lock)
3481{
3482 int nr_nodes, node;
3483 struct page *page;
3484
3485 lockdep_assert_held(&hugetlb_lock);
3486
3487
3488 if (!h->demote_order) {
3489 pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
3490 return -EINVAL;
3491 }
3492
3493 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3494 list_for_each_entry(page, &h->hugepage_freelists[node], lru) {
3495 if (PageHWPoison(page))
3496 continue;
3497
3498 return demote_free_huge_page(h, page);
3499 }
3500 }
3501
3502
3503
3504
3505
3506 return -EBUSY;
3507}
3508
3509#define HSTATE_ATTR_RO(_name) \
3510 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
3511
3512#define HSTATE_ATTR_WO(_name) \
3513 static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
3514
3515#define HSTATE_ATTR(_name) \
3516 static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
3517
3518static struct kobject *hugepages_kobj;
3519static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3520
3521static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
3522
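/*
 * Map a hugepages sysfs kobject back to its hstate.  For the global
 * /sys/kernel/mm/hugepages/ directories *nidp is set to NUMA_NO_NODE;
 * per-node kobjects are resolved via kobj_to_node_hstate().
 */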
3523static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
3524{
3525 int i;
3526
3527 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3528 if (hstate_kobjs[i] == kobj) {
3529 if (nidp)
3530 *nidp = NUMA_NO_NODE;
3531 return &hstates[i];
3532 }
3533
3534 return kobj_to_node_hstate(kobj, nidp);
3535}
3536
3537static ssize_t nr_hugepages_show_common(struct kobject *kobj,
3538 struct kobj_attribute *attr, char *buf)
3539{
3540 struct hstate *h;
3541 unsigned long nr_huge_pages;
3542 int nid;
3543
3544 h = kobj_to_hstate(kobj, &nid);
3545 if (nid == NUMA_NO_NODE)
3546 nr_huge_pages = h->nr_huge_pages;
3547 else
3548 nr_huge_pages = h->nr_huge_pages_node[nid];
3549
3550 return sysfs_emit(buf, "%lu\n", nr_huge_pages);
3551}
3552
3553static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
3554 struct hstate *h, int nid,
3555 unsigned long count, size_t len)
3556{
3557 int err;
3558 nodemask_t nodes_allowed, *n_mask;
3559
3560 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3561 return -EINVAL;
3562
3563 if (nid == NUMA_NO_NODE) {
3564
3565
3566
3567 if (!(obey_mempolicy &&
3568 init_nodemask_of_mempolicy(&nodes_allowed)))
3569 n_mask = &node_states[N_MEMORY];
3570 else
3571 n_mask = &nodes_allowed;
3572 } else {
3573
3574
3575
3576
3577 init_nodemask_of_node(&nodes_allowed, nid);
3578 n_mask = &nodes_allowed;
3579 }
3580
3581 err = set_max_huge_pages(h, count, nid, n_mask);
3582
3583 return err ? err : len;
3584}
3585
3586static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
3587 struct kobject *kobj, const char *buf,
3588 size_t len)
3589{
3590 struct hstate *h;
3591 unsigned long count;
3592 int nid;
3593 int err;
3594
3595 err = kstrtoul(buf, 10, &count);
3596 if (err)
3597 return err;
3598
3599 h = kobj_to_hstate(kobj, &nid);
3600 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
3601}
3602
3603static ssize_t nr_hugepages_show(struct kobject *kobj,
3604 struct kobj_attribute *attr, char *buf)
3605{
3606 return nr_hugepages_show_common(kobj, attr, buf);
3607}
3608
3609static ssize_t nr_hugepages_store(struct kobject *kobj,
3610 struct kobj_attribute *attr, const char *buf, size_t len)
3611{
3612 return nr_hugepages_store_common(false, kobj, buf, len);
3613}
3614HSTATE_ATTR(nr_hugepages);
3615
3616#ifdef CONFIG_NUMA
3617
3618
3619
3620
3621
3622static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
3623 struct kobj_attribute *attr,
3624 char *buf)
3625{
3626 return nr_hugepages_show_common(kobj, attr, buf);
3627}
3628
3629static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
3630 struct kobj_attribute *attr, const char *buf, size_t len)
3631{
3632 return nr_hugepages_store_common(true, kobj, buf, len);
3633}
3634HSTATE_ATTR(nr_hugepages_mempolicy);
3635#endif
3636
3637
3638static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
3639 struct kobj_attribute *attr, char *buf)
3640{
3641 struct hstate *h = kobj_to_hstate(kobj, NULL);
3642 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
3643}
3644
3645static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
3646 struct kobj_attribute *attr, const char *buf, size_t count)
3647{
3648 int err;
3649 unsigned long input;
3650 struct hstate *h = kobj_to_hstate(kobj, NULL);
3651
3652 if (hstate_is_gigantic(h))
3653 return -EINVAL;
3654
3655 err = kstrtoul(buf, 10, &input);
3656 if (err)
3657 return err;
3658
3659 spin_lock_irq(&hugetlb_lock);
3660 h->nr_overcommit_huge_pages = input;
3661 spin_unlock_irq(&hugetlb_lock);
3662
3663 return count;
3664}
3665HSTATE_ATTR(nr_overcommit_hugepages);
3666
3667static ssize_t free_hugepages_show(struct kobject *kobj,
3668 struct kobj_attribute *attr, char *buf)
3669{
3670 struct hstate *h;
3671 unsigned long free_huge_pages;
3672 int nid;
3673
3674 h = kobj_to_hstate(kobj, &nid);
3675 if (nid == NUMA_NO_NODE)
3676 free_huge_pages = h->free_huge_pages;
3677 else
3678 free_huge_pages = h->free_huge_pages_node[nid];
3679
3680 return sysfs_emit(buf, "%lu\n", free_huge_pages);
3681}
3682HSTATE_ATTR_RO(free_hugepages);
3683
3684static ssize_t resv_hugepages_show(struct kobject *kobj,
3685 struct kobj_attribute *attr, char *buf)
3686{
3687 struct hstate *h = kobj_to_hstate(kobj, NULL);
3688 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
3689}
3690HSTATE_ATTR_RO(resv_hugepages);
3691
3692static ssize_t surplus_hugepages_show(struct kobject *kobj,
3693 struct kobj_attribute *attr, char *buf)
3694{
3695 struct hstate *h;
3696 unsigned long surplus_huge_pages;
3697 int nid;
3698
3699 h = kobj_to_hstate(kobj, &nid);
3700 if (nid == NUMA_NO_NODE)
3701 surplus_huge_pages = h->surplus_huge_pages;
3702 else
3703 surplus_huge_pages = h->surplus_huge_pages_node[nid];
3704
3705 return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
3706}
3707HSTATE_ATTR_RO(surplus_hugepages);
3708
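/*
 * "demote" sysfs attribute: demote up to the requested number of free
 * huge pages (excluding reserved ones) to the size previously selected
 * via "demote_size".
 */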
3709static ssize_t demote_store(struct kobject *kobj,
3710 struct kobj_attribute *attr, const char *buf, size_t len)
3711{
3712 unsigned long nr_demote;
3713 unsigned long nr_available;
3714 nodemask_t nodes_allowed, *n_mask;
3715 struct hstate *h;
3716 int err = 0;
3717 int nid;
3718
3719 err = kstrtoul(buf, 10, &nr_demote);
3720 if (err)
3721 return err;
3722 h = kobj_to_hstate(kobj, &nid);
3723
3724 if (nid != NUMA_NO_NODE) {
3725 init_nodemask_of_node(&nodes_allowed, nid);
3726 n_mask = &nodes_allowed;
3727 } else {
3728 n_mask = &node_states[N_MEMORY];
3729 }
3730
3731
3732 mutex_lock(&h->resize_lock);
3733 spin_lock_irq(&hugetlb_lock);
3734
3735 while (nr_demote) {
3736
3737
3738
3739
3740 if (nid != NUMA_NO_NODE)
3741 nr_available = h->free_huge_pages_node[nid];
3742 else
3743 nr_available = h->free_huge_pages;
3744 nr_available -= h->resv_huge_pages;
3745 if (!nr_available)
3746 break;
3747
3748 err = demote_pool_huge_page(h, n_mask);
3749 if (err)
3750 break;
3751
3752 nr_demote--;
3753 }
3754
3755 spin_unlock_irq(&hugetlb_lock);
3756 mutex_unlock(&h->resize_lock);
3757
3758 if (err)
3759 return err;
3760 return len;
3761}
3762HSTATE_ATTR_WO(demote);
3763
3764static ssize_t demote_size_show(struct kobject *kobj,
3765 struct kobj_attribute *attr, char *buf)
3766{
3767 int nid;
3768 struct hstate *h = kobj_to_hstate(kobj, &nid);
3769 unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
3770
3771 return sysfs_emit(buf, "%lukB\n", demote_size);
3772}
3773
3774static ssize_t demote_size_store(struct kobject *kobj,
3775 struct kobj_attribute *attr,
3776 const char *buf, size_t count)
3777{
3778 struct hstate *h, *demote_hstate;
3779 unsigned long demote_size;
3780 unsigned int demote_order;
3781 int nid;
3782
3783 demote_size = (unsigned long)memparse(buf, NULL);
3784
3785 demote_hstate = size_to_hstate(demote_size);
3786 if (!demote_hstate)
3787 return -EINVAL;
3788 demote_order = demote_hstate->order;
3789 if (demote_order < HUGETLB_PAGE_ORDER)
3790 return -EINVAL;
3791
3792
3793 h = kobj_to_hstate(kobj, &nid);
3794 if (demote_order >= h->order)
3795 return -EINVAL;
3796
3797
3798 mutex_lock(&h->resize_lock);
3799 h->demote_order = demote_order;
3800 mutex_unlock(&h->resize_lock);
3801
3802 return count;
3803}
3804HSTATE_ATTR(demote_size);
3805
3806static struct attribute *hstate_attrs[] = {
3807 &nr_hugepages_attr.attr,
3808 &nr_overcommit_hugepages_attr.attr,
3809 &free_hugepages_attr.attr,
3810 &resv_hugepages_attr.attr,
3811 &surplus_hugepages_attr.attr,
3812#ifdef CONFIG_NUMA
3813 &nr_hugepages_mempolicy_attr.attr,
3814#endif
3815 NULL,
3816};
3817
3818static const struct attribute_group hstate_attr_group = {
3819 .attrs = hstate_attrs,
3820};
3821
3822static struct attribute *hstate_demote_attrs[] = {
3823 &demote_size_attr.attr,
3824 &demote_attr.attr,
3825 NULL,
3826};
3827
3828static const struct attribute_group hstate_demote_attr_group = {
3829 .attrs = hstate_demote_attrs,
3830};
3831
3832static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
3833 struct kobject **hstate_kobjs,
3834 const struct attribute_group *hstate_attr_group)
3835{
3836 int retval;
3837 int hi = hstate_index(h);
3838
3839 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
3840 if (!hstate_kobjs[hi])
3841 return -ENOMEM;
3842
3843 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
3844 if (retval) {
3845 kobject_put(hstate_kobjs[hi]);
3846 hstate_kobjs[hi] = NULL;
3847 }
3848
3849 if (h->demote_order) {
3850 if (sysfs_create_group(hstate_kobjs[hi],
3851 &hstate_demote_attr_group))
3852 pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
3853 }
3854
3855 return retval;
3856}
3857
3858static void __init hugetlb_sysfs_init(void)
3859{
3860 struct hstate *h;
3861 int err;
3862
3863 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
3864 if (!hugepages_kobj)
3865 return;
3866
3867 for_each_hstate(h) {
3868 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
3869 hstate_kobjs, &hstate_attr_group);
3870 if (err)
3871 pr_err("HugeTLB: Unable to add hstate %s", h->name);
3872 }
3873}
3874
3875#ifdef CONFIG_NUMA
3876
3877
3878
3879
3880
3881
3882
3883
3884struct node_hstate {
3885 struct kobject *hugepages_kobj;
3886 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3887};
3888static struct node_hstate node_hstates[MAX_NUMNODES];
3889
3890
3891
3892
3893static struct attribute *per_node_hstate_attrs[] = {
3894 &nr_hugepages_attr.attr,
3895 &free_hugepages_attr.attr,
3896 &surplus_hugepages_attr.attr,
3897 NULL,
3898};
3899
3900static const struct attribute_group per_node_hstate_attr_group = {
3901 .attrs = per_node_hstate_attrs,
3902};
3903
3904
3905
3906
3907
3908static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3909{
3910 int nid;
3911
3912 for (nid = 0; nid < nr_node_ids; nid++) {
3913 struct node_hstate *nhs = &node_hstates[nid];
3914 int i;
3915 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3916 if (nhs->hstate_kobjs[i] == kobj) {
3917 if (nidp)
3918 *nidp = nid;
3919 return &hstates[i];
3920 }
3921 }
3922
3923 BUG();
3924 return NULL;
3925}
3926
3927
3928
3929
3930
3931static void hugetlb_unregister_node(struct node *node)
3932{
3933 struct hstate *h;
3934 struct node_hstate *nhs = &node_hstates[node->dev.id];
3935
3936 if (!nhs->hugepages_kobj)
3937 return;
3938
3939 for_each_hstate(h) {
3940 int idx = hstate_index(h);
3941 if (nhs->hstate_kobjs[idx]) {
3942 kobject_put(nhs->hstate_kobjs[idx]);
3943 nhs->hstate_kobjs[idx] = NULL;
3944 }
3945 }
3946
3947 kobject_put(nhs->hugepages_kobj);
3948 nhs->hugepages_kobj = NULL;
3949}
3950
3951
3952
3953
3954
3955
3956static void hugetlb_register_node(struct node *node)
3957{
3958 struct hstate *h;
3959 struct node_hstate *nhs = &node_hstates[node->dev.id];
3960 int err;
3961
3962 if (nhs->hugepages_kobj)
3963 return;
3964
3965 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
3966 &node->dev.kobj);
3967 if (!nhs->hugepages_kobj)
3968 return;
3969
3970 for_each_hstate(h) {
3971 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
3972 nhs->hstate_kobjs,
3973 &per_node_hstate_attr_group);
3974 if (err) {
3975 pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
3976 h->name, node->dev.id);
3977 hugetlb_unregister_node(node);
3978 break;
3979 }
3980 }
3981}
3982
3983
3984
3985
3986
3987
3988static void __init hugetlb_register_all_nodes(void)
3989{
3990 int nid;
3991
3992 for_each_node_state(nid, N_MEMORY) {
3993 struct node *node = node_devices[nid];
3994 if (node->dev.id == nid)
3995 hugetlb_register_node(node);
3996 }
3997
3998
3999
4000
4001
4002 register_hugetlbfs_with_node(hugetlb_register_node,
4003 hugetlb_unregister_node);
4004}
4005#else
4006
4007static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
4008{
4009 BUG();
4010 if (nidp)
4011 *nidp = -1;
4012 return NULL;
4013}
4014
4015static void hugetlb_register_all_nodes(void) { }
4016
4017#endif
4018
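/*
 * Late boot initialization: set up the default hstate, allocate the
 * command-line requested pools, and register the sysfs, per-node and
 * cgroup interfaces plus the fault mutex table.
 */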
4019static int __init hugetlb_init(void)
4020{
4021 int i;
4022
4023 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4024 __NR_HPAGEFLAGS);
4025
4026 if (!hugepages_supported()) {
4027 if (hugetlb_max_hstate || default_hstate_max_huge_pages)
4028 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4029 return 0;
4030 }
4031
4032
4033
4034
4035
4036 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4037 if (!parsed_default_hugepagesz) {
4038
4039
4040
4041
4042
4043
4044
4045
4046 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
4047 if (default_hstate_max_huge_pages) {
4048 if (default_hstate.max_huge_pages) {
4049 char buf[32];
4050
4051 string_get_size(huge_page_size(&default_hstate),
4052 1, STRING_UNITS_2, buf, 32);
4053 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4054 default_hstate.max_huge_pages, buf);
4055 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4056 default_hstate_max_huge_pages);
4057 }
4058 default_hstate.max_huge_pages =
4059 default_hstate_max_huge_pages;
4060
4061 for_each_online_node(i)
4062 default_hstate.max_huge_pages_node[i] =
4063 default_hugepages_in_node[i];
4064 }
4065 }
4066
4067 hugetlb_cma_check();
4068 hugetlb_init_hstates();
4069 gather_bootmem_prealloc();
4070 report_hugepages();
4071
4072 hugetlb_sysfs_init();
4073 hugetlb_register_all_nodes();
4074 hugetlb_cgroup_file_init();
4075
4076#ifdef CONFIG_SMP
4077 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
4078#else
4079 num_fault_mutexes = 1;
4080#endif
4081 hugetlb_fault_mutex_table =
4082 kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
4083 GFP_KERNEL);
4084 BUG_ON(!hugetlb_fault_mutex_table);
4085
4086 for (i = 0; i < num_fault_mutexes; i++)
4087 mutex_init(&hugetlb_fault_mutex_table[i]);
4088 return 0;
4089}
4090subsys_initcall(hugetlb_init);
4091
4092
4093bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
4094{
4095 return size == HPAGE_SIZE;
4096}
4097
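/*
 * Register a new hstate for huge pages of size PAGE_SIZE << order.
 * Boot time only; silently returns if the size is already registered.
 */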
4098void __init hugetlb_add_hstate(unsigned int order)
4099{
4100 struct hstate *h;
4101 unsigned long i;
4102
4103 if (size_to_hstate(PAGE_SIZE << order)) {
4104 return;
4105 }
4106 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4107 BUG_ON(order == 0);
4108 h = &hstates[hugetlb_max_hstate++];
4109 mutex_init(&h->resize_lock);
4110 h->order = order;
4111 h->mask = ~(huge_page_size(h) - 1);
4112 for (i = 0; i < MAX_NUMNODES; ++i)
4113 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4114 INIT_LIST_HEAD(&h->hugepage_activelist);
4115 h->next_nid_to_alloc = first_memory_node;
4116 h->next_nid_to_free = first_memory_node;
4117 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4118 huge_page_size(h)/1024);
4119 hugetlb_vmemmap_init(h);
4120
4121 parsed_hstate = h;
4122}
4123
4124bool __init __weak hugetlb_node_alloc_supported(void)
4125{
4126 return true;
4127}
4128
4129static void __init hugepages_clear_pages_in_node(void)
4130{
4131 if (!hugetlb_max_hstate) {
4132 default_hstate_max_huge_pages = 0;
4133 memset(default_hugepages_in_node, 0,
4134 MAX_NUMNODES * sizeof(unsigned int));
4135 } else {
4136 parsed_hstate->max_huge_pages = 0;
4137 memset(parsed_hstate->max_huge_pages_node, 0,
4138 MAX_NUMNODES * sizeof(unsigned int));
4139 }
4140}
4141
/*
 * hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
 * specification.  If not, ignore the hugepages value.  hugepages can also
 * be the first huge page command line option, in which case it implicitly
 * sets the number of huge pages for the default size.
 */
4149static int __init hugepages_setup(char *s)
4150{
4151 unsigned long *mhp;
4152 static unsigned long *last_mhp;
4153 int node = NUMA_NO_NODE;
4154 int count;
4155 unsigned long tmp;
4156 char *p = s;
4157
4158 if (!parsed_valid_hugepagesz) {
4159 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
4160 parsed_valid_hugepagesz = true;
4161 return 1;
4162 }
4163
	/*
	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz=
	 * parameter yet, so this hugepages= parameter goes to the
	 * "default hstate".
	 */
4170 else if (!hugetlb_max_hstate)
4171 mhp = &default_hstate_max_huge_pages;
4172 else
4173 mhp = &parsed_hstate->max_huge_pages;
4174
4175 if (mhp == last_mhp) {
4176 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
4177 return 1;
4178 }
4179
4180 while (*p) {
4181 count = 0;
4182 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4183 goto invalid;
4184
4185 if (p[count] == ':') {
4186 if (!hugetlb_node_alloc_supported()) {
4187 pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4188 return 1;
4189 }
4190 if (tmp >= MAX_NUMNODES || !node_online(tmp))
4191 goto invalid;
4192 node = array_index_nospec(tmp, MAX_NUMNODES);
4193 p += count + 1;
4194
4195 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4196 goto invalid;
4197 if (!hugetlb_max_hstate)
4198 default_hugepages_in_node[node] = tmp;
4199 else
4200 parsed_hstate->max_huge_pages_node[node] = tmp;
4201 *mhp += tmp;
4202
4203 if (p[count] == ',')
4204 p += count + 1;
4205 else
4206 break;
4207 } else {
4208 if (p != s)
4209 goto invalid;
4210 *mhp = tmp;
4211 break;
4212 }
4213 }
4214
	/*
	 * Global state is always initialized later in hugetlb_init(),
	 * but gigantic hstates must be allocated here, early, while the
	 * bootmem allocator is still available.
	 */
4220 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
4221 hugetlb_hstate_alloc_pages(parsed_hstate);
4222
4223 last_mhp = mhp;
4224
4225 return 1;
4226
4227invalid:
4228 pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4229 hugepages_clear_pages_in_node();
4230 return 1;
4231}
4232__setup("hugepages=", hugepages_setup);
4233
4234
4235
4236
4237
4238
4239
4240
4241static int __init hugepagesz_setup(char *s)
4242{
4243 unsigned long size;
4244 struct hstate *h;
4245
4246 parsed_valid_hugepagesz = false;
4247 size = (unsigned long)memparse(s, NULL);
4248
4249 if (!arch_hugetlb_valid_size(size)) {
4250 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
4251 return 1;
4252 }
4253
4254 h = size_to_hstate(size);
4255 if (h) {
		/*
		 * An hstate for this size already exists.  This is
		 * normally an error, but it is allowed when the existing
		 * hstate is the default hstate and the number of default
		 * huge pages was not previously specified.
		 */
4263 if (!parsed_default_hugepagesz || h != &default_hstate ||
4264 default_hstate.max_huge_pages) {
4265 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
4266 return 1;
4267 }
4268
4269
4270
4271
4272
4273
4274 parsed_hstate = h;
4275 parsed_valid_hugepagesz = true;
4276 return 1;
4277 }
4278
4279 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4280 parsed_valid_hugepagesz = true;
4281 return 1;
4282}
4283__setup("hugepagesz=", hugepagesz_setup);
4284
4285
4286
4287
4288
4289static int __init default_hugepagesz_setup(char *s)
4290{
4291 unsigned long size;
4292 int i;
4293
4294 parsed_valid_hugepagesz = false;
4295 if (parsed_default_hugepagesz) {
4296 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
4297 return 1;
4298 }
4299
4300 size = (unsigned long)memparse(s, NULL);
4301
4302 if (!arch_hugetlb_valid_size(size)) {
4303 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
4304 return 1;
4305 }
4306
4307 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4308 parsed_valid_hugepagesz = true;
4309 parsed_default_hugepagesz = true;
4310 default_hstate_idx = hstate_index(size_to_hstate(size));
4311
4312
4313
4314
4315
4316
4317
4318
4319 if (default_hstate_max_huge_pages) {
4320 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4321 for_each_online_node(i)
4322 default_hstate.max_huge_pages_node[i] =
4323 default_hugepages_in_node[i];
4324 if (hstate_is_gigantic(&default_hstate))
4325 hugetlb_hstate_alloc_pages(&default_hstate);
4326 default_hstate_max_huge_pages = 0;
4327 }
4328
4329 return 1;
4330}
4331__setup("default_hugepagesz=", default_hugepagesz_setup);
4332
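/*
 * Number of free huge pages on the nodes this task is allowed to use,
 * taking both the cpuset and the task's memory policy into account.
 */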
4333static unsigned int allowed_mems_nr(struct hstate *h)
4334{
4335 int node;
4336 unsigned int nr = 0;
4337 nodemask_t *mpol_allowed;
4338 unsigned int *array = h->free_huge_pages_node;
4339 gfp_t gfp_mask = htlb_alloc_mask(h);
4340
4341 mpol_allowed = policy_nodemask_current(gfp_mask);
4342
4343 for_each_node_mask(node, cpuset_current_mems_allowed) {
4344 if (!mpol_allowed || node_isset(node, *mpol_allowed))
4345 nr += array[node];
4346 }
4347
4348 return nr;
4349}
4350
4351#ifdef CONFIG_SYSCTL
4352static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
4353 void *buffer, size_t *length,
4354 loff_t *ppos, unsigned long *out)
4355{
4356 struct ctl_table dup_table;
4357
4358
4359
4360
4361
4362 dup_table = *table;
4363 dup_table.data = out;
4364
4365 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
4366}
4367
4368static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
4369 struct ctl_table *table, int write,
4370 void *buffer, size_t *length, loff_t *ppos)
4371{
4372 struct hstate *h = &default_hstate;
4373 unsigned long tmp = h->max_huge_pages;
4374 int ret;
4375
4376 if (!hugepages_supported())
4377 return -EOPNOTSUPP;
4378
4379 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4380 &tmp);
4381 if (ret)
4382 goto out;
4383
4384 if (write)
4385 ret = __nr_hugepages_store_common(obey_mempolicy, h,
4386 NUMA_NO_NODE, tmp, *length);
4387out:
4388 return ret;
4389}
4390
4391int hugetlb_sysctl_handler(struct ctl_table *table, int write,
4392 void *buffer, size_t *length, loff_t *ppos)
4393{
4394
4395 return hugetlb_sysctl_handler_common(false, table, write,
4396 buffer, length, ppos);
4397}
4398
4399#ifdef CONFIG_NUMA
4400int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
4401 void *buffer, size_t *length, loff_t *ppos)
4402{
4403 return hugetlb_sysctl_handler_common(true, table, write,
4404 buffer, length, ppos);
4405}
4406#endif
4407
4408int hugetlb_overcommit_handler(struct ctl_table *table, int write,
4409 void *buffer, size_t *length, loff_t *ppos)
4410{
4411 struct hstate *h = &default_hstate;
4412 unsigned long tmp;
4413 int ret;
4414
4415 if (!hugepages_supported())
4416 return -EOPNOTSUPP;
4417
4418 tmp = h->nr_overcommit_huge_pages;
4419
4420 if (write && hstate_is_gigantic(h))
4421 return -EINVAL;
4422
4423 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4424 &tmp);
4425 if (ret)
4426 goto out;
4427
4428 if (write) {
4429 spin_lock_irq(&hugetlb_lock);
4430 h->nr_overcommit_huge_pages = tmp;
4431 spin_unlock_irq(&hugetlb_lock);
4432 }
4433out:
4434 return ret;
4435}
4436
4437#endif
4438
4439void hugetlb_report_meminfo(struct seq_file *m)
4440{
4441 struct hstate *h;
4442 unsigned long total = 0;
4443
4444 if (!hugepages_supported())
4445 return;
4446
4447 for_each_hstate(h) {
4448 unsigned long count = h->nr_huge_pages;
4449
4450 total += huge_page_size(h) * count;
4451
4452 if (h == &default_hstate)
4453 seq_printf(m,
4454 "HugePages_Total: %5lu\n"
4455 "HugePages_Free: %5lu\n"
4456 "HugePages_Rsvd: %5lu\n"
4457 "HugePages_Surp: %5lu\n"
4458 "Hugepagesize: %8lu kB\n",
4459 count,
4460 h->free_huge_pages,
4461 h->resv_huge_pages,
4462 h->surplus_huge_pages,
4463 huge_page_size(h) / SZ_1K);
4464 }
4465
4466 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
4467}
4468
4469int hugetlb_report_node_meminfo(char *buf, int len, int nid)
4470{
4471 struct hstate *h = &default_hstate;
4472
4473 if (!hugepages_supported())
4474 return 0;
4475
4476 return sysfs_emit_at(buf, len,
4477 "Node %d HugePages_Total: %5u\n"
4478 "Node %d HugePages_Free: %5u\n"
4479 "Node %d HugePages_Surp: %5u\n",
4480 nid, h->nr_huge_pages_node[nid],
4481 nid, h->free_huge_pages_node[nid],
4482 nid, h->surplus_huge_pages_node[nid]);
4483}
4484
4485void hugetlb_show_meminfo(void)
4486{
4487 struct hstate *h;
4488 int nid;
4489
4490 if (!hugepages_supported())
4491 return;
4492
4493 for_each_node_state(nid, N_MEMORY)
4494 for_each_hstate(h)
4495 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
4496 nid,
4497 h->nr_huge_pages_node[nid],
4498 h->free_huge_pages_node[nid],
4499 h->surplus_huge_pages_node[nid],
4500 huge_page_size(h) / SZ_1K);
4501}
4502
4503void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
4504{
4505 seq_printf(m, "HugetlbPages:\t%8lu kB\n",
4506 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
4507}
4508
4509
4510unsigned long hugetlb_total_pages(void)
4511{
4512 struct hstate *h;
4513 unsigned long nr_total_pages = 0;
4514
4515 for_each_hstate(h)
4516 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
4517 return nr_total_pages;
4518}
4519
4520static int hugetlb_acct_memory(struct hstate *h, long delta)
4521{
4522 int ret = -ENOMEM;
4523
4524 if (!delta)
4525 return 0;
4526
4527 spin_lock_irq(&hugetlb_lock);
	/*
	 * Huge page reservations are tracked globally, not per node, so
	 * a task confined by cpusets or a memory policy may still fail
	 * to get a page at fault time even though its reservation
	 * succeeded.  As a best-effort sanity check, refuse a positive
	 * delta that exceeds allowed_mems_nr(), the number of free huge
	 * pages on the nodes this task is allowed to use.
	 */
4551 if (delta > 0) {
4552 if (gather_surplus_pages(h, delta) < 0)
4553 goto out;
4554
4555 if (delta > allowed_mems_nr(h)) {
4556 return_unused_surplus_pages(h, delta);
4557 goto out;
4558 }
4559 }
4560
4561 ret = 0;
4562 if (delta < 0)
4563 return_unused_surplus_pages(h, (unsigned long) -delta);
4564
4565out:
4566 spin_unlock_irq(&hugetlb_lock);
4567 return ret;
4568}
4569
4570static void hugetlb_vm_op_open(struct vm_area_struct *vma)
4571{
4572 struct resv_map *resv = vma_resv_map(vma);
4573
	/*
	 * This new VMA should share its siblings' reservation map.  The
	 * VMA only ever has a valid reservation map pointer while it is
	 * being copied from a still-existing VMA, which holds its own
	 * reference on the map, so it is safe to take another reference
	 * here without extra locking.
	 */
4582 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
4583 resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
4584 kref_get(&resv->refs);
4585 }
4586}
4587
4588static void hugetlb_vm_op_close(struct vm_area_struct *vma)
4589{
4590 struct hstate *h = hstate_vma(vma);
4591 struct resv_map *resv = vma_resv_map(vma);
4592 struct hugepage_subpool *spool = subpool_vma(vma);
4593 unsigned long reserve, start, end;
4594 long gbl_reserve;
4595
4596 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4597 return;
4598
4599 start = vma_hugecache_offset(h, vma, vma->vm_start);
4600 end = vma_hugecache_offset(h, vma, vma->vm_end);
4601
4602 reserve = (end - start) - region_count(resv, start, end);
4603 hugetlb_cgroup_uncharge_counter(resv, start, end);
4604 if (reserve) {
4605
4606
4607
4608
4609 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
4610 hugetlb_acct_memory(h, -gbl_reserve);
4611 }
4612
4613 kref_put(&resv->refs, resv_map_release);
4614}
4615
4616static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
4617{
4618 if (addr & ~(huge_page_mask(hstate_vma(vma))))
4619 return -EINVAL;
4620 return 0;
4621}
4622
4623static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
4624{
4625 return huge_page_size(hstate_vma(vma));
4626}
4627
4628
4629
4630
4631
4632
4633
4634static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
4635{
4636 BUG();
4637 return 0;
4638}
4639
4640
4641
4642
4643
4644
4645
4646
4647const struct vm_operations_struct hugetlb_vm_ops = {
4648 .fault = hugetlb_vm_op_fault,
4649 .open = hugetlb_vm_op_open,
4650 .close = hugetlb_vm_op_close,
4651 .may_split = hugetlb_vm_op_split,
4652 .pagesize = hugetlb_vm_op_pagesize,
4653};
4654
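/*
 * Construct the huge pte for @page in @vma: writable and dirty when
 * @writable is set, write-protected otherwise.
 */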
4655static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
4656 int writable)
4657{
4658 pte_t entry;
4659 unsigned int shift = huge_page_shift(hstate_vma(vma));
4660
4661 if (writable) {
4662 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
4663 vma->vm_page_prot)));
4664 } else {
4665 entry = huge_pte_wrprotect(mk_huge_pte(page,
4666 vma->vm_page_prot));
4667 }
4668 entry = pte_mkyoung(entry);
4669 entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
4670
4671 return entry;
4672}
4673
4674static void set_huge_ptep_writable(struct vm_area_struct *vma,
4675 unsigned long address, pte_t *ptep)
4676{
4677 pte_t entry;
4678
4679 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
4680 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
4681 update_mmu_cache(vma, address, ptep);
4682}
4683
4684bool is_hugetlb_entry_migration(pte_t pte)
4685{
4686 swp_entry_t swp;
4687
4688 if (huge_pte_none(pte) || pte_present(pte))
4689 return false;
4690 swp = pte_to_swp_entry(pte);
4691 if (is_migration_entry(swp))
4692 return true;
4693 else
4694 return false;
4695}
4696
4697static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
4698{
4699 swp_entry_t swp;
4700
4701 if (huge_pte_none(pte) || pte_present(pte))
4702 return false;
4703 swp = pte_to_swp_entry(pte);
4704 if (is_hwpoison_entry(swp))
4705 return true;
4706 else
4707 return false;
4708}
4709
4710static void
4711hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
4712 struct page *new_page)
4713{
4714 __SetPageUptodate(new_page);
4715 hugepage_add_new_anon_rmap(new_page, vma, addr);
4716 set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
4717 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
4718 ClearHPageRestoreReserve(new_page);
4719 SetHPageMigratable(new_page);
4720}
4721
4722int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
4723 struct vm_area_struct *dst_vma,
4724 struct vm_area_struct *src_vma)
4725{
4726 pte_t *src_pte, *dst_pte, entry, dst_entry;
4727 struct page *ptepage;
4728 unsigned long addr;
4729 bool cow = is_cow_mapping(src_vma->vm_flags);
4730 struct hstate *h = hstate_vma(src_vma);
4731 unsigned long sz = huge_page_size(h);
4732 unsigned long npages = pages_per_huge_page(h);
4733 struct address_space *mapping = src_vma->vm_file->f_mapping;
4734 struct mmu_notifier_range range;
4735 int ret = 0;
4736
4737 if (cow) {
4738 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
4739 src_vma->vm_start,
4740 src_vma->vm_end);
4741 mmu_notifier_invalidate_range_start(&range);
4742 mmap_assert_write_locked(src);
4743 raw_write_seqcount_begin(&src->write_protect_seq);
4744 } else {
4745
4746
4747
4748
4749
4750
4751 i_mmap_lock_read(mapping);
4752 }
4753
4754 for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
4755 spinlock_t *src_ptl, *dst_ptl;
4756 src_pte = huge_pte_offset(src, addr, sz);
4757 if (!src_pte)
4758 continue;
4759 dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
4760 if (!dst_pte) {
4761 ret = -ENOMEM;
4762 break;
4763 }
4764
		/*
		 * If the page tables are shared (dst_pte == src_pte)
		 * there is nothing to copy or take references on.  Also
		 * skip entries that are already populated in the
		 * destination: src may have unshared a pmd that dst
		 * still shares, leaving dst_pte already filled in.
		 */
4774 dst_entry = huge_ptep_get(dst_pte);
4775 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
4776 continue;
4777
4778 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4779 src_ptl = huge_pte_lockptr(h, src, src_pte);
4780 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4781 entry = huge_ptep_get(src_pte);
4782 dst_entry = huge_ptep_get(dst_pte);
4783again:
4784 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
4785
4786
4787
4788
4789
4790 ;
4791 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
4792 bool uffd_wp = huge_pte_uffd_wp(entry);
4793
4794 if (!userfaultfd_wp(dst_vma) && uffd_wp)
4795 entry = huge_pte_clear_uffd_wp(entry);
4796 set_huge_pte_at(dst, addr, dst_pte, entry);
4797 } else if (unlikely(is_hugetlb_entry_migration(entry))) {
4798 swp_entry_t swp_entry = pte_to_swp_entry(entry);
4799 bool uffd_wp = huge_pte_uffd_wp(entry);
4800
4801 if (!is_readable_migration_entry(swp_entry) && cow) {
4802
4803
4804
4805
4806 swp_entry = make_readable_migration_entry(
4807 swp_offset(swp_entry));
4808 entry = swp_entry_to_pte(swp_entry);
4809 if (userfaultfd_wp(src_vma) && uffd_wp)
4810 entry = huge_pte_mkuffd_wp(entry);
4811 set_huge_swap_pte_at(src, addr, src_pte,
4812 entry, sz);
4813 }
4814 if (!userfaultfd_wp(dst_vma) && uffd_wp)
4815 entry = huge_pte_clear_uffd_wp(entry);
4816 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
4817 } else if (unlikely(is_pte_marker(entry))) {
4818
4819
4820
4821
4822 if (userfaultfd_wp(dst_vma))
4823 set_huge_pte_at(dst, addr, dst_pte, entry);
4824 } else {
4825 entry = huge_ptep_get(src_pte);
4826 ptepage = pte_page(entry);
4827 get_page(ptepage);
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839 if (!PageAnon(ptepage)) {
4840 page_dup_file_rmap(ptepage, true);
4841 } else if (page_try_dup_anon_rmap(ptepage, true,
4842 src_vma)) {
4843 pte_t src_pte_old = entry;
4844 struct page *new;
4845
4846 spin_unlock(src_ptl);
4847 spin_unlock(dst_ptl);
4848
4849 new = alloc_huge_page(dst_vma, addr, 1);
4850 if (IS_ERR(new)) {
4851 put_page(ptepage);
4852 ret = PTR_ERR(new);
4853 break;
4854 }
4855 copy_user_huge_page(new, ptepage, addr, dst_vma,
4856 npages);
4857 put_page(ptepage);
4858
4859
4860 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4861 src_ptl = huge_pte_lockptr(h, src, src_pte);
4862 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4863 entry = huge_ptep_get(src_pte);
4864 if (!pte_same(src_pte_old, entry)) {
4865 restore_reserve_on_error(h, dst_vma, addr,
4866 new);
4867 put_page(new);
4868
4869 goto again;
4870 }
4871 hugetlb_install_page(dst_vma, dst_pte, addr, new);
4872 spin_unlock(src_ptl);
4873 spin_unlock(dst_ptl);
4874 continue;
4875 }
4876
4877 if (cow) {
4878
4879
4880
4881
4882
4883
4884
4885 huge_ptep_set_wrprotect(src, addr, src_pte);
4886 entry = huge_pte_wrprotect(entry);
4887 }
4888
4889 set_huge_pte_at(dst, addr, dst_pte, entry);
4890 hugetlb_count_add(npages, dst);
4891 }
4892 spin_unlock(src_ptl);
4893 spin_unlock(dst_ptl);
4894 }
4895
4896 if (cow) {
4897 raw_write_seqcount_end(&src->write_protect_seq);
4898 mmu_notifier_invalidate_range_end(&range);
4899 } else {
4900 i_mmap_unlock_read(mapping);
4901 }
4902
4903 return ret;
4904}
4905
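/*
 * Move a single huge pte from old_addr to new_addr for mremap(), taking
 * both page table locks (which may be the same lock).
 */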
4906static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
4907 unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
4908{
4909 struct hstate *h = hstate_vma(vma);
4910 struct mm_struct *mm = vma->vm_mm;
4911 spinlock_t *src_ptl, *dst_ptl;
4912 pte_t pte;
4913
4914 dst_ptl = huge_pte_lock(h, mm, dst_pte);
4915 src_ptl = huge_pte_lockptr(h, mm, src_pte);
4916
4917
4918
4919
4920
4921 if (src_ptl != dst_ptl)
4922 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4923
4924 pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
4925 set_huge_pte_at(mm, new_addr, dst_pte, pte);
4926
4927 if (src_ptl != dst_ptl)
4928 spin_unlock(src_ptl);
4929 spin_unlock(dst_ptl);
4930}
4931
4932int move_hugetlb_page_tables(struct vm_area_struct *vma,
4933 struct vm_area_struct *new_vma,
4934 unsigned long old_addr, unsigned long new_addr,
4935 unsigned long len)
4936{
4937 struct hstate *h = hstate_vma(vma);
4938 struct address_space *mapping = vma->vm_file->f_mapping;
4939 unsigned long sz = huge_page_size(h);
4940 struct mm_struct *mm = vma->vm_mm;
4941 unsigned long old_end = old_addr + len;
4942 unsigned long old_addr_copy;
4943 pte_t *src_pte, *dst_pte;
4944 struct mmu_notifier_range range;
4945 bool shared_pmd = false;
4946
4947 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
4948 old_end);
4949 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4950
4951
4952
4953
4954 flush_cache_range(vma, range.start, range.end);
4955
4956 mmu_notifier_invalidate_range_start(&range);
4957
4958 i_mmap_lock_write(mapping);
4959 for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
4960 src_pte = huge_pte_offset(mm, old_addr, sz);
4961 if (!src_pte)
4962 continue;
4963 if (huge_pte_none(huge_ptep_get(src_pte)))
4964 continue;
4965
4966
4967
4968
4969
4970 old_addr_copy = old_addr;
4971
4972 if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
4973 shared_pmd = true;
4974 continue;
4975 }
4976
4977 dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
4978 if (!dst_pte)
4979 break;
4980
4981 move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
4982 }
4983
4984 if (shared_pmd)
4985 flush_tlb_range(vma, range.start, range.end);
4986 else
4987 flush_tlb_range(vma, old_end - len, old_end);
4988 mmu_notifier_invalidate_range_end(&range);
4989 i_mmap_unlock_write(mapping);
4990
4991 return len + old_addr - old_end;
4992}
4993
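/*
 * Unmap huge pages in [start, end) from @vma.  If @ref_page is given,
 * only that page is unmapped.  Shared pmds are unshared rather than
 * cleared, which forces an explicit TLB flush before returning.
 */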
4994static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
4995 unsigned long start, unsigned long end,
4996 struct page *ref_page, zap_flags_t zap_flags)
4997{
4998 struct mm_struct *mm = vma->vm_mm;
4999 unsigned long address;
5000 pte_t *ptep;
5001 pte_t pte;
5002 spinlock_t *ptl;
5003 struct page *page;
5004 struct hstate *h = hstate_vma(vma);
5005 unsigned long sz = huge_page_size(h);
5006 struct mmu_notifier_range range;
5007 bool force_flush = false;
5008
5009 WARN_ON(!is_vm_hugetlb_page(vma));
5010 BUG_ON(start & ~huge_page_mask(h));
5011 BUG_ON(end & ~huge_page_mask(h));
5012
5013
5014
5015
5016
5017 tlb_change_page_size(tlb, sz);
5018 tlb_start_vma(tlb, vma);
5019
5020
5021
5022
5023 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
5024 end);
5025 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
5026 mmu_notifier_invalidate_range_start(&range);
5027 address = start;
5028 for (; address < end; address += sz) {
5029 ptep = huge_pte_offset(mm, address, sz);
5030 if (!ptep)
5031 continue;
5032
5033 ptl = huge_pte_lock(h, mm, ptep);
5034 if (huge_pmd_unshare(mm, vma, &address, ptep)) {
5035 spin_unlock(ptl);
5036 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
5037 force_flush = true;
5038 continue;
5039 }
5040
5041 pte = huge_ptep_get(ptep);
5042 if (huge_pte_none(pte)) {
5043 spin_unlock(ptl);
5044 continue;
5045 }
5046
5047
5048
5049
5050
5051 if (unlikely(!pte_present(pte))) {
		/*
		 * Non-present entry (migration or poison swap entry):
		 * just clear it, but preserve the uffd-wp state as a pte
		 * marker unless the caller asked us to drop markers.
		 */
5058 if (pte_swp_uffd_wp_any(pte) &&
5059 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5060 set_huge_pte_at(mm, address, ptep,
5061 make_pte_marker(PTE_MARKER_UFFD_WP));
5062 else
5063 huge_pte_clear(mm, address, ptep, sz);
5064 spin_unlock(ptl);
5065 continue;
5066 }
5067
5068 page = pte_page(pte);
5069
5070
5071
5072
5073
5074 if (ref_page) {
5075 if (page != ref_page) {
5076 spin_unlock(ptl);
5077 continue;
5078 }
5079
5080
5081
5082
5083
5084 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5085 }
5086
5087 pte = huge_ptep_get_and_clear(mm, address, ptep);
5088 tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
5089 if (huge_pte_dirty(pte))
5090 set_page_dirty(page);
5091
5092 if (huge_pte_uffd_wp(pte) &&
5093 !(zap_flags & ZAP_FLAG_DROP_MARKER))
5094 set_huge_pte_at(mm, address, ptep,
5095 make_pte_marker(PTE_MARKER_UFFD_WP));
5096 hugetlb_count_sub(pages_per_huge_page(h), mm);
5097 page_remove_rmap(page, vma, true);
5098
5099 spin_unlock(ptl);
5100 tlb_remove_page_size(tlb, page, huge_page_size(h));
5101
5102
5103
5104 if (ref_page)
5105 break;
5106 }
5107 mmu_notifier_invalidate_range_end(&range);
5108 tlb_end_vma(tlb, vma);
5109
	/*
	 * If we unshared PMDs, the flush was not recorded in mmu_gather,
	 * and the PMD page might be freed once i_mmap_rwsem is dropped.
	 * Flush the TLB explicitly before returning rather than
	 * deferring it to the caller.
	 */
5123 if (force_flush)
5124 tlb_flush_mmu_tlbonly(tlb);
5125}
5126
5127void __unmap_hugepage_range_final(struct mmu_gather *tlb,
5128 struct vm_area_struct *vma, unsigned long start,
5129 unsigned long end, struct page *ref_page,
5130 zap_flags_t zap_flags)
5131{
5132 __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
5133
	/*
	 * Clear VM_MAYSHARE so that huge_pmd_share()'s shareability test
	 * fails on a VMA being torn down and no page table is grabbed on
	 * its way out.  The VMA is about to be destroyed and i_mmap_rwsem
	 * is held, so clearing the flag here is safe.
	 */
5144 vma->vm_flags &= ~VM_MAYSHARE;
5145}
5146
5147void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
5148 unsigned long end, struct page *ref_page,
5149 zap_flags_t zap_flags)
5150{
5151 struct mmu_gather tlb;
5152
5153 tlb_gather_mmu(&tlb, vma->vm_mm);
5154 __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
5155 tlb_finish_mmu(&tlb);
5156}
5157
/*
 * Called when the original mapper of a MAP_PRIVATE mapping fails to COW
 * the page it owns the reserve for.  Unmap the page from every other
 * process mapping it; children that later fault the same range may be
 * SIGKILLed.
 */
5164static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
5165 struct page *page, unsigned long address)
5166{
5167 struct hstate *h = hstate_vma(vma);
5168 struct vm_area_struct *iter_vma;
5169 struct address_space *mapping;
5170 pgoff_t pgoff;
5171
5172
5173
5174
5175
5176 address = address & huge_page_mask(h);
5177 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5178 vma->vm_pgoff;
5179 mapping = vma->vm_file->f_mapping;
5180
5181
5182
5183
5184
5185
5186 i_mmap_lock_write(mapping);
5187 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
5188
5189 if (iter_vma == vma)
5190 continue;
5191
5192
5193
5194
5195
5196
5197 if (iter_vma->vm_flags & VM_MAYSHARE)
5198 continue;
5199
5200
5201
5202
5203
5204
5205
5206
5207 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
5208 unmap_hugepage_range(iter_vma, address,
5209 address + huge_page_size(h), page, 0);
5210 }
5211 i_mmap_unlock_write(mapping);
5212}
5213
/*
 * hugetlb_wp() handles both COW faults and unsharing of a mapped huge
 * page.  It is entered with hugetlb_fault_mutex_table held and the page
 * table lock (ptl) taken; both may be temporarily dropped while a
 * replacement page is allocated.
 */
5220static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
5221 unsigned long address, pte_t *ptep, unsigned int flags,
5222 struct page *pagecache_page, spinlock_t *ptl)
5223{
5224 const bool unshare = flags & FAULT_FLAG_UNSHARE;
5225 pte_t pte;
5226 struct hstate *h = hstate_vma(vma);
5227 struct page *old_page, *new_page;
5228 int outside_reserve = 0;
5229 vm_fault_t ret = 0;
5230 unsigned long haddr = address & huge_page_mask(h);
5231 struct mmu_notifier_range range;
5232
5233 VM_BUG_ON(unshare && (flags & FOLL_WRITE));
5234 VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
5235
5236 pte = huge_ptep_get(ptep);
5237 old_page = pte_page(pte);
5238
5239 delayacct_wpcopy_start();
5240
5241retry_avoidcopy:
5242
5243
5244
5245
5246 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
5247 if (!PageAnonExclusive(old_page))
5248 page_move_anon_rmap(old_page, vma);
5249 if (likely(!unshare))
5250 set_huge_ptep_writable(vma, haddr, ptep);
5251
5252 delayacct_wpcopy_end();
5253 return 0;
5254 }
5255 VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
5256 old_page);
5257
	/*
	 * If the process that created a MAP_PRIVATE mapping is about to
	 * COW a page it shares, try to satisfy the allocation without
	 * touching the existing reserves.  pagecache_page tells us
	 * whether the reserve for this address was already consumed.
	 */
5267 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
5268 old_page != pagecache_page)
5269 outside_reserve = 1;
5270
5271 get_page(old_page);
5272
5273
5274
5275
5276
5277 spin_unlock(ptl);
5278 new_page = alloc_huge_page(vma, haddr, outside_reserve);
5279
5280 if (IS_ERR(new_page)) {
		/*
		 * If the owner of a MAP_PRIVATE mapping fails to COW
		 * because the pool is exhausted and the page is held by a
		 * child, guarantee the owner's reliability by unmapping
		 * the page from the other processes; the child may be
		 * SIGKILLed if it faults the range again.
		 */
5288 if (outside_reserve) {
5289 struct address_space *mapping = vma->vm_file->f_mapping;
5290 pgoff_t idx;
5291 u32 hash;
5292
5293 put_page(old_page);
5294 BUG_ON(huge_pte_none(pte));
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304 idx = vma_hugecache_offset(h, vma, haddr);
5305 hash = hugetlb_fault_mutex_hash(mapping, idx);
5306 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5307 i_mmap_unlock_read(mapping);
5308
5309 unmap_ref_private(mm, vma, old_page, haddr);
5310
5311 i_mmap_lock_read(mapping);
5312 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5313 spin_lock(ptl);
5314 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5315 if (likely(ptep &&
5316 pte_same(huge_ptep_get(ptep), pte)))
5317 goto retry_avoidcopy;
5318
5319
5320
5321
5322 delayacct_wpcopy_end();
5323 return 0;
5324 }
5325
5326 ret = vmf_error(PTR_ERR(new_page));
5327 goto out_release_old;
5328 }
5329
5330
5331
5332
5333
5334 if (unlikely(anon_vma_prepare(vma))) {
5335 ret = VM_FAULT_OOM;
5336 goto out_release_all;
5337 }
5338
5339 copy_user_huge_page(new_page, old_page, address, vma,
5340 pages_per_huge_page(h));
5341 __SetPageUptodate(new_page);
5342
5343 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
5344 haddr + huge_page_size(h));
5345 mmu_notifier_invalidate_range_start(&range);
5346
5347
5348
5349
5350
5351 spin_lock(ptl);
5352 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5353 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
5354 ClearHPageRestoreReserve(new_page);
5355
5356
5357 huge_ptep_clear_flush(vma, haddr, ptep);
5358 mmu_notifier_invalidate_range(mm, range.start, range.end);
5359 page_remove_rmap(old_page, vma, true);
5360 hugepage_add_new_anon_rmap(new_page, vma, haddr);
5361 set_huge_pte_at(mm, haddr, ptep,
5362 make_huge_pte(vma, new_page, !unshare));
5363 SetHPageMigratable(new_page);
5364
5365 new_page = old_page;
5366 }
5367 spin_unlock(ptl);
5368 mmu_notifier_invalidate_range_end(&range);
5369out_release_all:
5370
5371
5372
5373
5374 if (new_page != old_page)
5375 restore_reserve_on_error(h, vma, haddr, new_page);
5376 put_page(new_page);
5377out_release_old:
5378 put_page(old_page);
5379
5380 spin_lock(ptl);
5381
5382 delayacct_wpcopy_end();
5383 return ret;
5384}
5385
5386
5387static struct page *hugetlbfs_pagecache_page(struct hstate *h,
5388 struct vm_area_struct *vma, unsigned long address)
5389{
5390 struct address_space *mapping;
5391 pgoff_t idx;
5392
5393 mapping = vma->vm_file->f_mapping;
5394 idx = vma_hugecache_offset(h, vma, address);
5395
5396 return find_lock_page(mapping, idx);
5397}
5398
5399/*
5400 * Return whether a pagecache page backs the given address within the
5401 * VMA.  No page lock is taken; the reference is dropped immediately.
5402 */
5403static bool hugetlbfs_pagecache_present(struct hstate *h,
5404 struct vm_area_struct *vma, unsigned long address)
5405{
5406 struct address_space *mapping;
5407 pgoff_t idx;
5408 struct page *page;
5409
5410 mapping = vma->vm_file->f_mapping;
5411 idx = vma_hugecache_offset(h, vma, address);
5412
5413 page = find_get_page(mapping, idx);
5414 if (page)
5415 put_page(page);
5416 return page != NULL;
5417}
5418
5419int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
5420 pgoff_t idx)
5421{
5422 struct inode *inode = mapping->host;
5423 struct hstate *h = hstate_inode(inode);
5424 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
5425
5426 if (err)
5427 return err;
5428 ClearHPageRestoreReserve(page);
5429
5430
5431
5432
5433
5434 set_page_dirty(page);
5435
5436 spin_lock(&inode->i_lock);
5437 inode->i_blocks += blocks_per_huge_page(h);
5438 spin_unlock(&inode->i_lock);
5439 return 0;
5440}
5441
5442static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
5443 struct address_space *mapping,
5444 pgoff_t idx,
5445 unsigned int flags,
5446 unsigned long haddr,
5447 unsigned long addr,
5448 unsigned long reason)
5449{
5450 vm_fault_t ret;
5451 u32 hash;
5452 struct vm_fault vmf = {
5453 .vma = vma,
5454 .address = haddr,
5455 .real_address = addr,
5456 .flags = flags,
5457
5458
5459
5460
5461
5462
5463
5464
5465 };
5466
5467 /*
5468  * hugetlb_fault_mutex and i_mmap_rwsem must be dropped before
5469  * handling userfaults.  Reacquire them after handle_userfault()
5470  * returns so the calling code stays simple.
5471  */
5472 hash = hugetlb_fault_mutex_hash(mapping, idx);
5473 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5474 i_mmap_unlock_read(mapping);
5475 ret = handle_userfault(&vmf, reason);
5476 i_mmap_lock_read(mapping);
5477 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5478
5479 return ret;
5480}
5481
5482static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
5483 struct vm_area_struct *vma,
5484 struct address_space *mapping, pgoff_t idx,
5485 unsigned long address, pte_t *ptep,
5486 pte_t old_pte, unsigned int flags)
5487{
5488 struct hstate *h = hstate_vma(vma);
5489 vm_fault_t ret = VM_FAULT_SIGBUS;
5490 int anon_rmap = 0;
5491 unsigned long size;
5492 struct page *page;
5493 pte_t new_pte;
5494 spinlock_t *ptl;
5495 unsigned long haddr = address & huge_page_mask(h);
5496 bool new_page, new_pagecache_page = false;
5497
5498 /*
5499  * Currently, we are forced to kill the process in the event the
5500  * original mapper has unmapped pages from the child due to a failed
5501  * COW/unsharing.  Warn that such a situation has occurred, as it
5502  * may not be obvious.
5503  */
5504 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
5505 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
5506 current->pid);
5507 return ret;
5508 }
5509
5510 /*
5511  * Truncation cannot race with us here: the caller holds
5512  * i_mmap_rwsem and i_size is only updated with it held, so a
5513  * single check for faults beyond end of file is sufficient.
5514  */
5515 size = i_size_read(mapping->host) >> huge_page_shift(h);
5516 if (idx >= size)
5517 goto out;
5518
5519retry:
5520 new_page = false;
5521 page = find_lock_page(mapping, idx);
5522 if (!page) {
5523
5524 if (userfaultfd_missing(vma)) {
5525 ret = hugetlb_handle_userfault(vma, mapping, idx,
5526 flags, haddr, address,
5527 VM_UFFD_MISSING);
5528 goto out;
5529 }
5530
5531 page = alloc_huge_page(vma, haddr, 0);
5532 if (IS_ERR(page)) {
5533 /*
5534  * Returning error will result in the faulting task being
5535  * sent SIGBUS.  The hugetlb fault mutex prevents two
5536  * tasks from racing to fault in the same page, which
5537  * could result in false unable-to-allocate errors.
5538  * Page migration does not take the fault mutex, but
5539  * does a clear then write of pte's under page table
5540  * lock.  Page fault code could race with migration,
5541  * notice the cleared pte and try to allocate a page
5542  * here.  Before returning error, get ptl and make
5543  * sure there really is no pte entry.
5544  */
5545 ptl = huge_pte_lock(h, mm, ptep);
5546 ret = 0;
5547 if (huge_pte_none(huge_ptep_get(ptep)))
5548 ret = vmf_error(PTR_ERR(page));
5549 spin_unlock(ptl);
5550 goto out;
5551 }
5552 clear_huge_page(page, address, pages_per_huge_page(h));
5553 __SetPageUptodate(page);
5554 new_page = true;
5555
5556 if (vma->vm_flags & VM_MAYSHARE) {
5557 int err = huge_add_to_page_cache(page, mapping, idx);
5558 if (err) {
5559 put_page(page);
5560 if (err == -EEXIST)
5561 goto retry;
5562 goto out;
5563 }
5564 new_pagecache_page = true;
5565 } else {
5566 lock_page(page);
5567 if (unlikely(anon_vma_prepare(vma))) {
5568 ret = VM_FAULT_OOM;
5569 goto backout_unlocked;
5570 }
5571 anon_rmap = 1;
5572 }
5573 } else {
5574
5575
5576
5577
5578
5579 if (unlikely(PageHWPoison(page))) {
5580 ret = VM_FAULT_HWPOISON_LARGE |
5581 VM_FAULT_SET_HINDEX(hstate_index(h));
5582 goto backout_unlocked;
5583 }
5584
5585
5586 if (userfaultfd_minor(vma)) {
5587 unlock_page(page);
5588 put_page(page);
5589 ret = hugetlb_handle_userfault(vma, mapping, idx,
5590 flags, haddr, address,
5591 VM_UFFD_MINOR);
5592 goto out;
5593 }
5594 }
5595
5596 /*
5597  * If we are going to COW a private mapping later, we examine the
5598  * pending reservations for this page now.  This ensures that any
5599  * allocations necessary to record that reservation occur outside
5600  * the spinlock.
5601  */
5602 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5603 if (vma_needs_reservation(h, vma, haddr) < 0) {
5604 ret = VM_FAULT_OOM;
5605 goto backout_unlocked;
5606 }
5607
5608 vma_end_reservation(h, vma, haddr);
5609 }
5610
5611 ptl = huge_pte_lock(h, mm, ptep);
5612 ret = 0;
5613
5614 if (!pte_same(huge_ptep_get(ptep), old_pte))
5615 goto backout;
5616
5617 if (anon_rmap) {
5618 ClearHPageRestoreReserve(page);
5619 hugepage_add_new_anon_rmap(page, vma, haddr);
5620 } else
5621 page_dup_file_rmap(page, true);
5622 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
5623 && (vma->vm_flags & VM_SHARED)));
5624
5625
5626
5627
5628 if (unlikely(pte_marker_uffd_wp(old_pte)))
5629 new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
5630 set_huge_pte_at(mm, haddr, ptep, new_pte);
5631
5632 hugetlb_count_add(pages_per_huge_page(h), mm);
5633 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5634
5635 ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
5636 }
5637
5638 spin_unlock(ptl);
5639
5640 /*
5641  * Only set HPageMigratable in newly allocated pages.  Existing
5642  * pagecache pages may not have HPageMigratable set if they have
5643  * been isolated for migration.
5644  */
5645 if (new_page)
5646 SetHPageMigratable(page);
5647
5648 unlock_page(page);
5649out:
5650 return ret;
5651
5652backout:
5653 spin_unlock(ptl);
5654backout_unlocked:
5655 unlock_page(page);
5656
5657 if (new_page && !new_pagecache_page)
5658 restore_reserve_on_error(h, vma, haddr, page);
5659 put_page(page);
5660 goto out;
5661}
5662
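/*
 * Hash (mapping, index) to pick one of the hugetlb fault mutexes.  The
 * table serializes faults on the same page so concurrent faulting tasks
 * do not race while instantiating it.
 */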
5663#ifdef CONFIG_SMP
5664u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5665{
5666 unsigned long key[2];
5667 u32 hash;
5668
5669 key[0] = (unsigned long) mapping;
5670 key[1] = idx;
5671
5672 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
5673
5674 return hash & (num_fault_mutexes - 1);
5675}
5676#else
5677/*
5678 * For uniprocessor (!SMP) builds there is only a single fault mutex,
5679 * so every (mapping, index) pair simply hashes to slot 0.
5680 */
5681u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5682{
5683 return 0;
5684}
5685#endif
5686
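/*
 * Top-level hugetlb page fault handler.  Resolves (or allocates) the
 * huge pte for the faulting address, dispatches missing pages to
 * hugetlb_no_page() and write/unshare faults to hugetlb_wp(), holding
 * the per-page fault mutex and i_mmap_rwsem (read) throughout.
 */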
5687vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
5688 unsigned long address, unsigned int flags)
5689{
5690 pte_t *ptep, entry;
5691 spinlock_t *ptl;
5692 vm_fault_t ret;
5693 u32 hash;
5694 pgoff_t idx;
5695 struct page *page = NULL;
5696 struct page *pagecache_page = NULL;
5697 struct hstate *h = hstate_vma(vma);
5698 struct address_space *mapping;
5699 int need_wait_lock = 0;
5700 unsigned long haddr = address & huge_page_mask(h);
5701
5702 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5703 if (ptep) {
5704 /*
5705  * Since we hold no locks here, ptep could be stale.  That is
5706  * OK as we are only making decisions based on content and
5707  * not actually modifying content here.
5708  */
5709 entry = huge_ptep_get(ptep);
5710 if (unlikely(is_hugetlb_entry_migration(entry))) {
5711 migration_entry_wait_huge(vma, mm, ptep);
5712 return 0;
5713 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
5714 return VM_FAULT_HWPOISON_LARGE |
5715 VM_FAULT_SET_HINDEX(hstate_index(h));
5716 }
5717
5718 /*
5719  * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
5720  * it until finished with ptep.  This prevents huge_pmd_unshare
5721  * from being called elsewhere and making the ptep no longer
5722  * valid.
5723  *
5724  * ptep could have already been assigned via huge_pte_offset
5725  * above.  That is OK, as huge_pte_alloc will return the same
5726  * value unless something has changed; the fault paths below
5727  * re-validate the pte under the fault mutex and page table lock.
5728  */
5729 mapping = vma->vm_file->f_mapping;
5730 i_mmap_lock_read(mapping);
5731 ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
5732 if (!ptep) {
5733 i_mmap_unlock_read(mapping);
5734 return VM_FAULT_OOM;
5735 }
5736
5737 /*
5738  * Serialize hugepage allocation and instantiation, so that we
5739  * don't get spurious allocation failures if two CPUs race to
5740  * instantiate the same page in the page cache.
5741  */
5742 idx = vma_hugecache_offset(h, vma, haddr);
5743 hash = hugetlb_fault_mutex_hash(mapping, idx);
5744 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5745
5746 entry = huge_ptep_get(ptep);
5747
5748 if (huge_pte_none_mostly(entry)) {
5749 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
5750 entry, flags);
5751 goto out_mutex;
5752 }
5753
5754 ret = 0;
5755
5756 /*
5757  * entry could be a migration/hwpoison entry at this point, so this
5758  * check prevents the kernel from going below assuming that we have
5759  * an active hugepage in the pagecache.  This goto expects the 2nd
5760  * page fault, and the is_hugetlb_entry_(migration|hwpoisoned)
5761  * checks will properly handle it.
5762  */
5763 if (!pte_present(entry))
5764 goto out_mutex;
5765
5766 /*
5767  * If we are going to COW/unshare the mapping later, we examine the
5768  * pending reservations for this page now.  This will ensure that
5769  * any allocations necessary to record that reservation occur
5770  * outside the spinlock.  For private mappings, we also look up the
5771  * pagecache page now, as it is used to determine if a reservation
5772  * has been consumed.
5773  */
5774 if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
5775 !huge_pte_write(entry)) {
5776 if (vma_needs_reservation(h, vma, haddr) < 0) {
5777 ret = VM_FAULT_OOM;
5778 goto out_mutex;
5779 }
5780
5781 vma_end_reservation(h, vma, haddr);
5782
5783 if (!(vma->vm_flags & VM_MAYSHARE))
5784 pagecache_page = hugetlbfs_pagecache_page(h,
5785 vma, haddr);
5786 }
5787
5788 ptl = huge_pte_lock(h, mm, ptep);
5789
5790
5791 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
5792 goto out_ptl;
5793
5794
5795 if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
5796 (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
5797 struct vm_fault vmf = {
5798 .vma = vma,
5799 .address = haddr,
5800 .real_address = address,
5801 .flags = flags,
5802 };
5803
5804 spin_unlock(ptl);
5805 if (pagecache_page) {
5806 unlock_page(pagecache_page);
5807 put_page(pagecache_page);
5808 }
5809 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5810 i_mmap_unlock_read(mapping);
5811 return handle_userfault(&vmf, VM_UFFD_WP);
5812 }
5813
5814 /*
5815  * hugetlb_wp() requires page locks of pte_page(entry) and
5816  * pagecache_page, so here we need to take the former one
5817  * when page != pagecache_page or !pagecache_page.
5818  */
5819 page = pte_page(entry);
5820 if (page != pagecache_page)
5821 if (!trylock_page(page)) {
5822 need_wait_lock = 1;
5823 goto out_ptl;
5824 }
5825
5826 get_page(page);
5827
5828 if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
5829 if (!huge_pte_write(entry)) {
5830 ret = hugetlb_wp(mm, vma, address, ptep, flags,
5831 pagecache_page, ptl);
5832 goto out_put_page;
5833 } else if (likely(flags & FAULT_FLAG_WRITE)) {
5834 entry = huge_pte_mkdirty(entry);
5835 }
5836 }
5837 entry = pte_mkyoung(entry);
5838 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
5839 flags & FAULT_FLAG_WRITE))
5840 update_mmu_cache(vma, haddr, ptep);
5841out_put_page:
5842 if (page != pagecache_page)
5843 unlock_page(page);
5844 put_page(page);
5845out_ptl:
5846 spin_unlock(ptl);
5847
5848 if (pagecache_page) {
5849 unlock_page(pagecache_page);
5850 put_page(pagecache_page);
5851 }
5852out_mutex:
5853 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5854 i_mmap_unlock_read(mapping);
5855
5856 /*
5857  * It is generally safe to hold a page refcount while waiting for
5858  * its lock, but here we only wait to defer the next fault and
5859  * avoid a busy loop; the page is not touched after it is
5860  * unlocked, so there is no use-after-free in wait_on_page_locked().
5861  */
5862 if (need_wait_lock)
5863 wait_on_page_locked(page);
5864 return ret;
5865}
5866
5867#ifdef CONFIG_USERFAULTFD
5868/*
5869 * Used by the userfaultfd UFFDIO_COPY/UFFDIO_CONTINUE ioctls.  Based on
5870 * mfill_atomic_pte, with modifications for hugetlb pages.
5871 */
5872int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
5873 pte_t *dst_pte,
5874 struct vm_area_struct *dst_vma,
5875 unsigned long dst_addr,
5876 unsigned long src_addr,
5877 enum mcopy_atomic_mode mode,
5878 struct page **pagep,
5879 bool wp_copy)
5880{
5881 bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
5882 struct hstate *h = hstate_vma(dst_vma);
5883 struct address_space *mapping = dst_vma->vm_file->f_mapping;
5884 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
5885 unsigned long size;
5886 int vm_shared = dst_vma->vm_flags & VM_SHARED;
5887 pte_t _dst_pte;
5888 spinlock_t *ptl;
5889 int ret = -ENOMEM;
5890 struct page *page;
5891 int writable;
5892 bool page_in_pagecache = false;
5893
5894 if (is_continue) {
5895 ret = -EFAULT;
5896 page = find_lock_page(mapping, idx);
5897 if (!page)
5898 goto out;
5899 page_in_pagecache = true;
5900 } else if (!*pagep) {
5901
5902
5903
5904 if (vm_shared &&
5905 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5906 ret = -EEXIST;
5907 goto out;
5908 }
5909
5910 page = alloc_huge_page(dst_vma, dst_addr, 0);
5911 if (IS_ERR(page)) {
5912 ret = -ENOMEM;
5913 goto out;
5914 }
5915
5916 ret = copy_huge_page_from_user(page,
5917 (const void __user *) src_addr,
5918 pages_per_huge_page(h), false);
5919
5920
5921 if (unlikely(ret)) {
5922 ret = -ENOENT;
5923
5924
5925
5926 restore_reserve_on_error(h, dst_vma, dst_addr, page);
5927 put_page(page);
5928
5929
5930
5931
5932 page = alloc_huge_page_vma(h, dst_vma, dst_addr);
5933 if (!page) {
5934 ret = -ENOMEM;
5935 goto out;
5936 }
5937 *pagep = page;
5938
5939
5940
5941
5942 goto out;
5943 }
5944 } else {
5945 if (vm_shared &&
5946 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5947 put_page(*pagep);
5948 ret = -EEXIST;
5949 *pagep = NULL;
5950 goto out;
5951 }
5952
5953 page = alloc_huge_page(dst_vma, dst_addr, 0);
5954 if (IS_ERR(page)) {
5955 put_page(*pagep);
5956 ret = -ENOMEM;
5957 *pagep = NULL;
5958 goto out;
5959 }
5960 copy_user_huge_page(page, *pagep, dst_addr, dst_vma,
5961 pages_per_huge_page(h));
5962 put_page(*pagep);
5963 *pagep = NULL;
5964 }
5965
5966 /*
5967  * The memory barrier inside __SetPageUptodate makes sure that
5968  * preceding stores to the page contents become visible before
5969  * the set_huge_pte_at() write.
5970  */
5971 __SetPageUptodate(page);
5972
5973
5974 if (vm_shared && !is_continue) {
5975 size = i_size_read(mapping->host) >> huge_page_shift(h);
5976 ret = -EFAULT;
5977 if (idx >= size)
5978 goto out_release_nounlock;
5979
5980 /*
5981  * Serialization between remove_inode_hugepages() and
5982  * huge_add_to_page_cache() below happens through the
5983  * hugetlb_fault_mutex_table, which must be held by the
5984  * caller.
5985  */
5986 ret = huge_add_to_page_cache(page, mapping, idx);
5987 if (ret)
5988 goto out_release_nounlock;
5989 page_in_pagecache = true;
5990 }
5991
5992 ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
5993 spin_lock(ptl);
5994
5995 /*
5996  * Recheck the i_size after holding the PT lock to make sure not
5997  * to leave any page mapped (as page_mapped()) beyond the end of
5998  * the i_size (remove_inode_hugepages() is strict about enforcing
5999  * that).  If we bail out here, a page may be left in the page
6000  * cache in the vm_shared case beyond the end of i_size, but
6001  * remove_inode_hugepages() will take care of it as soon as we
6002  * drop the hugetlb_fault_mutex_table.
6003  */
6004 size = i_size_read(mapping->host) >> huge_page_shift(h);
6005 ret = -EFAULT;
6006 if (idx >= size)
6007 goto out_release_unlock;
6008
6009 ret = -EEXIST;
6010
6011
6012
6013
6014
6015 if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
6016 goto out_release_unlock;
6017
6018 if (vm_shared) {
6019 page_dup_file_rmap(page, true);
6020 } else {
6021 ClearHPageRestoreReserve(page);
6022 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
6023 }
6024
6025
6026
6027
6028
6029 if (wp_copy || (is_continue && !vm_shared))
6030 writable = 0;
6031 else
6032 writable = dst_vma->vm_flags & VM_WRITE;
6033
6034 _dst_pte = make_huge_pte(dst_vma, page, writable);
6035
6036
6037
6038
6039
6040
6041 _dst_pte = huge_pte_mkdirty(_dst_pte);
6042 _dst_pte = pte_mkyoung(_dst_pte);
6043
6044 if (wp_copy)
6045 _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
6046
6047 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
6048
6049 (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
6050 dst_vma->vm_flags & VM_WRITE);
6051 hugetlb_count_add(pages_per_huge_page(h), dst_mm);
6052
6053
6054 update_mmu_cache(dst_vma, dst_addr, dst_pte);
6055
6056 spin_unlock(ptl);
6057 if (!is_continue)
6058 SetHPageMigratable(page);
6059 if (vm_shared || is_continue)
6060 unlock_page(page);
6061 ret = 0;
6062out:
6063 return ret;
6064out_release_unlock:
6065 spin_unlock(ptl);
6066 if (vm_shared || is_continue)
6067 unlock_page(page);
6068out_release_nounlock:
6069 if (!page_in_pagecache)
6070 restore_reserve_on_error(h, dst_vma, dst_addr, page);
6071 put_page(page);
6072 goto out;
6073}
6074#endif
6075
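/*
 * Record each base page (and its vma) of a huge page into the gup
 * output arrays, starting at the given subpage.
 */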
6076static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
6077 int refs, struct page **pages,
6078 struct vm_area_struct **vmas)
6079{
6080 int nr;
6081
6082 for (nr = 0; nr < refs; nr++) {
6083 if (likely(pages))
6084 pages[nr] = mem_map_offset(page, nr);
6085 if (vmas)
6086 vmas[nr] = vma;
6087 }
6088}
6089
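/*
 * Decide whether follow_hugetlb_page() must call hugetlb_fault() for
 * this pte: swap (migration/hwpoison) entries, write access to a
 * read-only pte, or a GUP that must unshare first all require a fault.
 */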
6090static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
6091 bool *unshare)
6092{
6093 pte_t pteval = huge_ptep_get(pte);
6094
6095 *unshare = false;
6096 if (is_swap_pte(pteval))
6097 return true;
6098 if (huge_pte_write(pteval))
6099 return false;
6100 if (flags & FOLL_WRITE)
6101 return true;
6102 if (gup_must_unshare(flags, pte_page(pteval))) {
6103 *unshare = true;
6104 return true;
6105 }
6106 return false;
6107}
6108
6109long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
6110 struct page **pages, struct vm_area_struct **vmas,
6111 unsigned long *position, unsigned long *nr_pages,
6112 long i, unsigned int flags, int *locked)
6113{
6114 unsigned long pfn_offset;
6115 unsigned long vaddr = *position;
6116 unsigned long remainder = *nr_pages;
6117 struct hstate *h = hstate_vma(vma);
6118 int err = -EFAULT, refs;
6119
6120 while (vaddr < vma->vm_end && remainder) {
6121 pte_t *pte;
6122 spinlock_t *ptl = NULL;
6123 bool unshare = false;
6124 int absent;
6125 struct page *page;
6126
6127
6128
6129
6130
6131 if (fatal_signal_pending(current)) {
6132 remainder = 0;
6133 break;
6134 }
6135
6136 /*
6137  * Some archs (sparc64, sh*) have multiple pte_ts to
6138  * each hugepage.  We have to make sure we get the
6139  * first, for the page indexing below to work.
6140  *
6141  * Note that the page table lock is not held when pte is null.
6142  */
6143 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
6144 huge_page_size(h));
6145 if (pte)
6146 ptl = huge_pte_lock(h, mm, pte);
6147 absent = !pte || huge_pte_none(huge_ptep_get(pte));
6148
6149
6150
6151
6152
6153
6154
6155
6156 if (absent && (flags & FOLL_DUMP) &&
6157 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
6158 if (pte)
6159 spin_unlock(ptl);
6160 remainder = 0;
6161 break;
6162 }
6163
6164 /*
6165  * hugetlb_fault() must be called not only when the pte is
6166  * absent, but also when it holds a migration or hwpoison
6167  * (swap) entry, or when a present pte still needs a write or
6168  * unshare fault first (see __follow_hugetlb_must_fault()).
6169  *
6170  * Drop the page table lock before faulting; the fault may
6171  * sleep and may drop mmap_lock when FAULT_FLAG_ALLOW_RETRY is
6172  * set, in which case we return the progress made so far.
6173  */
6174 if (absent ||
6175 __follow_hugetlb_must_fault(flags, pte, &unshare)) {
6176 vm_fault_t ret;
6177 unsigned int fault_flags = 0;
6178
6179 if (pte)
6180 spin_unlock(ptl);
6181 if (flags & FOLL_WRITE)
6182 fault_flags |= FAULT_FLAG_WRITE;
6183 else if (unshare)
6184 fault_flags |= FAULT_FLAG_UNSHARE;
6185 if (locked)
6186 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6187 FAULT_FLAG_KILLABLE;
6188 if (flags & FOLL_NOWAIT)
6189 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6190 FAULT_FLAG_RETRY_NOWAIT;
6191 if (flags & FOLL_TRIED) {
6192
6193
6194
6195
6196 fault_flags |= FAULT_FLAG_TRIED;
6197 }
6198 ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
6199 if (ret & VM_FAULT_ERROR) {
6200 err = vm_fault_to_errno(ret, flags);
6201 remainder = 0;
6202 break;
6203 }
6204 if (ret & VM_FAULT_RETRY) {
6205 if (locked &&
6206 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
6207 *locked = 0;
6208 *nr_pages = 0;
6209 /*
6210  * VM_FAULT_RETRY must not be reported as an error to
6211  * our caller; returning the number of pages pinned so
6212  * far (possibly zero) is the expected behaviour.
6213  *
6214  * There is no need to update "position" here, as the
6215  * caller will not look at it once *nr_pages has been
6216  * set to 0.
6217  */
6218 return i;
6219 }
6220 continue;
6221 }
6222
6223 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
6224 page = pte_page(huge_ptep_get(pte));
6225
6226 VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
6227 !PageAnonExclusive(page), page);
6228
6229
6230
6231
6232
6233 if (!pages && !vmas && !pfn_offset &&
6234 (vaddr + huge_page_size(h) < vma->vm_end) &&
6235 (remainder >= pages_per_huge_page(h))) {
6236 vaddr += huge_page_size(h);
6237 remainder -= pages_per_huge_page(h);
6238 i += pages_per_huge_page(h);
6239 spin_unlock(ptl);
6240 continue;
6241 }
6242
6243
6244 refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
6245 (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
6246
6247 if (pages || vmas)
6248 record_subpages_vmas(mem_map_offset(page, pfn_offset),
6249 vma, refs,
6250 likely(pages) ? pages + i : NULL,
6251 vmas ? vmas + i : NULL);
6252
6253 if (pages) {
6254 /*
6255  * try_grab_folio() should always succeed here, because:
6256  * a) we hold the ptl lock, and b) we've just checked
6257  * that the huge page is present in the page tables.  If
6258  * the huge page is present, then the tail pages must
6259  * also be present.  The ptl prevents the head page and
6260  * tail pages from being rearranged in any way.  So this
6261  * page must be available at this point, unless the page
6262  * refcount overflowed:
6263  */
6264 if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
6265 flags))) {
6266 spin_unlock(ptl);
6267 remainder = 0;
6268 err = -ENOMEM;
6269 break;
6270 }
6271 }
6272
6273 vaddr += (refs << PAGE_SHIFT);
6274 remainder -= refs;
6275 i += refs;
6276
6277 spin_unlock(ptl);
6278 }
6279 *nr_pages = remainder;
6280
6281
6282
6283
6284
6285 *position = vaddr;
6286
6287 return i ? i : err;
6288}
6289
6290unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
6291 unsigned long address, unsigned long end,
6292 pgprot_t newprot, unsigned long cp_flags)
6293{
6294 struct mm_struct *mm = vma->vm_mm;
6295 unsigned long start = address;
6296 pte_t *ptep;
6297 pte_t pte;
6298 struct hstate *h = hstate_vma(vma);
6299 unsigned long pages = 0, psize = huge_page_size(h);
6300 bool shared_pmd = false;
6301 struct mmu_notifier_range range;
6302 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
6303 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
6304
6305 /*
6306  * In the case of shared PMDs, the area to flush could be beyond
6307  * start/end.  Set range.start/range.end to cover the maximum
6308  * possible range if PMD sharing is possible.
6309  */
6310 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
6311 0, vma, mm, start, end);
6312 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
6313
6314 BUG_ON(address >= end);
6315 flush_cache_range(vma, range.start, range.end);
6316
6317 mmu_notifier_invalidate_range_start(&range);
6318 i_mmap_lock_write(vma->vm_file->f_mapping);
6319 for (; address < end; address += psize) {
6320 spinlock_t *ptl;
6321 ptep = huge_pte_offset(mm, address, psize);
6322 if (!ptep)
6323 continue;
6324 ptl = huge_pte_lock(h, mm, ptep);
6325 if (huge_pmd_unshare(mm, vma, &address, ptep)) {
6326
6327
6328
6329
6330
6331 WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
6332 pages++;
6333 spin_unlock(ptl);
6334 shared_pmd = true;
6335 continue;
6336 }
6337 pte = huge_ptep_get(ptep);
6338 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
6339 spin_unlock(ptl);
6340 continue;
6341 }
6342 if (unlikely(is_hugetlb_entry_migration(pte))) {
6343 swp_entry_t entry = pte_to_swp_entry(pte);
6344 struct page *page = pfn_swap_entry_to_page(entry);
6345
6346 if (!is_readable_migration_entry(entry)) {
6347 pte_t newpte;
6348
6349 if (PageAnon(page))
6350 entry = make_readable_exclusive_migration_entry(
6351 swp_offset(entry));
6352 else
6353 entry = make_readable_migration_entry(
6354 swp_offset(entry));
6355 newpte = swp_entry_to_pte(entry);
6356 if (uffd_wp)
6357 newpte = pte_swp_mkuffd_wp(newpte);
6358 else if (uffd_wp_resolve)
6359 newpte = pte_swp_clear_uffd_wp(newpte);
6360 set_huge_swap_pte_at(mm, address, ptep,
6361 newpte, psize);
6362 pages++;
6363 }
6364 spin_unlock(ptl);
6365 continue;
6366 }
6367 if (unlikely(pte_marker_uffd_wp(pte))) {
6368
6369
6370
6371
6372 if (uffd_wp_resolve)
6373 huge_pte_clear(mm, address, ptep, psize);
6374 }
6375 if (!huge_pte_none(pte)) {
6376 pte_t old_pte;
6377 unsigned int shift = huge_page_shift(hstate_vma(vma));
6378
6379 old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
6380 pte = huge_pte_modify(old_pte, newprot);
6381 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
6382 if (uffd_wp)
6383 pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
6384 else if (uffd_wp_resolve)
6385 pte = huge_pte_clear_uffd_wp(pte);
6386 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
6387 pages++;
6388 } else {
6389
6390 if (unlikely(uffd_wp))
6391
6392 set_huge_pte_at(mm, address, ptep,
6393 make_pte_marker(PTE_MARKER_UFFD_WP));
6394 }
6395 spin_unlock(ptl);
6396 }
6397
6398 /*
6399  * Must flush TLB before releasing i_mmap_rwsem: x86's
6400  * huge_pmd_unshare may have cleared our pud entry and done
6401  * put_page on the page table; once i_mmap_rwsem is released the
6402  * page table could be freed, reused and filled with junk.
6403  */
6404 if (shared_pmd)
6405 flush_hugetlb_tlb_range(vma, range.start, range.end);
6406 else
6407 flush_hugetlb_tlb_range(vma, start, end);
6408
6409
6410
6411
6412
6413
6414 i_mmap_unlock_write(vma->vm_file->f_mapping);
6415 mmu_notifier_invalidate_range_end(&range);
6416
6417 return pages << h->order;
6418}
6419
6420
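/*
 * Reserve huge pages for the range [from, to) of the hugetlbfs inode,
 * charging the reservation cgroup, the subpool and the global reserve.
 * Returns true on success, false on failure.
 */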
6421bool hugetlb_reserve_pages(struct inode *inode,
6422 long from, long to,
6423 struct vm_area_struct *vma,
6424 vm_flags_t vm_flags)
6425{
6426 long chg, add = -1;
6427 struct hstate *h = hstate_inode(inode);
6428 struct hugepage_subpool *spool = subpool_inode(inode);
6429 struct resv_map *resv_map;
6430 struct hugetlb_cgroup *h_cg = NULL;
6431 long gbl_reserve, regions_needed = 0;
6432
6433
6434 if (from > to) {
6435 VM_WARN(1, "%s called with a negative range\n", __func__);
6436 return false;
6437 }
6438
6439
6440
6441
6442
6443
6444 if (vm_flags & VM_NORESERVE)
6445 return true;
6446
6447
6448 /*
6449  * Shared mappings base their reservation on the pages already
6450  * allocated on behalf of the file.  Private mappings reserve the
6451  * full range even if read-only; assume !vma is a shm mapping.
6452  */
6453 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6454 /*
6455  * resv_map cannot be NULL, as hugetlb_reserve_pages is only
6456  * called for inodes for which resv_maps were created (see
6457  * hugetlbfs_get_inode).
6458  */
6459 resv_map = inode_resv_map(inode);
6460
6461 chg = region_chg(resv_map, from, to, &regions_needed);
6462
6463 } else {
6464
6465 resv_map = resv_map_alloc();
6466 if (!resv_map)
6467 return false;
6468
6469 chg = to - from;
6470
6471 set_vma_resv_map(vma, resv_map);
6472 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
6473 }
6474
6475 if (chg < 0)
6476 goto out_err;
6477
6478 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
6479 chg * pages_per_huge_page(h), &h_cg) < 0)
6480 goto out_err;
6481
6482 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
6483
6484
6485
6486 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
6487 }
6488
6489
6490
6491
6492
6493
6494 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
6495 if (gbl_reserve < 0)
6496 goto out_uncharge_cgroup;
6497
6498
6499
6500
6501
6502 if (hugetlb_acct_memory(h, gbl_reserve) < 0)
6503 goto out_put_pages;
6504
6505 /*
6506  * Account for the reservations made.  Shared mappings record regions
6507  * that have reservations as they are shared by multiple VMAs; when
6508  * the last VMA disappears, the region map says how much the
6509  * reservation was and the page cache tells how much of it was
6510  * consumed.  Private mappings are per-VMA and only the consumed
6511  * reservations are tracked: when the VMA disappears, the original
6512  * reservation is the VMA size and the consumed reservations are
6513  * stored in the map.  Hence, nothing else has to be done for
6514  * private mappings here.
6515  */
6516 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6517 add = region_add(resv_map, from, to, regions_needed, h, h_cg);
6518
6519 if (unlikely(add < 0)) {
6520 hugetlb_acct_memory(h, -gbl_reserve);
6521 goto out_put_pages;
6522 } else if (unlikely(chg > add)) {
6523 /*
6524  * Pages in this range were added to the reserve map
6525  * between region_chg and region_add.  This indicates a
6526  * race with alloc_huge_page.  Adjust for the difference
6527  * so that the cgroup, subpool and global reserve counts
6528  * stay consistent.
6529  */
6530 long rsv_adjust;
6531
6532
6533
6534
6535
6536 hugetlb_cgroup_uncharge_cgroup_rsvd(
6537 hstate_index(h),
6538 (chg - add) * pages_per_huge_page(h), h_cg);
6539
6540 rsv_adjust = hugepage_subpool_put_pages(spool,
6541 chg - add);
6542 hugetlb_acct_memory(h, -rsv_adjust);
6543 } else if (h_cg) {
6544 /*
6545  * The file_regions will hold their own reference to
6546  * h_cg->css, so release the reference taken via
6547  * hugetlb_cgroup_charge_cgroup_rsvd() now that we are
6548  * done with it.
6549  */
6550 hugetlb_cgroup_put_rsvd_cgroup(h_cg);
6551 }
6552 }
6553 return true;
6554
6555out_put_pages:
6556
6557 (void)hugepage_subpool_put_pages(spool, chg);
6558out_uncharge_cgroup:
6559 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
6560 chg * pages_per_huge_page(h), h_cg);
6561out_err:
6562 if (!vma || vma->vm_flags & VM_MAYSHARE)
6563
6564
6565
6566 if (chg >= 0 && add < 0)
6567 region_abort(resv_map, from, to, regions_needed);
6568 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
6569 kref_put(&resv_map->refs, resv_map_release);
6570 return false;
6571}
6572
6573long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
6574 long freed)
6575{
6576 struct hstate *h = hstate_inode(inode);
6577 struct resv_map *resv_map = inode_resv_map(inode);
6578 long chg = 0;
6579 struct hugepage_subpool *spool = subpool_inode(inode);
6580 long gbl_reserve;
6581
6582 /*
6583  * Since this routine can be called in the evict inode path for
6584  * all hugetlbfs inodes, resv_map could be NULL.
6585  */
6586 if (resv_map) {
6587 chg = region_del(resv_map, start, end);
6588
6589
6590
6591
6592
6593 if (chg < 0)
6594 return chg;
6595 }
6596
6597 spin_lock(&inode->i_lock);
6598 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
6599 spin_unlock(&inode->i_lock);
6600
6601
6602 /*
6603  * If the subpool has a minimum size, the number of global
6604  * reservations to be released may be adjusted.  Note that
6605  * !resv_map implies freed == 0, so (chg - freed) will not go
6606  * negative.
6607  */
6608 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
6609 hugetlb_acct_memory(h, -gbl_reserve);
6610
6611 return 0;
6612}
6613
6614#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
6615static unsigned long page_table_shareable(struct vm_area_struct *svma,
6616 struct vm_area_struct *vma,
6617 unsigned long addr, pgoff_t idx)
6618{
6619 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
6620 svma->vm_start;
6621 unsigned long sbase = saddr & PUD_MASK;
6622 unsigned long s_end = sbase + PUD_SIZE;
6623
6624
6625 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
6626 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
6627
6628
6629
6630
6631
6632 if (pmd_index(addr) != pmd_index(saddr) ||
6633 vm_flags != svm_flags ||
6634 !range_in_vma(svma, sbase, s_end))
6635 return 0;
6636
6637 return saddr;
6638}
6639
6640static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
6641{
6642 unsigned long base = addr & PUD_MASK;
6643 unsigned long end = base + PUD_SIZE;
6644
6645
6646
6647
6648 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
6649 return true;
6650 return false;
6651}
6652
6653bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6654{
6655#ifdef CONFIG_USERFAULTFD
6656 if (uffd_disable_huge_pmd_share(vma))
6657 return false;
6658#endif
6659 return vma_shareable(vma, addr);
6660}
6661
6662/*
6663 * Determine if the start,end range within vma could be mapped by a
6664 * shared pmd.  If yes, adjust start and end to cover the range
6665 * associated with possible shared pmd mappings.
6666 */
6667void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6668 unsigned long *start, unsigned long *end)
6669{
6670 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
6671 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6672
6673
6674
6675
6676
6677 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
6678 (*end <= v_start) || (*start >= v_end))
6679 return;
6680
6681
6682 if (*start > v_start)
6683 *start = ALIGN_DOWN(*start, PUD_SIZE);
6684
6685 if (*end < v_end)
6686 *end = ALIGN(*end, PUD_SIZE);
6687}
6688
6689
6690/*
6691 * Search for a shareable pmd page for hugetlb.  In any case calls
6692 * pmd_alloc() and returns the corresponding pte.  While this is not
6693 * necessary for the !shared pmd case because the pmd can be allocated
6694 * later as well, it makes the code much cleaner.
6695 *
6696 * This routine must be called with i_mmap_rwsem held in at least read
6697 * mode if sharing is possible.  That prevents removal of page table
6698 * entries in the address space while sharing is set up from them.
6699 */
6700pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6701 unsigned long addr, pud_t *pud)
6702{
6703 struct address_space *mapping = vma->vm_file->f_mapping;
6704 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
6705 vma->vm_pgoff;
6706 struct vm_area_struct *svma;
6707 unsigned long saddr;
6708 pte_t *spte = NULL;
6709 pte_t *pte;
6710 spinlock_t *ptl;
6711
6712 i_mmap_assert_locked(mapping);
6713 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
6714 if (svma == vma)
6715 continue;
6716
6717 saddr = page_table_shareable(svma, vma, addr, idx);
6718 if (saddr) {
6719 spte = huge_pte_offset(svma->vm_mm, saddr,
6720 vma_mmu_pagesize(svma));
6721 if (spte) {
6722 get_page(virt_to_page(spte));
6723 break;
6724 }
6725 }
6726 }
6727
6728 if (!spte)
6729 goto out;
6730
6731 ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
6732 if (pud_none(*pud)) {
6733 pud_populate(mm, pud,
6734 (pmd_t *)((unsigned long)spte & PAGE_MASK));
6735 mm_inc_nr_pmds(mm);
6736 } else {
6737 put_page(virt_to_page(spte));
6738 }
6739 spin_unlock(ptl);
6740out:
6741 pte = (pte_t *)pmd_alloc(mm, pud, addr);
6742 return pte;
6743}
6744
6745
6746/*
6747 * Unmap a huge page backed by a shared pte.
6748 *
6749 * The hugetlb pte page is ref counted at the time of mapping.  If the
6750 * pte is shared (page_count > 1), unmap is achieved by clearing the
6751 * pud and decrementing the ref count.  If count == 1, the pte page is
6752 * not shared and nothing is done.
6753 *
6754 * Called with page table lock held and i_mmap_rwsem held in write mode.
6755 * Returns 1 if a shared pte page was unmapped, 0 otherwise.
6756 */
6757int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6758 unsigned long *addr, pte_t *ptep)
6759{
6760 pgd_t *pgd = pgd_offset(mm, *addr);
6761 p4d_t *p4d = p4d_offset(pgd, *addr);
6762 pud_t *pud = pud_offset(p4d, *addr);
6763
6764 i_mmap_assert_write_locked(vma->vm_file->f_mapping);
6765 BUG_ON(page_count(virt_to_page(ptep)) == 0);
6766 if (page_count(virt_to_page(ptep)) == 1)
6767 return 0;
6768
6769 pud_clear(pud);
6770 put_page(virt_to_page(ptep));
6771 mm_dec_nr_pmds(mm);
6772
6773 /*
6774  * This update of the passed address optimizes loops that walk
6775  * addresses in PMD_SIZE increments: clearing the pud unmapped a
6776  * whole PUD_SIZE area, so advance *addr to the last PMD of that
6777  * area and let the caller's loop increment move past it.
6778  */
6779 *addr |= PUD_SIZE - PMD_SIZE;
6780 return 1;
6781}
6782
6783#else
6784pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6785 unsigned long addr, pud_t *pud)
6786{
6787 return NULL;
6788}
6789
6790int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6791 unsigned long *addr, pte_t *ptep)
6792{
6793 return 0;
6794}
6795
6796void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6797 unsigned long *start, unsigned long *end)
6798{
6799}
6800
6801bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6802{
6803 return false;
6804}
6805#endif
6806
6807#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
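/*
 * Allocate (or find) the page table entry used to map a huge page of
 * size sz at addr: the PUD itself for PUD_SIZE pages, otherwise a PMD,
 * shared with other mappings of the same file range when possible.
 */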
6808pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
6809 unsigned long addr, unsigned long sz)
6810{
6811 pgd_t *pgd;
6812 p4d_t *p4d;
6813 pud_t *pud;
6814 pte_t *pte = NULL;
6815
6816 pgd = pgd_offset(mm, addr);
6817 p4d = p4d_alloc(mm, pgd, addr);
6818 if (!p4d)
6819 return NULL;
6820 pud = pud_alloc(mm, p4d, addr);
6821 if (pud) {
6822 if (sz == PUD_SIZE) {
6823 pte = (pte_t *)pud;
6824 } else {
6825 BUG_ON(sz != PMD_SIZE);
6826 if (want_pmd_share(vma, addr) && pud_none(*pud))
6827 pte = huge_pmd_share(mm, vma, addr, pud);
6828 else
6829 pte = (pte_t *)pmd_alloc(mm, pud, addr);
6830 }
6831 }
6832 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
6833
6834 return pte;
6835}
6836
6837
6838/*
6839 * huge_pte_offset() - Walk the page table to resolve the hugepage
6840 * entry at address @addr.
6841 *
6842 * Return: Pointer to the page table entry (PUD or PMD) for @addr,
6843 * or NULL if a !p*d_present() entry is found and the size @sz does
6844 * not match the hugepage size at that level of the page table.
6845 */
6846pte_t *huge_pte_offset(struct mm_struct *mm,
6847 unsigned long addr, unsigned long sz)
6848{
6849 pgd_t *pgd;
6850 p4d_t *p4d;
6851 pud_t *pud;
6852 pmd_t *pmd;
6853
6854 pgd = pgd_offset(mm, addr);
6855 if (!pgd_present(*pgd))
6856 return NULL;
6857 p4d = p4d_offset(pgd, addr);
6858 if (!p4d_present(*p4d))
6859 return NULL;
6860
6861 pud = pud_offset(p4d, addr);
6862 if (sz == PUD_SIZE)
6863
6864 return (pte_t *)pud;
6865 if (!pud_present(*pud))
6866 return NULL;
6867
6868
6869 pmd = pmd_offset(pud, addr);
6870
6871 return (pte_t *)pmd;
6872}
6873
6874#endif
6875
6876/*
6877 * These __weak follow_huge_*() helpers can be overridden by
6878 * architectures that need their own behaviour.
6879 */
6880struct page * __weak
6881follow_huge_addr(struct mm_struct *mm, unsigned long address,
6882 int write)
6883{
6884 return ERR_PTR(-EINVAL);
6885}
6886
6887struct page * __weak
6888follow_huge_pd(struct vm_area_struct *vma,
6889 unsigned long address, hugepd_t hpd, int flags, int pdshift)
6890{
6891 WARN(1, "hugepd follow called with no support for hugepage directory format\n");
6892 return NULL;
6893}
6894
6895struct page * __weak
6896follow_huge_pmd(struct mm_struct *mm, unsigned long address,
6897 pmd_t *pmd, int flags)
6898{
6899 struct page *page = NULL;
6900 spinlock_t *ptl;
6901 pte_t pte;
6902
6903 /*
6904  * FOLL_PIN is not supported for follow_page().  Ordinary GUP
6905  * goes via follow_hugetlb_page(), which can handle FOLL_PIN.
6906  */
6907 if (WARN_ON_ONCE(flags & FOLL_PIN))
6908 return NULL;
6909
6910retry:
6911 ptl = pmd_lockptr(mm, pmd);
6912 spin_lock(ptl);
6913
6914
6915
6916
6917 if (!pmd_huge(*pmd))
6918 goto out;
6919 pte = huge_ptep_get((pte_t *)pmd);
6920 if (pte_present(pte)) {
6921 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
6922 /*
6923  * try_grab_page() should always succeed here, because: a) we
6924  * hold the pmd (ptl) lock, and b) we've just checked that the
6925  * huge pmd (head) page is present in the page tables.  The ptl
6926  * prevents the head page and tail pages from being rearranged
6927  * in any way.  So this page must be available at this point,
6928  * unless the page refcount overflowed:
6929  */
6930 if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
6931 page = NULL;
6932 goto out;
6933 }
6934 } else {
6935 if (is_hugetlb_entry_migration(pte)) {
6936 spin_unlock(ptl);
6937 __migration_entry_wait(mm, (pte_t *)pmd, ptl);
6938 goto retry;
6939 }
6940
6941
6942
6943
6944 }
6945out:
6946 spin_unlock(ptl);
6947 return page;
6948}
6949
6950struct page * __weak
6951follow_huge_pud(struct mm_struct *mm, unsigned long address,
6952 pud_t *pud, int flags)
6953{
6954 if (flags & (FOLL_GET | FOLL_PIN))
6955 return NULL;
6956
6957 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
6958}
6959
6960struct page * __weak
6961follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
6962{
6963 if (flags & (FOLL_GET | FOLL_PIN))
6964 return NULL;
6965
6966 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
6967}
6968
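/*
 * Isolate an in-use huge page for migration: take a reference, clear
 * HPageMigratable and move it to the caller's list.  Returns false if
 * the page is not a migratable huge page.
 */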
6969bool isolate_huge_page(struct page *page, struct list_head *list)
6970{
6971 bool ret = true;
6972
6973 spin_lock_irq(&hugetlb_lock);
6974 if (!PageHeadHuge(page) ||
6975 !HPageMigratable(page) ||
6976 !get_page_unless_zero(page)) {
6977 ret = false;
6978 goto unlock;
6979 }
6980 ClearHPageMigratable(page);
6981 list_move_tail(&page->lru, list);
6982unlock:
6983 spin_unlock_irq(&hugetlb_lock);
6984 return ret;
6985}
6986
6987int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
6988{
6989 int ret = 0;
6990
6991 *hugetlb = false;
6992 spin_lock_irq(&hugetlb_lock);
6993 if (PageHeadHuge(page)) {
6994 *hugetlb = true;
6995 if (HPageFreed(page))
6996 ret = 0;
6997 else if (HPageMigratable(page))
6998 ret = get_page_unless_zero(page);
6999 else
7000 ret = -EBUSY;
7001 }
7002 spin_unlock_irq(&hugetlb_lock);
7003 return ret;
7004}
7005
7006int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
7007{
7008 int ret;
7009
7010 spin_lock_irq(&hugetlb_lock);
7011 ret = __get_huge_page_for_hwpoison(pfn, flags);
7012 spin_unlock_irq(&hugetlb_lock);
7013 return ret;
7014}
7015
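/*
 * Undo isolate_huge_page(): mark the page migratable again, put it back
 * on its hstate's active list and drop the isolation reference.
 */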
7016void putback_active_hugepage(struct page *page)
7017{
7018 spin_lock_irq(&hugetlb_lock);
7019 SetHPageMigratable(page);
7020 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
7021 spin_unlock_irq(&hugetlb_lock);
7022 put_page(page);
7023}
7024
7025void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
7026{
7027 struct hstate *h = page_hstate(oldpage);
7028
7029 hugetlb_cgroup_migrate(oldpage, newpage);
7030 set_page_owner_migrate_reason(newpage, reason);
7031
7032 /*
7033  * Transfer the temporary state of the new hugetlb page.  This is
7034  * the reverse of other transitions because the newpage is going
7035  * to be final while the old one will be freed, so it takes over
7036  * the temporary status.
7037  *
7038  * Also transfer the per-node surplus state here, otherwise the
7039  * global surplus count will not match the per-node counts.
7040  */
7041
7042 if (HPageTemporary(newpage)) {
7043 int old_nid = page_to_nid(oldpage);
7044 int new_nid = page_to_nid(newpage);
7045
7046 SetHPageTemporary(oldpage);
7047 ClearHPageTemporary(newpage);
7048
7049
7050
7051
7052
7053 if (new_nid == old_nid)
7054 return;
7055 spin_lock_irq(&hugetlb_lock);
7056 if (h->surplus_huge_pages_node[old_nid]) {
7057 h->surplus_huge_pages_node[old_nid]--;
7058 h->surplus_huge_pages_node[new_nid]++;
7059 }
7060 spin_unlock_irq(&hugetlb_lock);
7061 }
7062}
7063
7064/*
7065 * Unconditionally unshare all shared pmd page table entries within the
7066 * given hugetlbfs VMA's PUD-aligned range.
7067 */
7068void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
7069{
7070 struct hstate *h = hstate_vma(vma);
7071 unsigned long sz = huge_page_size(h);
7072 struct mm_struct *mm = vma->vm_mm;
7073 struct mmu_notifier_range range;
7074 unsigned long address, start, end;
7075 spinlock_t *ptl;
7076 pte_t *ptep;
7077
7078 if (!(vma->vm_flags & VM_MAYSHARE))
7079 return;
7080
7081 start = ALIGN(vma->vm_start, PUD_SIZE);
7082 end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
7083
7084 if (start >= end)
7085 return;
7086
7087 flush_cache_range(vma, start, end);
7088
7089
7090
7091
7092 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
7093 start, end);
7094 mmu_notifier_invalidate_range_start(&range);
7095 i_mmap_lock_write(vma->vm_file->f_mapping);
7096 for (address = start; address < end; address += PUD_SIZE) {
7097 unsigned long tmp = address;
7098
7099 ptep = huge_pte_offset(mm, address, sz);
7100 if (!ptep)
7101 continue;
7102 ptl = huge_pte_lock(h, mm, ptep);
7103
7104 huge_pmd_unshare(mm, vma, &tmp, ptep);
7105 spin_unlock(ptl);
7106 }
7107 flush_hugetlb_tlb_range(vma, start, end);
7108 i_mmap_unlock_write(vma->vm_file->f_mapping);
7109
7110
7111
7112
7113 mmu_notifier_invalidate_range_end(&range);
7114}
7115
7116#ifdef CONFIG_CMA
7117static bool cma_reserve_called __initdata;
7118
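/*
 * Parse the hugetlb_cma= boot parameter: either a single size, or a
 * comma-separated list of <node>:<size> pairs for per-node CMA areas.
 */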
7119static int __init cmdline_parse_hugetlb_cma(char *p)
7120{
7121 int nid, count = 0;
7122 unsigned long tmp;
7123 char *s = p;
7124
7125 while (*s) {
7126 if (sscanf(s, "%lu%n", &tmp, &count) != 1)
7127 break;
7128
7129 if (s[count] == ':') {
7130 if (tmp >= MAX_NUMNODES)
7131 break;
7132 nid = array_index_nospec(tmp, MAX_NUMNODES);
7133
7134 s += count + 1;
7135 tmp = memparse(s, &s);
7136 hugetlb_cma_size_in_node[nid] = tmp;
7137 hugetlb_cma_size += tmp;
7138
7139
7140
7141
7142
7143 if (*s == ',')
7144 s++;
7145 else
7146 break;
7147 } else {
7148 hugetlb_cma_size = memparse(p, &p);
7149 break;
7150 }
7151 }
7152
7153 return 0;
7154}
7155
7156early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
7157
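/*
 * hugetlb_cma_reserve() - reserve CMA areas for gigantic pages on the
 * requested nodes (or spread evenly across online nodes), rounding each
 * area up to the given huge page order.
 */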
7158void __init hugetlb_cma_reserve(int order)
7159{
7160 unsigned long size, reserved, per_node;
7161 bool node_specific_cma_alloc = false;
7162 int nid;
7163
7164 cma_reserve_called = true;
7165
7166 if (!hugetlb_cma_size)
7167 return;
7168
7169 for (nid = 0; nid < MAX_NUMNODES; nid++) {
7170 if (hugetlb_cma_size_in_node[nid] == 0)
7171 continue;
7172
7173 if (!node_online(nid)) {
7174 pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
7175 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7176 hugetlb_cma_size_in_node[nid] = 0;
7177 continue;
7178 }
7179
7180 if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
7181 pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
7182 nid, (PAGE_SIZE << order) / SZ_1M);
7183 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7184 hugetlb_cma_size_in_node[nid] = 0;
7185 } else {
7186 node_specific_cma_alloc = true;
7187 }
7188 }
7189
7190
7191 if (!hugetlb_cma_size)
7192 return;
7193
7194 if (hugetlb_cma_size < (PAGE_SIZE << order)) {
7195 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
7196 (PAGE_SIZE << order) / SZ_1M);
7197 hugetlb_cma_size = 0;
7198 return;
7199 }
7200
7201 if (!node_specific_cma_alloc) {
7202
7203
7204
7205
7206 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
7207 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
7208 hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
7209 }
7210
7211 reserved = 0;
7212 for_each_online_node(nid) {
7213 int res;
7214 char name[CMA_MAX_NAME];
7215
7216 if (node_specific_cma_alloc) {
7217 if (hugetlb_cma_size_in_node[nid] == 0)
7218 continue;
7219
7220 size = hugetlb_cma_size_in_node[nid];
7221 } else {
7222 size = min(per_node, hugetlb_cma_size - reserved);
7223 }
7224
7225 size = round_up(size, PAGE_SIZE << order);
7226
7227 snprintf(name, sizeof(name), "hugetlb%d", nid);
7228
7229
7230
7231
7232
7233 res = cma_declare_contiguous_nid(0, size, 0,
7234 PAGE_SIZE << HUGETLB_PAGE_ORDER,
7235 0, false, name,
7236 &hugetlb_cma[nid], nid);
7237 if (res) {
7238 pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
7239 res, nid);
7240 continue;
7241 }
7242
7243 reserved += size;
7244 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
7245 size / SZ_1M, nid);
7246
7247 if (reserved >= hugetlb_cma_size)
7248 break;
7249 }
7250
7251 if (!reserved)
7252
7253
7254
7255
7256 hugetlb_cma_size = 0;
7257}
7258
7259void __init hugetlb_cma_check(void)
7260{
7261 if (!hugetlb_cma_size || cma_reserve_called)
7262 return;
7263
7264 pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
7265}
7266
7267#endif
7268