// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
static bool hugetlb_cma_page(struct page *page, unsigned int order)
{
	return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
				1 << order);
}
#else
static bool hugetlb_cma_page(struct page *page, unsigned int order)
{
	return false;
}
#endif
static unsigned long hugetlb_cma_size __initdata;

/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when two hugepage faults race on the same page.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
	if (spool->count)
		return false;
	if (spool->max_hpages != -1)
		return spool->used_hpages == 0;
	if (spool->min_hpages != -1)
		return spool->rsv_hpages == spool->min_hpages;

	return true;
}

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
						unsigned long irq_flags)
{
	spin_unlock_irqrestore(&spool->lock, irq_flags);

	/*
	 * If no pages are used and no other handles to the subpool remain,
	 * give up any reservations based on minimum size and free the
	 * subpool.
	 */
	if (subpool_is_free(spool)) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;

	return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	unsigned long flags;

	spin_lock_irqsave(&spool->lock, flags);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool, flags);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
					long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock_irq(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock_irq(&spool->lock);
	return ret;
}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
					long delta)
{
	long ret = delta;
	unsigned long flags;

	if (!spool)
		return delta;

	spin_lock_irqsave(&spool->lock, flags);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free the subpool due to an
	 * outstanding quota reference, free it now.
	 */
	unlock_or_release_subpool(spool, flags);

	return ret;
}

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

/*
 * Helper that removes a struct file_region from the resv_map's cache of
 * preallocated entries and returns it for use.
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
	struct file_region *nrg = NULL;

	VM_BUG_ON(resv->region_cache_count <= 0);

	resv->region_cache_count--;
	nrg = list_first_entry(&resv->region_cache, struct file_region, link);
	list_del(&nrg->link);

	nrg->from = from;
	nrg->to = to;

	return nrg;
}

static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
					      struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	nrg->reservation_counter = rg->reservation_counter;
	nrg->css = rg->css;
	if (rg->css)
		css_get(rg->css);
#endif
}

/* Helper that records hugetlb_cgroup uncharge info. */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
						struct hstate *h,
						struct resv_map *resv,
						struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (h_cg) {
		nrg->reservation_counter =
			&h_cg->rsvd_hugepage[hstate_index(h)];
		nrg->css = &h_cg->css;
		/*
		 * The caller holds exactly one h_cg->css reference for the
		 * whole contiguous reservation region.  But this area might
		 * be scattered when there are already some file_regions
		 * residing in it, so many file_regions may end up sharing
		 * only one css reference.  To ensure each file_region holds
		 * exactly one h_cg->css reference, do a css_get for each
		 * file_region and leave the caller's reference untouched.
		 */
		css_get(&h_cg->css);
		if (!resv->pages_per_hpage)
			resv->pages_per_hpage = pages_per_huge_page(h);
		/*
		 * pages_per_hpage should be consistent across all entries in
		 * a resv_map.
		 */
		VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
	} else {
		nrg->reservation_counter = NULL;
		nrg->css = NULL;
	}
#endif
}

static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
	if (rg->css)
		css_put(rg->css);
#endif
}

static bool has_same_uncharge_info(struct file_region *rg,
				   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
	return rg->reservation_counter == org->reservation_counter &&
	       rg->css == org->css;

#else
	return true;
#endif
}

static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
	struct file_region *nrg = NULL, *prg = NULL;

	prg = list_prev_entry(rg, link);
	if (&prg->link != &resv->regions && prg->to == rg->from &&
	    has_same_uncharge_info(prg, rg)) {
		prg->to = rg->to;

		list_del(&rg->link);
		put_uncharge_info(rg);
		kfree(rg);

		rg = prg;
	}

	nrg = list_next_entry(rg, link);
	if (&nrg->link != &resv->regions && nrg->from == rg->to &&
	    has_same_uncharge_info(nrg, rg)) {
		nrg->from = rg->from;

		list_del(&rg->link);
		put_uncharge_info(rg);
		kfree(rg);
	}
}

static inline long
hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
		     long to, struct hstate *h, struct hugetlb_cgroup *cg,
		     long *regions_needed)
{
	struct file_region *nrg;

	if (!regions_needed) {
		nrg = get_file_region_entry_from_cache(map, from, to);
		record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
		list_add(&nrg->link, rg->link.prev);
		coalesce_file_region(map, nrg);
	} else
		*regions_needed += 1;

	return to - from;
}

/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list.  regions_needed will
 * indicate the number of file_regions needed in the cache to carry out the
 * add for this range.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
				     struct hugetlb_cgroup *h_cg,
				     struct hstate *h, long *regions_needed)
{
	long add = 0;
	struct list_head *head = &resv->regions;
	long last_accounted_offset = f;
	struct file_region *rg = NULL, *trg = NULL;

	if (regions_needed)
		*regions_needed = 0;

	/*
	 * In this loop, we essentially handle an entry for the range
	 * [last_accounted_offset, rg->from), if rg is non-NULL, and for
	 * [last_accounted_offset, t) otherwise.
	 */
	list_for_each_entry_safe(rg, trg, head, link) {
		/* Skip irrelevant regions that start before our range. */
		if (rg->from < f) {
			/*
			 * If this region ends after the last accounted offset,
			 * then we need to update last_accounted_offset.
			 */
			if (rg->to > last_accounted_offset)
				last_accounted_offset = rg->to;
			continue;
		}

		/*
		 * When we find a region that starts beyond our range, we've
		 * finished.
		 */
		if (rg->from >= t)
			break;

		/*
		 * Add an entry for last_accounted_offset -> rg->from, and
		 * update last_accounted_offset.
		 */
		if (rg->from > last_accounted_offset)
			add += hugetlb_resv_map_add(resv, rg,
						    last_accounted_offset,
						    rg->from, h, h_cg,
						    regions_needed);

		last_accounted_offset = rg->to;
	}

	/*
	 * Handle the case where our range extends beyond
	 * last_accounted_offset.
	 */
	if (last_accounted_offset < t)
		add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
					    t, h, h_cg, regions_needed);

	return add;
}

/*
 * Must be called with resv->lock acquired.  Will drop the lock to allocate
 * entries.
 */
static int allocate_file_region_entries(struct resv_map *resv,
					int regions_needed)
	__must_hold(&resv->lock)
{
	struct list_head allocated_regions;
	int to_allocate = 0, i = 0;
	struct file_region *trg = NULL, *rg = NULL;

	VM_BUG_ON(regions_needed < 0);

	INIT_LIST_HEAD(&allocated_regions);

	/*
	 * Check for sufficient descriptors in the cache to accommodate
	 * the number of in progress add operations plus regions_needed.
	 *
	 * This is a while loop because when we drop the lock, some other call
	 * to region_add or region_del may have consumed some region_entries,
	 * so we keep looping here until we finally have enough entries for
	 * (adds_in_progress + regions_needed).
	 */
	while (resv->region_cache_count <
	       (resv->adds_in_progress + regions_needed)) {
		to_allocate = resv->adds_in_progress + regions_needed -
			      resv->region_cache_count;

		/*
		 * At this point, we should have enough entries in the cache
		 * for all the existing adds_in_progress.  We should only be
		 * needing to allocate for regions_needed.
		 */
		VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

		spin_unlock(&resv->lock);
		for (i = 0; i < to_allocate; i++) {
			trg = kmalloc(sizeof(*trg), GFP_KERNEL);
			if (!trg)
				goto out_of_memory;
			list_add(&trg->link, &allocated_regions);
		}

		spin_lock(&resv->lock);

		list_splice(&allocated_regions, &resv->region_cache);
		resv->region_cache_count += to_allocate;
	}

	return 0;

out_of_memory:
	list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
		list_del(&rg->link);
		kfree(rg);
	}
	return -ENOMEM;
}
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
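/*
 * Add the huge page range represented by [f, t) to the reserve map.  Regions
 * are taken from the cache filled by the earlier region_chg() call for the
 * same range; if racing adds/deletes consumed those entries, additional ones
 * are allocated here.  Returns the number of new huge pages added to the map
 * (>= 0), or -ENOMEM if extra file_region entries could not be allocated.
 */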
527static long region_add(struct resv_map *resv, long f, long t,
528 long in_regions_needed, struct hstate *h,
529 struct hugetlb_cgroup *h_cg)
530{
531 long add = 0, actual_regions_needed = 0;
532
533 spin_lock(&resv->lock);
534retry:
535
536
537 add_reservation_in_range(resv, f, t, NULL, NULL,
538 &actual_regions_needed);
539
540
541
542
543
544
545
546
547
548
549 if (actual_regions_needed > in_regions_needed &&
550 resv->region_cache_count <
551 resv->adds_in_progress +
552 (actual_regions_needed - in_regions_needed)) {
553
554
555
556 VM_BUG_ON(t - f <= 1);
557
558 if (allocate_file_region_entries(
559 resv, actual_regions_needed - in_regions_needed)) {
560 return -ENOMEM;
561 }
562
563 goto retry;
564 }
565
566 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
567
568 resv->adds_in_progress -= in_regions_needed;
569
570 spin_unlock(&resv->lock);
571 return add;
572}
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
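/*
 * Examine the existing reserve map and determine how many huge pages in the
 * range [f, t) are NOT currently represented.  This routine only counts the
 * pages; the reserve map is modified later by region_add(), or the operation
 * is abandoned with region_abort().  The file_region entries needed for that
 * later add are pre-allocated here, tracked via adds_in_progress, and their
 * count is returned in *out_regions_needed.  Returns the number of pages that
 * would be added, or -ENOMEM on allocation failure.
 */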
594static long region_chg(struct resv_map *resv, long f, long t,
595 long *out_regions_needed)
596{
597 long chg = 0;
598
599 spin_lock(&resv->lock);
600
601
602 chg = add_reservation_in_range(resv, f, t, NULL, NULL,
603 out_regions_needed);
604
605 if (*out_regions_needed == 0)
606 *out_regions_needed = 1;
607
608 if (allocate_file_region_entries(resv, *out_regions_needed))
609 return -ENOMEM;
610
611 resv->adds_in_progress += *out_regions_needed;
612
613 spin_unlock(&resv->lock);
614 return chg;
615}
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630static void region_abort(struct resv_map *resv, long f, long t,
631 long regions_needed)
632{
633 spin_lock(&resv->lock);
634 VM_BUG_ON(!resv->region_cache_count);
635 resv->adds_in_progress -= regions_needed;
636 spin_unlock(&resv->lock);
637}
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
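/*
 * Delete the specified range [f, t) from the reserve map.  A region that
 * straddles the range is split, which may require a new file_region entry
 * taken from the cache or, failing that, kmalloc'ed with the lock dropped.
 * Returns the number of huge pages removed from the map, or -ENOMEM if a
 * needed entry could not be allocated.
 */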
653static long region_del(struct resv_map *resv, long f, long t)
654{
655 struct list_head *head = &resv->regions;
656 struct file_region *rg, *trg;
657 struct file_region *nrg = NULL;
658 long del = 0;
659
660retry:
661 spin_lock(&resv->lock);
662 list_for_each_entry_safe(rg, trg, head, link) {
663
664
665
666
667
668
669
670 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
671 continue;
672
673 if (rg->from >= t)
674 break;
675
676 if (f > rg->from && t < rg->to) {
677
678
679
680
681 if (!nrg &&
682 resv->region_cache_count > resv->adds_in_progress) {
683 nrg = list_first_entry(&resv->region_cache,
684 struct file_region,
685 link);
686 list_del(&nrg->link);
687 resv->region_cache_count--;
688 }
689
690 if (!nrg) {
691 spin_unlock(&resv->lock);
692 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
693 if (!nrg)
694 return -ENOMEM;
695 goto retry;
696 }
697
698 del += t - f;
699 hugetlb_cgroup_uncharge_file_region(
700 resv, rg, t - f, false);
701
702
703 nrg->from = t;
704 nrg->to = rg->to;
705
706 copy_hugetlb_cgroup_uncharge_info(nrg, rg);
707
708 INIT_LIST_HEAD(&nrg->link);
709
710
711 rg->to = f;
712
713 list_add(&nrg->link, &rg->link);
714 nrg = NULL;
715 break;
716 }
717
718 if (f <= rg->from && t >= rg->to) {
719 del += rg->to - rg->from;
720 hugetlb_cgroup_uncharge_file_region(resv, rg,
721 rg->to - rg->from, true);
722 list_del(&rg->link);
723 kfree(rg);
724 continue;
725 }
726
727 if (f <= rg->from) {
728 hugetlb_cgroup_uncharge_file_region(resv, rg,
729 t - rg->from, false);
730
731 del += t - rg->from;
732 rg->from = t;
733 } else {
734 hugetlb_cgroup_uncharge_file_region(resv, rg,
735 rg->to - f, false);
736
737 del += rg->to - f;
738 rg->to = f;
739 }
740 }
741
742 spin_unlock(&resv->lock);
743 kfree(nrg);
744 return del;
745}
746
747
748
749
750
751
752
753
754
755
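/*
 * Called when an error prevented removal of a reserve map region for a page
 * that has already been freed and removed from the page cache.  Re-adjust
 * the subpool and global reserve counts so they stay consistent, and warn
 * if the reserved huge page count may still go negative.
 */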
756void hugetlb_fix_reserve_counts(struct inode *inode)
757{
758 struct hugepage_subpool *spool = subpool_inode(inode);
759 long rsv_adjust;
760 bool reserved = false;
761
762 rsv_adjust = hugepage_subpool_get_pages(spool, 1);
763 if (rsv_adjust > 0) {
764 struct hstate *h = hstate_inode(inode);
765
766 if (!hugetlb_acct_memory(h, 1))
767 reserved = true;
768 } else if (!rsv_adjust) {
769 reserved = true;
770 }
771
772 if (!reserved)
773 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
774}
775
776
777
778
779
780static long region_count(struct resv_map *resv, long f, long t)
781{
782 struct list_head *head = &resv->regions;
783 struct file_region *rg;
784 long chg = 0;
785
786 spin_lock(&resv->lock);
787
788 list_for_each_entry(rg, head, link) {
789 long seg_from;
790 long seg_to;
791
792 if (rg->to <= f)
793 continue;
794 if (rg->from >= t)
795 break;
796
797 seg_from = max(rg->from, f);
798 seg_to = min(rg->to, t);
799
800 chg += seg_to - seg_from;
801 }
802 spin_unlock(&resv->lock);
803
804 return chg;
805}
806
807
808
809
810
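/*
 * Convert the address within this vma to the page offset within the mapping,
 * in pagecache page units for the given huge page size.
 */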
811static pgoff_t vma_hugecache_offset(struct hstate *h,
812 struct vm_area_struct *vma, unsigned long address)
813{
814 return ((address - vma->vm_start) >> huge_page_shift(h)) +
815 (vma->vm_pgoff >> huge_page_order(h));
816}
817
818pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
819 unsigned long address)
820{
821 return vma_hugecache_offset(hstate_vma(vma), vma, address);
822}
823EXPORT_SYMBOL_GPL(linear_hugepage_index);
824
825
826
827
828
829unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
830{
831 if (vma->vm_ops && vma->vm_ops->pagesize)
832 return vma->vm_ops->pagesize(vma);
833 return PAGE_SIZE;
834}
835EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
836
837
838
839
840
841
842
843__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
844{
845 return vma_kernel_pagesize(vma);
846}
847
848
849
850
851
852
853#define HPAGE_RESV_OWNER (1UL << 0)
854#define HPAGE_RESV_UNMAPPED (1UL << 1)
855#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
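/*
 * For private hugetlb mappings, vm_private_data doubles as a pointer to the
 * reserve map with the low HPAGE_RESV_* flag bits recording ownership state.
 * These helpers access the raw value; callers mask with HPAGE_RESV_MASK to
 * separate the map pointer from the flags.
 */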
876static unsigned long get_vma_private_data(struct vm_area_struct *vma)
877{
878 return (unsigned long)vma->vm_private_data;
879}
880
881static void set_vma_private_data(struct vm_area_struct *vma,
882 unsigned long value)
883{
884 vma->vm_private_data = (void *)value;
885}
886
887static void
888resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
889 struct hugetlb_cgroup *h_cg,
890 struct hstate *h)
891{
892#ifdef CONFIG_CGROUP_HUGETLB
893 if (!h_cg || !h) {
894 resv_map->reservation_counter = NULL;
895 resv_map->pages_per_hpage = 0;
896 resv_map->css = NULL;
897 } else {
898 resv_map->reservation_counter =
899 &h_cg->rsvd_hugepage[hstate_index(h)];
900 resv_map->pages_per_hpage = pages_per_huge_page(h);
901 resv_map->css = &h_cg->css;
902 }
903#endif
904}
905
906struct resv_map *resv_map_alloc(void)
907{
908 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
909 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
910
911 if (!resv_map || !rg) {
912 kfree(resv_map);
913 kfree(rg);
914 return NULL;
915 }
916
917 kref_init(&resv_map->refs);
918 spin_lock_init(&resv_map->lock);
919 INIT_LIST_HEAD(&resv_map->regions);
920
921 resv_map->adds_in_progress = 0;
922
923
924
925
926
927
928 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
929
930 INIT_LIST_HEAD(&resv_map->region_cache);
931 list_add(&rg->link, &resv_map->region_cache);
932 resv_map->region_cache_count = 1;
933
934 return resv_map;
935}
936
937void resv_map_release(struct kref *ref)
938{
939 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
940 struct list_head *head = &resv_map->region_cache;
941 struct file_region *rg, *trg;
942
943
944 region_del(resv_map, 0, LONG_MAX);
945
946
947 list_for_each_entry_safe(rg, trg, head, link) {
948 list_del(&rg->link);
949 kfree(rg);
950 }
951
952 VM_BUG_ON(resv_map->adds_in_progress);
953
954 kfree(resv_map);
955}
956
957static inline struct resv_map *inode_resv_map(struct inode *inode)
958{
959
960
961
962
963
964
965
966
967 return (struct resv_map *)(&inode->i_data)->private_data;
968}
969
970static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
971{
972 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
973 if (vma->vm_flags & VM_MAYSHARE) {
974 struct address_space *mapping = vma->vm_file->f_mapping;
975 struct inode *inode = mapping->host;
976
977 return inode_resv_map(inode);
978
979 } else {
980 return (struct resv_map *)(get_vma_private_data(vma) &
981 ~HPAGE_RESV_MASK);
982 }
983}
984
985static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
986{
987 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
988 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
989
990 set_vma_private_data(vma, (get_vma_private_data(vma) &
991 HPAGE_RESV_MASK) | (unsigned long)map);
992}
993
994static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
995{
996 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
997 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
998
999 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
1000}
1001
1002static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
1003{
1004 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1005
1006 return (get_vma_private_data(vma) & flag) != 0;
1007}
1008
1009
1010void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
1011{
1012 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1013 if (!(vma->vm_flags & VM_MAYSHARE))
1014 vma->vm_private_data = (void *)0;
1015}
1016
1017
1018
1019
1020
1021
1022
1023
1024void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
1025{
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038 struct resv_map *reservations = vma_resv_map(vma);
1039
1040 if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1041 resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
1042 kref_put(&reservations->refs, resv_map_release);
1043 }
1044
1045 reset_vma_resv_huge_pages(vma);
1046}
1047
1048
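/*
 * Return true if the vma has a reservation that can be consumed for the huge
 * page about to be allocated at this address.  'chg' is the additional
 * reservation (as computed by vma_needs_reservation()) that would be
 * required; a non-zero chg means no existing reservation covers this page.
 */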
1049static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
1050{
1051 if (vma->vm_flags & VM_NORESERVE) {
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
1062 return true;
1063 else
1064 return false;
1065 }
1066
1067
1068 if (vma->vm_flags & VM_MAYSHARE) {
1069
1070
1071
1072
1073
1074
1075
1076 if (chg)
1077 return false;
1078 else
1079 return true;
1080 }
1081
1082
1083
1084
1085
1086 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102 if (chg)
1103 return false;
1104 else
1105 return true;
1106 }
1107
1108 return false;
1109}
1110
1111static void enqueue_huge_page(struct hstate *h, struct page *page)
1112{
1113 int nid = page_to_nid(page);
1114
1115 lockdep_assert_held(&hugetlb_lock);
1116 VM_BUG_ON_PAGE(page_count(page), page);
1117
1118 list_move(&page->lru, &h->hugepage_freelists[nid]);
1119 h->free_huge_pages++;
1120 h->free_huge_pages_node[nid]++;
1121 SetHPageFreed(page);
1122}
1123
1124static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
1125{
1126 struct page *page;
1127 bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1128
1129 lockdep_assert_held(&hugetlb_lock);
1130 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
1131 if (pin && !is_pinnable_page(page))
1132 continue;
1133
1134 if (PageHWPoison(page))
1135 continue;
1136
1137 list_move(&page->lru, &h->hugepage_activelist);
1138 set_page_refcounted(page);
1139 ClearHPageFreed(page);
1140 h->free_huge_pages--;
1141 h->free_huge_pages_node[nid]--;
1142 return page;
1143 }
1144
1145 return NULL;
1146}
1147
1148static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
1149 nodemask_t *nmask)
1150{
1151 unsigned int cpuset_mems_cookie;
1152 struct zonelist *zonelist;
1153 struct zone *zone;
1154 struct zoneref *z;
1155 int node = NUMA_NO_NODE;
1156
1157 zonelist = node_zonelist(nid, gfp_mask);
1158
1159retry_cpuset:
1160 cpuset_mems_cookie = read_mems_allowed_begin();
1161 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1162 struct page *page;
1163
1164 if (!cpuset_zone_allowed(zone, gfp_mask))
1165 continue;
1166
1167
1168
1169
1170 if (zone_to_nid(zone) == node)
1171 continue;
1172 node = zone_to_nid(zone);
1173
1174 page = dequeue_huge_page_node_exact(h, node);
1175 if (page)
1176 return page;
1177 }
1178 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1179 goto retry_cpuset;
1180
1181 return NULL;
1182}
1183
1184static struct page *dequeue_huge_page_vma(struct hstate *h,
1185 struct vm_area_struct *vma,
1186 unsigned long address, int avoid_reserve,
1187 long chg)
1188{
1189 struct page *page = NULL;
1190 struct mempolicy *mpol;
1191 gfp_t gfp_mask;
1192 nodemask_t *nodemask;
1193 int nid;
1194
1195
1196
1197
1198
1199
1200 if (!vma_has_reserves(vma, chg) &&
1201 h->free_huge_pages - h->resv_huge_pages == 0)
1202 goto err;
1203
1204
1205 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
1206 goto err;
1207
1208 gfp_mask = htlb_alloc_mask(h);
1209 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1210
1211 if (mpol_is_preferred_many(mpol)) {
1212 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1213
1214
1215 nodemask = NULL;
1216 }
1217
1218 if (!page)
1219 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1220
1221 if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
1222 SetHPageRestoreReserve(page);
1223 h->resv_huge_pages--;
1224 }
1225
1226 mpol_cond_put(mpol);
1227 return page;
1228
1229err:
1230 return NULL;
1231}
1232
1233
1234
1235
1236
1237
1238
1239
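/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */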
1240static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1241{
1242 nid = next_node_in(nid, *nodes_allowed);
1243 VM_BUG_ON(nid >= MAX_NUMNODES);
1244
1245 return nid;
1246}
1247
1248static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1249{
1250 if (!node_isset(nid, *nodes_allowed))
1251 nid = next_node_allowed(nid, nodes_allowed);
1252 return nid;
1253}
1254
1255
1256
1257
1258
1259
1260
1261static int hstate_next_node_to_alloc(struct hstate *h,
1262 nodemask_t *nodes_allowed)
1263{
1264 int nid;
1265
1266 VM_BUG_ON(!nodes_allowed);
1267
1268 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
1269 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1270
1271 return nid;
1272}
1273
1274
1275
1276
1277
1278
1279
1280static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1281{
1282 int nid;
1283
1284 VM_BUG_ON(!nodes_allowed);
1285
1286 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1287 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1288
1289 return nid;
1290}
1291
1292#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
1293 for (nr_nodes = nodes_weight(*mask); \
1294 nr_nodes > 0 && \
1295 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
1296 nr_nodes--)
1297
1298#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
1299 for (nr_nodes = nodes_weight(*mask); \
1300 nr_nodes > 0 && \
1301 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
1302 nr_nodes--)
1303
1304
1305static void __destroy_compound_gigantic_page(struct page *page,
1306 unsigned int order, bool demote)
1307{
1308 int i;
1309 int nr_pages = 1 << order;
1310 struct page *p = page + 1;
1311
1312 atomic_set(compound_mapcount_ptr(page), 0);
1313 atomic_set(compound_pincount_ptr(page), 0);
1314
1315 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1316 p->mapping = NULL;
1317 clear_compound_head(p);
1318 if (!demote)
1319 set_page_refcounted(p);
1320 }
1321
1322 set_compound_order(page, 0);
1323 page[1].compound_nr = 0;
1324 __ClearPageHead(page);
1325}
1326
1327static void destroy_compound_hugetlb_page_for_demote(struct page *page,
1328 unsigned int order)
1329{
1330 __destroy_compound_gigantic_page(page, order, true);
1331}
1332
1333#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1334static void destroy_compound_gigantic_page(struct page *page,
1335 unsigned int order)
1336{
1337 __destroy_compound_gigantic_page(page, order, false);
1338}
1339
1340static void free_gigantic_page(struct page *page, unsigned int order)
1341{
1342
1343
1344
1345
1346#ifdef CONFIG_CMA
1347 if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
1348 return;
1349#endif
1350
1351 free_contig_range(page_to_pfn(page), 1 << order);
1352}
1353
1354#ifdef CONFIG_CONTIG_ALLOC
1355static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1356 int nid, nodemask_t *nodemask)
1357{
1358 unsigned long nr_pages = pages_per_huge_page(h);
1359 if (nid == NUMA_NO_NODE)
1360 nid = numa_mem_id();
1361
1362#ifdef CONFIG_CMA
1363 {
1364 struct page *page;
1365 int node;
1366
1367 if (hugetlb_cma[nid]) {
1368 page = cma_alloc(hugetlb_cma[nid], nr_pages,
1369 huge_page_order(h), true);
1370 if (page)
1371 return page;
1372 }
1373
1374 if (!(gfp_mask & __GFP_THISNODE)) {
1375 for_each_node_mask(node, *nodemask) {
1376 if (node == nid || !hugetlb_cma[node])
1377 continue;
1378
1379 page = cma_alloc(hugetlb_cma[node], nr_pages,
1380 huge_page_order(h), true);
1381 if (page)
1382 return page;
1383 }
1384 }
1385 }
1386#endif
1387
1388 return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1389}
1390
1391#else
1392static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1393 int nid, nodemask_t *nodemask)
1394{
1395 return NULL;
1396}
1397#endif
1398
1399#else
1400static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1401 int nid, nodemask_t *nodemask)
1402{
1403 return NULL;
1404}
1405static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1406static inline void destroy_compound_gigantic_page(struct page *page,
1407 unsigned int order) { }
1408#endif
1409
1410
1411
1412
1413
1414
1415
1416
1417
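/*
 * Remove hugetlb page from lists, and update dtor so that the page appears
 * as just a compound page.  Optionally adjust the surplus counts.  A
 * reference is taken on the page unless it is being demoted.
 *
 * Must be called with hugetlb lock held.
 */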
1418static void __remove_hugetlb_page(struct hstate *h, struct page *page,
1419 bool adjust_surplus,
1420 bool demote)
1421{
1422 int nid = page_to_nid(page);
1423
1424 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1425 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
1426
1427 lockdep_assert_held(&hugetlb_lock);
1428 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1429 return;
1430
1431 list_del(&page->lru);
1432
1433 if (HPageFreed(page)) {
1434 h->free_huge_pages--;
1435 h->free_huge_pages_node[nid]--;
1436 }
1437 if (adjust_surplus) {
1438 h->surplus_huge_pages--;
1439 h->surplus_huge_pages_node[nid]--;
1440 }
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462 if (!demote)
1463 set_page_refcounted(page);
1464 if (hstate_is_gigantic(h))
1465 set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
1466 else
1467 set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
1468
1469 h->nr_huge_pages--;
1470 h->nr_huge_pages_node[nid]--;
1471}
1472
1473static void remove_hugetlb_page(struct hstate *h, struct page *page,
1474 bool adjust_surplus)
1475{
1476 __remove_hugetlb_page(h, page, adjust_surplus, false);
1477}
1478
1479static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
1480 bool adjust_surplus)
1481{
1482 __remove_hugetlb_page(h, page, adjust_surplus, true);
1483}
1484
1485static void add_hugetlb_page(struct hstate *h, struct page *page,
1486 bool adjust_surplus)
1487{
1488 int zeroed;
1489 int nid = page_to_nid(page);
1490
1491 VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
1492
1493 lockdep_assert_held(&hugetlb_lock);
1494
1495 INIT_LIST_HEAD(&page->lru);
1496 h->nr_huge_pages++;
1497 h->nr_huge_pages_node[nid]++;
1498
1499 if (adjust_surplus) {
1500 h->surplus_huge_pages++;
1501 h->surplus_huge_pages_node[nid]++;
1502 }
1503
1504 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1505 set_page_private(page, 0);
1506 SetHPageVmemmapOptimized(page);
1507
1508
1509
1510
1511
1512
1513 zeroed = put_page_testzero(page);
1514 if (!zeroed)
1515
1516
1517
1518
1519
1520
1521 return;
1522
1523 arch_clear_hugepage_flags(page);
1524 enqueue_huge_page(h, page);
1525}
1526
1527static void __update_and_free_page(struct hstate *h, struct page *page)
1528{
1529 int i;
1530 struct page *subpage = page;
1531
1532 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1533 return;
1534
1535 if (alloc_huge_page_vmemmap(h, page)) {
1536 spin_lock_irq(&hugetlb_lock);
1537
1538
1539
1540
1541
1542 add_hugetlb_page(h, page, true);
1543 spin_unlock_irq(&hugetlb_lock);
1544 return;
1545 }
1546
1547 for (i = 0; i < pages_per_huge_page(h);
1548 i++, subpage = mem_map_next(subpage, page, i)) {
1549 subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1550 1 << PG_referenced | 1 << PG_dirty |
1551 1 << PG_active | 1 << PG_private |
1552 1 << PG_writeback);
1553 }
1554
1555
1556
1557
1558
1559 if (hstate_is_gigantic(h) ||
1560 hugetlb_cma_page(page, huge_page_order(h))) {
1561 destroy_compound_gigantic_page(page, huge_page_order(h));
1562 free_gigantic_page(page, huge_page_order(h));
1563 } else {
1564 __free_pages(page, huge_page_order(h));
1565 }
1566}
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
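/*
 * update_and_free_page() may be called from any context, but freeing a
 * vmemmap-optimized huge page needs to allocate vmemmap pages, which cannot
 * be done in atomic context.  Such frees are therefore deferred to a
 * workqueue: page->mapping is about to be cleared anyway, so it is reused as
 * the llist_node of a lockless list of huge pages that free_hpage_workfn()
 * frees one by one.
 */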
1579static LLIST_HEAD(hpage_freelist);
1580
1581static void free_hpage_workfn(struct work_struct *work)
1582{
1583 struct llist_node *node;
1584
1585 node = llist_del_all(&hpage_freelist);
1586
1587 while (node) {
1588 struct page *page;
1589 struct hstate *h;
1590
1591 page = container_of((struct address_space **)node,
1592 struct page, mapping);
1593 node = node->next;
1594 page->mapping = NULL;
1595
1596
1597
1598
1599
1600
1601 h = size_to_hstate(page_size(page));
1602
1603 __update_and_free_page(h, page);
1604
1605 cond_resched();
1606 }
1607}
1608static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1609
1610static inline void flush_free_hpage_work(struct hstate *h)
1611{
1612 if (free_vmemmap_pages_per_hpage(h))
1613 flush_work(&free_hpage_work);
1614}
1615
1616static void update_and_free_page(struct hstate *h, struct page *page,
1617 bool atomic)
1618{
1619 if (!HPageVmemmapOptimized(page) || !atomic) {
1620 __update_and_free_page(h, page);
1621 return;
1622 }
1623
1624
1625
1626
1627
1628
1629
1630
1631 if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
1632 schedule_work(&free_hpage_work);
1633}
1634
1635static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
1636{
1637 struct page *page, *t_page;
1638
1639 list_for_each_entry_safe(page, t_page, list, lru) {
1640 update_and_free_page(h, page, false);
1641 cond_resched();
1642 }
1643}
1644
1645struct hstate *size_to_hstate(unsigned long size)
1646{
1647 struct hstate *h;
1648
1649 for_each_hstate(h) {
1650 if (huge_page_size(h) == size)
1651 return h;
1652 }
1653 return NULL;
1654}
1655
1656void free_huge_page(struct page *page)
1657{
1658
1659
1660
1661
1662 struct hstate *h = page_hstate(page);
1663 int nid = page_to_nid(page);
1664 struct hugepage_subpool *spool = hugetlb_page_subpool(page);
1665 bool restore_reserve;
1666 unsigned long flags;
1667
1668 VM_BUG_ON_PAGE(page_count(page), page);
1669 VM_BUG_ON_PAGE(page_mapcount(page), page);
1670
1671 hugetlb_set_page_subpool(page, NULL);
1672 page->mapping = NULL;
1673 restore_reserve = HPageRestoreReserve(page);
1674 ClearHPageRestoreReserve(page);
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684 if (!restore_reserve) {
1685
1686
1687
1688
1689
1690
1691 if (hugepage_subpool_put_pages(spool, 1) == 0)
1692 restore_reserve = true;
1693 }
1694
1695 spin_lock_irqsave(&hugetlb_lock, flags);
1696 ClearHPageMigratable(page);
1697 hugetlb_cgroup_uncharge_page(hstate_index(h),
1698 pages_per_huge_page(h), page);
1699 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
1700 pages_per_huge_page(h), page);
1701 if (restore_reserve)
1702 h->resv_huge_pages++;
1703
1704 if (HPageTemporary(page)) {
1705 remove_hugetlb_page(h, page, false);
1706 spin_unlock_irqrestore(&hugetlb_lock, flags);
1707 update_and_free_page(h, page, true);
1708 } else if (h->surplus_huge_pages_node[nid]) {
1709
1710 remove_hugetlb_page(h, page, true);
1711 spin_unlock_irqrestore(&hugetlb_lock, flags);
1712 update_and_free_page(h, page, true);
1713 } else {
1714 arch_clear_hugepage_flags(page);
1715 enqueue_huge_page(h, page);
1716 spin_unlock_irqrestore(&hugetlb_lock, flags);
1717 }
1718}
1719
1720
1721
1722
1723static void __prep_account_new_huge_page(struct hstate *h, int nid)
1724{
1725 lockdep_assert_held(&hugetlb_lock);
1726 h->nr_huge_pages++;
1727 h->nr_huge_pages_node[nid]++;
1728}
1729
1730static void __prep_new_huge_page(struct hstate *h, struct page *page)
1731{
1732 free_huge_page_vmemmap(h, page);
1733 INIT_LIST_HEAD(&page->lru);
1734 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1735 hugetlb_set_page_subpool(page, NULL);
1736 set_hugetlb_cgroup(page, NULL);
1737 set_hugetlb_cgroup_rsvd(page, NULL);
1738}
1739
1740static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1741{
1742 __prep_new_huge_page(h, page);
1743 spin_lock_irq(&hugetlb_lock);
1744 __prep_account_new_huge_page(h, nid);
1745 spin_unlock_irq(&hugetlb_lock);
1746}
1747
1748static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
1749 bool demote)
1750{
1751 int i, j;
1752 int nr_pages = 1 << order;
1753 struct page *p = page + 1;
1754
1755
1756 set_compound_order(page, order);
1757 __ClearPageReserved(page);
1758 __SetPageHead(page);
1759 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772 __ClearPageReserved(p);
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790 if (!demote) {
1791 if (!page_ref_freeze(p, 1)) {
1792 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
1793 goto out_error;
1794 }
1795 } else {
1796 VM_BUG_ON_PAGE(page_count(p), p);
1797 }
1798 set_compound_head(p, page);
1799 }
1800 atomic_set(compound_mapcount_ptr(page), -1);
1801 atomic_set(compound_pincount_ptr(page), 0);
1802 return true;
1803
1804out_error:
1805
1806 p = page + 1;
1807 for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
1808 clear_compound_head(p);
1809 set_page_refcounted(p);
1810 }
1811
1812 for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
1813 __ClearPageReserved(p);
1814 set_compound_order(page, 0);
1815 page[1].compound_nr = 0;
1816 __ClearPageHead(page);
1817 return false;
1818}
1819
1820static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
1821{
1822 return __prep_compound_gigantic_page(page, order, false);
1823}
1824
1825static bool prep_compound_gigantic_page_for_demote(struct page *page,
1826 unsigned int order)
1827{
1828 return __prep_compound_gigantic_page(page, order, true);
1829}
1830
1831
1832
1833
1834
1835
1836int PageHuge(struct page *page)
1837{
1838 if (!PageCompound(page))
1839 return 0;
1840
1841 page = compound_head(page);
1842 return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
1843}
1844EXPORT_SYMBOL_GPL(PageHuge);
1845
1846
1847
1848
1849
1850int PageHeadHuge(struct page *page_head)
1851{
1852 if (!PageHead(page_head))
1853 return 0;
1854
1855 return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
1856}
1857
1858
1859
1860
1861
1862
1863
1864
1865struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
1866{
1867 struct address_space *mapping = page_mapping(hpage);
1868
1869 if (!mapping)
1870 return mapping;
1871
1872 if (i_mmap_trylock_write(mapping))
1873 return mapping;
1874
1875 return NULL;
1876}
1877
1878pgoff_t hugetlb_basepage_index(struct page *page)
1879{
1880 struct page *page_head = compound_head(page);
1881 pgoff_t index = page_index(page_head);
1882 unsigned long compound_idx;
1883
1884 if (compound_order(page_head) >= MAX_ORDER)
1885 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
1886 else
1887 compound_idx = page - page_head;
1888
1889 return (index << compound_order(page_head)) + compound_idx;
1890}
1891
1892static struct page *alloc_buddy_huge_page(struct hstate *h,
1893 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1894 nodemask_t *node_alloc_noretry)
1895{
1896 int order = huge_page_order(h);
1897 struct page *page;
1898 bool alloc_try_hard = true;
1899
1900
1901
1902
1903
1904
1905
1906
1907 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1908 alloc_try_hard = false;
1909 gfp_mask |= __GFP_COMP|__GFP_NOWARN;
1910 if (alloc_try_hard)
1911 gfp_mask |= __GFP_RETRY_MAYFAIL;
1912 if (nid == NUMA_NO_NODE)
1913 nid = numa_mem_id();
1914 page = __alloc_pages(gfp_mask, order, nid, nmask);
1915 if (page)
1916 __count_vm_event(HTLB_BUDDY_PGALLOC);
1917 else
1918 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1919
1920
1921
1922
1923
1924
1925 if (node_alloc_noretry && page && !alloc_try_hard)
1926 node_clear(nid, *node_alloc_noretry);
1927
1928
1929
1930
1931
1932
1933 if (node_alloc_noretry && !page && alloc_try_hard)
1934 node_set(nid, *node_alloc_noretry);
1935
1936 return page;
1937}
1938
1939
1940
1941
1942
1943static struct page *alloc_fresh_huge_page(struct hstate *h,
1944 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1945 nodemask_t *node_alloc_noretry)
1946{
1947 struct page *page;
1948 bool retry = false;
1949
1950retry:
1951 if (hstate_is_gigantic(h))
1952 page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
1953 else
1954 page = alloc_buddy_huge_page(h, gfp_mask,
1955 nid, nmask, node_alloc_noretry);
1956 if (!page)
1957 return NULL;
1958
1959 if (hstate_is_gigantic(h)) {
1960 if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
1961
1962
1963
1964
1965 free_gigantic_page(page, huge_page_order(h));
1966 if (!retry) {
1967 retry = true;
1968 goto retry;
1969 }
1970 return NULL;
1971 }
1972 }
1973 prep_new_huge_page(h, page, page_to_nid(page));
1974
1975 return page;
1976}
1977
1978
1979
1980
1981
1982static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
1983 nodemask_t *node_alloc_noretry)
1984{
1985 struct page *page;
1986 int nr_nodes, node;
1987 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
1988
1989 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1990 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
1991 node_alloc_noretry);
1992 if (page)
1993 break;
1994 }
1995
1996 if (!page)
1997 return 0;
1998
1999 put_page(page);
2000
2001 return 1;
2002}
2003
2004
2005
2006
2007
2008
2009
2010
2011static struct page *remove_pool_huge_page(struct hstate *h,
2012 nodemask_t *nodes_allowed,
2013 bool acct_surplus)
2014{
2015 int nr_nodes, node;
2016 struct page *page = NULL;
2017
2018 lockdep_assert_held(&hugetlb_lock);
2019 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2020
2021
2022
2023
2024 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2025 !list_empty(&h->hugepage_freelists[node])) {
2026 page = list_entry(h->hugepage_freelists[node].next,
2027 struct page, lru);
2028 remove_hugetlb_page(h, page, acct_surplus);
2029 break;
2030 }
2031 }
2032
2033 return page;
2034}
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
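/*
 * Dissolve a given free hugepage into free buddy pages.  Does nothing for
 * in-use hugepages and non-hugepages (returning -EBUSY and 0 respectively).
 * May also return -ENOMEM if the vmemmap pages backing the huge page cannot
 * be reallocated; in that case the page is returned to the free pool.
 */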
2050int dissolve_free_huge_page(struct page *page)
2051{
2052 int rc = -EBUSY;
2053
2054retry:
2055
2056 if (!PageHuge(page))
2057 return 0;
2058
2059 spin_lock_irq(&hugetlb_lock);
2060 if (!PageHuge(page)) {
2061 rc = 0;
2062 goto out;
2063 }
2064
2065 if (!page_count(page)) {
2066 struct page *head = compound_head(page);
2067 struct hstate *h = page_hstate(head);
2068 if (h->free_huge_pages - h->resv_huge_pages == 0)
2069 goto out;
2070
2071
2072
2073
2074
2075 if (unlikely(!HPageFreed(head))) {
2076 spin_unlock_irq(&hugetlb_lock);
2077 cond_resched();
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087 goto retry;
2088 }
2089
2090 remove_hugetlb_page(h, head, false);
2091 h->max_huge_pages--;
2092 spin_unlock_irq(&hugetlb_lock);
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102 rc = alloc_huge_page_vmemmap(h, head);
2103 if (!rc) {
2104
2105
2106
2107
2108
2109 if (PageHWPoison(head) && page != head) {
2110 SetPageHWPoison(page);
2111 ClearPageHWPoison(head);
2112 }
2113 update_and_free_page(h, head, false);
2114 } else {
2115 spin_lock_irq(&hugetlb_lock);
2116 add_hugetlb_page(h, head, false);
2117 h->max_huge_pages++;
2118 spin_unlock_irq(&hugetlb_lock);
2119 }
2120
2121 return rc;
2122 }
2123out:
2124 spin_unlock_irq(&hugetlb_lock);
2125 return rc;
2126}
2127
2128
2129
2130
2131
2132
2133
2134
2135
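/*
 * Dissolve free hugepages in a given pfn range.  Used by memory hotplug to
 * make a memory block removable.  Stops at the first error and returns it;
 * pages dissolved before that point are not restored.
 */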
2136int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
2137{
2138 unsigned long pfn;
2139 struct page *page;
2140 int rc = 0;
2141
2142 if (!hugepages_supported())
2143 return rc;
2144
2145 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
2146 page = pfn_to_page(pfn);
2147 rc = dissolve_free_huge_page(page);
2148 if (rc)
2149 break;
2150 }
2151
2152 return rc;
2153}
2154
2155
2156
2157
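/*
 * Allocate a fresh surplus huge page from the buddy allocator, subject to
 * the nr_overcommit_huge_pages limit.  With zero_ref, the page is returned
 * with a zero reference count (retrying once if a transient extra reference
 * is detected).
 */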
2158static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
2159 int nid, nodemask_t *nmask, bool zero_ref)
2160{
2161 struct page *page = NULL;
2162 bool retry = false;
2163
2164 if (hstate_is_gigantic(h))
2165 return NULL;
2166
2167 spin_lock_irq(&hugetlb_lock);
2168 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2169 goto out_unlock;
2170 spin_unlock_irq(&hugetlb_lock);
2171
2172retry:
2173 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2174 if (!page)
2175 return NULL;
2176
2177 spin_lock_irq(&hugetlb_lock);
2178
2179
2180
2181
2182
2183
2184
2185 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2186 SetHPageTemporary(page);
2187 spin_unlock_irq(&hugetlb_lock);
2188 put_page(page);
2189 return NULL;
2190 }
2191
2192 if (zero_ref) {
2193
2194
2195
2196
2197
2198
2199 SetHPageTemporary(page);
2200 if (!put_page_testzero(page)) {
2201
2202
2203
2204
2205 pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
2206 spin_unlock_irq(&hugetlb_lock);
2207 if (retry)
2208 return NULL;
2209
2210 retry = true;
2211 goto retry;
2212 }
2213 ClearHPageTemporary(page);
2214 }
2215
2216 h->surplus_huge_pages++;
2217 h->surplus_huge_pages_node[page_to_nid(page)]++;
2218
2219out_unlock:
2220 spin_unlock_irq(&hugetlb_lock);
2221
2222 return page;
2223}
2224
2225static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
2226 int nid, nodemask_t *nmask)
2227{
2228 struct page *page;
2229
2230 if (hstate_is_gigantic(h))
2231 return NULL;
2232
2233 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2234 if (!page)
2235 return NULL;
2236
2237
2238
2239
2240
2241 SetHPageTemporary(page);
2242
2243 return page;
2244}
2245
2246
2247
2248
2249static
2250struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
2251 struct vm_area_struct *vma, unsigned long addr)
2252{
2253 struct page *page = NULL;
2254 struct mempolicy *mpol;
2255 gfp_t gfp_mask = htlb_alloc_mask(h);
2256 int nid;
2257 nodemask_t *nodemask;
2258
2259 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2260 if (mpol_is_preferred_many(mpol)) {
2261 gfp_t gfp = gfp_mask | __GFP_NOWARN;
2262
2263 gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
2264 page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
2265
2266
2267 nodemask = NULL;
2268 }
2269
2270 if (!page)
2271 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
2272 mpol_cond_put(mpol);
2273 return page;
2274}
2275
2276
2277struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
2278 nodemask_t *nmask, gfp_t gfp_mask)
2279{
2280 spin_lock_irq(&hugetlb_lock);
2281 if (h->free_huge_pages - h->resv_huge_pages > 0) {
2282 struct page *page;
2283
2284 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
2285 if (page) {
2286 spin_unlock_irq(&hugetlb_lock);
2287 return page;
2288 }
2289 }
2290 spin_unlock_irq(&hugetlb_lock);
2291
2292 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
2293}
2294
2295
2296struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
2297 unsigned long address)
2298{
2299 struct mempolicy *mpol;
2300 nodemask_t *nodemask;
2301 struct page *page;
2302 gfp_t gfp_mask;
2303 int node;
2304
2305 gfp_mask = htlb_alloc_mask(h);
2306 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
2307 page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
2308 mpol_cond_put(mpol);
2309
2310 return page;
2311}
2312
2313
2314
2315
2316
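/*
 * Increase the hugetlb pool with surplus pages so that it can accommodate a
 * reservation of size 'delta'.
 */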
2317static int gather_surplus_pages(struct hstate *h, long delta)
2318 __must_hold(&hugetlb_lock)
2319{
2320 struct list_head surplus_list;
2321 struct page *page, *tmp;
2322 int ret;
2323 long i;
2324 long needed, allocated;
2325 bool alloc_ok = true;
2326
2327 lockdep_assert_held(&hugetlb_lock);
2328 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2329 if (needed <= 0) {
2330 h->resv_huge_pages += delta;
2331 return 0;
2332 }
2333
2334 allocated = 0;
2335 INIT_LIST_HEAD(&surplus_list);
2336
2337 ret = -ENOMEM;
2338retry:
2339 spin_unlock_irq(&hugetlb_lock);
2340 for (i = 0; i < needed; i++) {
2341 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
2342 NUMA_NO_NODE, NULL, true);
2343 if (!page) {
2344 alloc_ok = false;
2345 break;
2346 }
2347 list_add(&page->lru, &surplus_list);
2348 cond_resched();
2349 }
2350 allocated += i;
2351
2352
2353
2354
2355
2356 spin_lock_irq(&hugetlb_lock);
2357 needed = (h->resv_huge_pages + delta) -
2358 (h->free_huge_pages + allocated);
2359 if (needed > 0) {
2360 if (alloc_ok)
2361 goto retry;
2362
2363
2364
2365
2366
2367 goto free;
2368 }
2369
2370
2371
2372
2373
2374
2375
2376
2377 needed += allocated;
2378 h->resv_huge_pages += delta;
2379 ret = 0;
2380
2381
2382 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
2383 if ((--needed) < 0)
2384 break;
2385
2386 enqueue_huge_page(h, page);
2387 }
2388free:
2389 spin_unlock_irq(&hugetlb_lock);
2390
2391
2392
2393
2394
2395 list_for_each_entry_safe(page, tmp, &surplus_list, lru)
2396 free_huge_page(page);
2397 spin_lock_irq(&hugetlb_lock);
2398
2399 return ret;
2400}
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410static void return_unused_surplus_pages(struct hstate *h,
2411 unsigned long unused_resv_pages)
2412{
2413 unsigned long nr_pages;
2414 struct page *page;
2415 LIST_HEAD(page_list);
2416
2417 lockdep_assert_held(&hugetlb_lock);
2418
2419 h->resv_huge_pages -= unused_resv_pages;
2420
2421
2422 if (hstate_is_gigantic(h))
2423 goto out;
2424
2425
2426
2427
2428
2429 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439 while (nr_pages--) {
2440 page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
2441 if (!page)
2442 goto out;
2443
2444 list_add(&page->lru, &page_list);
2445 }
2446
2447out:
2448 spin_unlock_irq(&hugetlb_lock);
2449 update_and_free_pages_bulk(h, &page_list);
2450 spin_lock_irq(&hugetlb_lock);
2451}
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
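/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
 *
 * vma_needs_reservation is called to determine if the huge page at addr
 * within the vma has an associated reservation.  If a reservation is
 * needed, the value 1 is returned.  The caller is then responsible for
 * managing the global reservation and subpool usage counts.  After the
 * huge page has been allocated, vma_commit_reservation is called to add
 * the page to the reservation map.  If the page allocation fails, the
 * reservation must be ended instead of committed; vma_end_reservation
 * is called in such cases.
 *
 * vma_add_reservation and vma_del_reservation are used on error paths
 * (see restore_reserve_on_error) to adjust the reservation map after the
 * page has already been allocated or is being freed.
 */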
2483enum vma_resv_mode {
2484 VMA_NEEDS_RESV,
2485 VMA_COMMIT_RESV,
2486 VMA_END_RESV,
2487 VMA_ADD_RESV,
2488 VMA_DEL_RESV,
2489};
2490static long __vma_reservation_common(struct hstate *h,
2491 struct vm_area_struct *vma, unsigned long addr,
2492 enum vma_resv_mode mode)
2493{
2494 struct resv_map *resv;
2495 pgoff_t idx;
2496 long ret;
2497 long dummy_out_regions_needed;
2498
2499 resv = vma_resv_map(vma);
2500 if (!resv)
2501 return 1;
2502
2503 idx = vma_hugecache_offset(h, vma, addr);
2504 switch (mode) {
2505 case VMA_NEEDS_RESV:
2506 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2507
2508
2509
2510
2511 VM_BUG_ON(dummy_out_regions_needed != 1);
2512 break;
2513 case VMA_COMMIT_RESV:
2514 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2515
2516 VM_BUG_ON(ret < 0);
2517 break;
2518 case VMA_END_RESV:
2519 region_abort(resv, idx, idx + 1, 1);
2520 ret = 0;
2521 break;
2522 case VMA_ADD_RESV:
2523 if (vma->vm_flags & VM_MAYSHARE) {
2524 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2525
2526 VM_BUG_ON(ret < 0);
2527 } else {
2528 region_abort(resv, idx, idx + 1, 1);
2529 ret = region_del(resv, idx, idx + 1);
2530 }
2531 break;
2532 case VMA_DEL_RESV:
2533 if (vma->vm_flags & VM_MAYSHARE) {
2534 region_abort(resv, idx, idx + 1, 1);
2535 ret = region_del(resv, idx, idx + 1);
2536 } else {
2537 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2538
2539 VM_BUG_ON(ret < 0);
2540 }
2541 break;
2542 default:
2543 BUG();
2544 }
2545
2546 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2547 return ret;
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563 if (ret > 0)
2564 return 0;
2565 if (ret == 0)
2566 return 1;
2567 return ret;
2568}
2569
2570static long vma_needs_reservation(struct hstate *h,
2571 struct vm_area_struct *vma, unsigned long addr)
2572{
2573 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
2574}
2575
2576static long vma_commit_reservation(struct hstate *h,
2577 struct vm_area_struct *vma, unsigned long addr)
2578{
2579 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2580}
2581
2582static void vma_end_reservation(struct hstate *h,
2583 struct vm_area_struct *vma, unsigned long addr)
2584{
2585 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
2586}
2587
2588static long vma_add_reservation(struct hstate *h,
2589 struct vm_area_struct *vma, unsigned long addr)
2590{
2591 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2592}
2593
2594static long vma_del_reservation(struct hstate *h,
2595 struct vm_area_struct *vma, unsigned long addr)
2596{
2597 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2598}
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
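/*
 * This routine is called to restore reservation information on error paths.
 * It should only be called for pages obtained from alloc_huge_page() that
 * are about to be freed because an error was encountered before the page
 * could be added to the page cache or installed in a page table.  It adjusts
 * HPageRestoreReserve and the reserve map so that the global reserve count
 * remains consistent when the page is freed.
 */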
2620void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
2621 unsigned long address, struct page *page)
2622{
2623 long rc = vma_needs_reservation(h, vma, address);
2624
2625 if (HPageRestoreReserve(page)) {
2626 if (unlikely(rc < 0))
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638 ClearHPageRestoreReserve(page);
2639 else if (rc)
2640 (void)vma_add_reservation(h, vma, address);
2641 else
2642 vma_end_reservation(h, vma, address);
2643 } else {
2644 if (!rc) {
2645
2646
2647
2648
2649
2650
2651
2652
2653 rc = vma_del_reservation(h, vma, address);
2654 if (rc < 0)
2655
2656
2657
2658
2659
2660
2661
2662
2663 SetHPageRestoreReserve(page);
2664 } else if (rc < 0) {
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675 if (!(vma->vm_flags & VM_MAYSHARE))
2676
2677
2678
2679
2680
2681
2682
2683
2684 SetHPageRestoreReserve(page);
2685 } else
2686
2687
2688
2689 vma_end_reservation(h, vma, address);
2690 }
2691}
2692
2693
2694
2695
2696
2697
2698
2699
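/*
 * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
 * @h: struct hstate old page belongs to
 * @old_page: Old page to dissolve
 * @list: List to isolate the page in case we need to
 * Returns 0 on success, otherwise negated error.
 */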
2700static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
2701 struct list_head *list)
2702{
2703 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2704 int nid = page_to_nid(old_page);
2705 bool alloc_retry = false;
2706 struct page *new_page;
2707 int ret = 0;
2708
2709
2710
2711
2712
2713
2714
2715
2716alloc_retry:
2717 new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
2718 if (!new_page)
2719 return -ENOMEM;
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730 SetHPageTemporary(new_page);
2731 if (!put_page_testzero(new_page)) {
2732 if (alloc_retry)
2733 return -EBUSY;
2734
2735 alloc_retry = true;
2736 goto alloc_retry;
2737 }
2738 ClearHPageTemporary(new_page);
2739
2740 __prep_new_huge_page(h, new_page);
2741
2742retry:
2743 spin_lock_irq(&hugetlb_lock);
2744 if (!PageHuge(old_page)) {
2745
2746
2747
2748 goto free_new;
2749 } else if (page_count(old_page)) {
2750
2751
2752
2753
2754 spin_unlock_irq(&hugetlb_lock);
2755 if (!isolate_huge_page(old_page, list))
2756 ret = -EBUSY;
2757 spin_lock_irq(&hugetlb_lock);
2758 goto free_new;
2759 } else if (!HPageFreed(old_page)) {
2760
2761
2762
2763
2764
2765 spin_unlock_irq(&hugetlb_lock);
2766 cond_resched();
2767 goto retry;
2768 } else {
2769
2770
2771
2772
2773
2774
2775
2776 remove_hugetlb_page(h, old_page, false);
2777
2778
2779
2780
2781
2782 __prep_account_new_huge_page(h, nid);
2783 enqueue_huge_page(h, new_page);
2784
2785
2786
2787
2788 spin_unlock_irq(&hugetlb_lock);
2789 update_and_free_page(h, old_page, false);
2790 }
2791
2792 return ret;
2793
2794free_new:
2795 spin_unlock_irq(&hugetlb_lock);
2796
2797 set_page_refcounted(new_page);
2798 update_and_free_page(h, new_page, false);
2799
2800 return ret;
2801}
2802
2803int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
2804{
2805 struct hstate *h;
2806 struct page *head;
2807 int ret = -EBUSY;
2808
2809
2810
2811
2812
2813
2814 spin_lock_irq(&hugetlb_lock);
2815 if (PageHuge(page)) {
2816 head = compound_head(page);
2817 h = page_hstate(head);
2818 } else {
2819 spin_unlock_irq(&hugetlb_lock);
2820 return 0;
2821 }
2822 spin_unlock_irq(&hugetlb_lock);
2823
2824
2825
2826
2827
2828
2829 if (hstate_is_gigantic(h))
2830 return -ENOMEM;
2831
2832 if (page_count(head) && isolate_huge_page(head, list))
2833 ret = 0;
2834 else if (!page_count(head))
2835 ret = alloc_and_dissolve_huge_page(h, head, list);
2836
2837 return ret;
2838}
2839
2840struct page *alloc_huge_page(struct vm_area_struct *vma,
2841 unsigned long addr, int avoid_reserve)
2842{
2843 struct hugepage_subpool *spool = subpool_vma(vma);
2844 struct hstate *h = hstate_vma(vma);
2845 struct page *page;
2846 long map_chg, map_commit;
2847 long gbl_chg;
2848 int ret, idx;
2849 struct hugetlb_cgroup *h_cg;
2850 bool deferred_reserve;
2851
2852 idx = hstate_index(h);
2853
2854
2855
2856
2857
2858 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
2859 if (map_chg < 0)
2860 return ERR_PTR(-ENOMEM);
2861
2862
2863
2864
2865
2866
2867
2868
2869 if (map_chg || avoid_reserve) {
2870 gbl_chg = hugepage_subpool_get_pages(spool, 1);
2871 if (gbl_chg < 0) {
2872 vma_end_reservation(h, vma, addr);
2873 return ERR_PTR(-ENOSPC);
2874 }
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884 if (avoid_reserve)
2885 gbl_chg = 1;
2886 }
2887
2888
2889
2890 deferred_reserve = map_chg || avoid_reserve;
2891 if (deferred_reserve) {
2892 ret = hugetlb_cgroup_charge_cgroup_rsvd(
2893 idx, pages_per_huge_page(h), &h_cg);
2894 if (ret)
2895 goto out_subpool_put;
2896 }
2897
2898 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
2899 if (ret)
2900 goto out_uncharge_cgroup_reservation;
2901
2902 spin_lock_irq(&hugetlb_lock);
	/*
	 * gbl_chg tells dequeue_huge_page_vma() whether a page must come out
	 * of the global free pool: gbl_chg == 0 means a reservation already
	 * covers this allocation.
	 */
2908 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
2909 if (!page) {
2910 spin_unlock_irq(&hugetlb_lock);
2911 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
2912 if (!page)
2913 goto out_uncharge_cgroup;
2914 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
2915 SetHPageRestoreReserve(page);
2916 h->resv_huge_pages--;
2917 }
2918 spin_lock_irq(&hugetlb_lock);
2919 list_add(&page->lru, &h->hugepage_activelist);
2920
2921 }
2922 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
2923
2924
2925
2926 if (deferred_reserve) {
2927 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
2928 h_cg, page);
2929 }
2930
2931 spin_unlock_irq(&hugetlb_lock);
2932
2933 hugetlb_set_page_subpool(page, spool);
2934
2935 map_commit = vma_commit_reservation(h, vma, addr);
2936 if (unlikely(map_chg > map_commit)) {
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946 long rsv_adjust;
2947
2948 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
2949 hugetlb_acct_memory(h, -rsv_adjust);
2950 if (deferred_reserve)
2951 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
2952 pages_per_huge_page(h), page);
2953 }
2954 return page;
2955
2956out_uncharge_cgroup:
2957 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
2958out_uncharge_cgroup_reservation:
2959 if (deferred_reserve)
2960 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
2961 h_cg);
2962out_subpool_put:
2963 if (map_chg || avoid_reserve)
2964 hugepage_subpool_put_pages(spool, 1);
2965 vma_end_reservation(h, vma, addr);
2966 return ERR_PTR(-ENOSPC);
2967}
2968
2969int alloc_bootmem_huge_page(struct hstate *h, int nid)
2970 __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
2971int __alloc_bootmem_huge_page(struct hstate *h, int nid)
2972{
2973 struct huge_bootmem_page *m = NULL;
2974 int nr_nodes, node;
2975
2976 if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
2977 return 0;
2978
2979 if (nid != NUMA_NO_NODE) {
2980 m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
2981 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
2982 if (!m)
2983 return 0;
2984 goto found;
2985 }
2986
2987 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
2988 m = memblock_alloc_try_nid_raw(
2989 huge_page_size(h), huge_page_size(h),
2990 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
		/*
		 * The beginning of the huge page itself is used to hold the
		 * huge_bootmem_page struct until gather_bootmem_prealloc()
		 * adds the page to the mem_map.
		 */
2996 if (!m)
2997 return 0;
2998 goto found;
2999 }
3000
3001found:
3002
3003 INIT_LIST_HEAD(&m->list);
3004 list_add(&m->list, &huge_boot_pages);
3005 m->hstate = h;
3006 return 1;
3007}
3008
/*
 * Put bootmem allocated huge pages onto the regular hstate lists once the
 * mem_map is up.  This only applies to gigantic pages, which are the only
 * ones allocated from bootmem.
 */
3013static void __init gather_bootmem_prealloc(void)
3014{
3015 struct huge_bootmem_page *m;
3016
3017 list_for_each_entry(m, &huge_boot_pages, list) {
3018 struct page *page = virt_to_page(m);
3019 struct hstate *h = m->hstate;
3020
3021 VM_BUG_ON(!hstate_is_gigantic(h));
3022 WARN_ON(page_count(page) != 1);
3023 if (prep_compound_gigantic_page(page, huge_page_order(h))) {
3024 WARN_ON(PageReserved(page));
3025 prep_new_huge_page(h, page, page_to_nid(page));
3026 put_page(page);
3027 } else {
3028
3029 free_gigantic_page(page, huge_page_order(h));
3030 }
3031
3032
3033
3034
3035
3036
3037 adjust_managed_page_count(page, pages_per_huge_page(h));
3038 cond_resched();
3039 }
3040}
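
/*
 * Boot time allocation of the huge page pool requested via a node specific
 * "hugepages=N:X" parameter, for a single node @nid.
 */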
3041static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
3042{
3043 unsigned long i;
3044 char buf[32];
3045
3046 for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3047 if (hstate_is_gigantic(h)) {
3048 if (!alloc_bootmem_huge_page(h, nid))
3049 break;
3050 } else {
3051 struct page *page;
3052 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
3053
3054 page = alloc_fresh_huge_page(h, gfp_mask, nid,
3055 &node_states[N_MEMORY], NULL);
3056 if (!page)
3057 break;
3058 put_page(page);
3059 }
3060 cond_resched();
3061 }
3062 if (i == h->max_huge_pages_node[nid])
3063 return;
3064
3065 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3066 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
3067 h->max_huge_pages_node[nid], buf, nid, i);
3068 h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3069 h->max_huge_pages_node[nid] = i;
3070}
3071
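/*
 * Boot time allocation of the huge page pool for hstate @h.  Node specific
 * "hugepages=" requests are honoured first; otherwise pages are allocated
 * interleaved across all nodes with memory.
 */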
3072static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
3073{
3074 unsigned long i;
3075 nodemask_t *node_alloc_noretry;
3076 bool node_specific_alloc = false;
3077
3078
3079 if (hstate_is_gigantic(h) && hugetlb_cma_size) {
3080 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
3081 return;
3082 }
3083
3084
3085 for (i = 0; i < nr_online_nodes; i++) {
3086 if (h->max_huge_pages_node[i] > 0) {
3087 hugetlb_hstate_alloc_pages_onenode(h, i);
3088 node_specific_alloc = true;
3089 }
3090 }
3091
3092 if (node_specific_alloc)
3093 return;
3094
3095
3096 if (!hstate_is_gigantic(h)) {
		/*
		 * Bit mask controlling how hard we retry per-node
		 * allocations.  Errors are ignored: the lower level routines
		 * cope with node_alloc_noretry == NULL, and if this kmalloc
		 * fails at boot time we have bigger problems anyway.
		 */
3103 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
3104 GFP_KERNEL);
3105 } else {
3106
3107 node_alloc_noretry = NULL;
3108 }
3109
3110
3111 if (node_alloc_noretry)
3112 nodes_clear(*node_alloc_noretry);
3113
3114 for (i = 0; i < h->max_huge_pages; ++i) {
3115 if (hstate_is_gigantic(h)) {
3116 if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
3117 break;
3118 } else if (!alloc_pool_huge_page(h,
3119 &node_states[N_MEMORY],
3120 node_alloc_noretry))
3121 break;
3122 cond_resched();
3123 }
3124 if (i < h->max_huge_pages) {
3125 char buf[32];
3126
3127 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3128 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
3129 h->max_huge_pages, buf, i);
3130 h->max_huge_pages = i;
3131 }
3132 kfree(node_alloc_noretry);
3133}
3134
3135static void __init hugetlb_init_hstates(void)
3136{
3137 struct hstate *h, *h2;
3138
3139 for_each_hstate(h) {
3140 if (minimum_order > huge_page_order(h))
3141 minimum_order = huge_page_order(h);
3142
3143
3144 if (!hstate_is_gigantic(h))
3145 hugetlb_hstate_alloc_pages(h);
3146
		/*
		 * Set the demote order for each hstate (initially 0).
		 * Gigantic pages cannot be demoted if runtime freeing is not
		 * supported, and when hugetlb_cma is in use pages of
		 * HUGETLB_PAGE_ORDER or smaller are not demoted.
		 */
3155 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3156 continue;
3157 if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3158 continue;
3159 for_each_hstate(h2) {
3160 if (h2 == h)
3161 continue;
3162 if (h2->order < h->order &&
3163 h2->order > h->demote_order)
3164 h->demote_order = h2->order;
3165 }
3166 }
3167 VM_BUG_ON(minimum_order == UINT_MAX);
3168}
3169
3170static void __init report_hugepages(void)
3171{
3172 struct hstate *h;
3173
3174 for_each_hstate(h) {
3175 char buf[32];
3176
3177 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
3178 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
3179 buf, h->free_huge_pages);
3180 }
3181}
3182
3183#ifdef CONFIG_HIGHMEM
3184static void try_to_free_low(struct hstate *h, unsigned long count,
3185 nodemask_t *nodes_allowed)
3186{
3187 int i;
3188 LIST_HEAD(page_list);
3189
3190 lockdep_assert_held(&hugetlb_lock);
3191 if (hstate_is_gigantic(h))
3192 return;
3193
3194
3195
3196
3197 for_each_node_mask(i, *nodes_allowed) {
3198 struct page *page, *next;
3199 struct list_head *freel = &h->hugepage_freelists[i];
3200 list_for_each_entry_safe(page, next, freel, lru) {
3201 if (count >= h->nr_huge_pages)
3202 goto out;
3203 if (PageHighMem(page))
3204 continue;
3205 remove_hugetlb_page(h, page, false);
3206 list_add(&page->lru, &page_list);
3207 }
3208 }
3209
3210out:
3211 spin_unlock_irq(&hugetlb_lock);
3212 update_and_free_pages_bulk(h, &page_list);
3213 spin_lock_irq(&hugetlb_lock);
3214}
3215#else
3216static inline void try_to_free_low(struct hstate *h, unsigned long count,
3217 nodemask_t *nodes_allowed)
3218{
3219}
3220#endif
3221
3222
3223
3224
3225
3226
3227static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
3228 int delta)
3229{
3230 int nr_nodes, node;
3231
3232 lockdep_assert_held(&hugetlb_lock);
3233 VM_BUG_ON(delta != -1 && delta != 1);
3234
3235 if (delta < 0) {
3236 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
3237 if (h->surplus_huge_pages_node[node])
3238 goto found;
3239 }
3240 } else {
3241 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3242 if (h->surplus_huge_pages_node[node] <
3243 h->nr_huge_pages_node[node])
3244 goto found;
3245 }
3246 }
3247 return 0;
3248
3249found:
3250 h->surplus_huge_pages += delta;
3251 h->surplus_huge_pages_node[node] += delta;
3252 return 1;
3253}
3254
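/* Pages in the pool that are not temporary overcommit (surplus) pages. */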
3255#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3256static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
3257 nodemask_t *nodes_allowed)
3258{
3259 unsigned long min_count, ret;
3260 struct page *page;
3261 LIST_HEAD(page_list);
3262 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3263
	/*
	 * Bit mask controlling how hard we retry per-node allocations.
	 * If the mask itself cannot be allocated, do not attempt to
	 * allocate the requested huge pages.
	 */
3269 if (node_alloc_noretry)
3270 nodes_clear(*node_alloc_noretry);
3271 else
3272 return -ENOMEM;
3273
3274
3275
3276
3277
3278 mutex_lock(&h->resize_lock);
3279 flush_free_hpage_work(h);
3280 spin_lock_irq(&hugetlb_lock);
3281
	/*
	 * Check for a node specific request.  Adjusting a node specific
	 * count may require changing the global count; in any case the
	 * passed nodes_allowed mask restricts alloc/free to that node.
	 */
3288 if (nid != NUMA_NO_NODE) {
3289 unsigned long old_count = count;
3290
3291 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
		/*
		 * A very large user supplied count can overflow the
		 * adjustment above.  Such a request means "as many huge
		 * pages as possible", so clamp count to ULONG_MAX.
		 */
3298 if (count < old_count)
3299 count = ULONG_MAX;
3300 }
3301
	/*
	 * Runtime allocation of gigantic pages depends on the ability to
	 * allocate large contiguous ranges (CONFIG_CONTIG_ALLOC).  Without
	 * it, refuse to grow the pool but still allow boot time allocated
	 * gigantic pages to be freed.
	 */
3309 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3310 if (count > persistent_huge_pages(h)) {
3311 spin_unlock_irq(&hugetlb_lock);
3312 mutex_unlock(&h->resize_lock);
3313 NODEMASK_FREE(node_alloc_noretry);
3314 return -EINVAL;
3315 }
3316
3317 }
3318
	/*
	 * Increase the pool size.  First take pages out of surplus state,
	 * then make up the remaining difference with fresh allocations.
	 *
	 * We might race with alloc_surplus_huge_page() and fail to convert
	 * a surplus page to a persistent one; that only means the pool ends
	 * up one page larger than strictly necessary, still within the
	 * limits set by the sysctls.
	 */
3330 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3331 if (!adjust_pool_surplus(h, nodes_allowed, -1))
3332 break;
3333 }
3334
3335 while (count > persistent_huge_pages(h)) {
3336
3337
3338
3339
3340
3341 spin_unlock_irq(&hugetlb_lock);
3342
3343
3344 cond_resched();
3345
3346 ret = alloc_pool_huge_page(h, nodes_allowed,
3347 node_alloc_noretry);
3348 spin_lock_irq(&hugetlb_lock);
3349 if (!ret)
3350 goto out;
3351
3352
3353 if (signal_pending(current))
3354 goto out;
3355 }
	/*
	 * Decrease the pool size.  First return free pages to the buddy
	 * allocator (keeping enough around to satisfy reservations), then
	 * move pages into surplus state as needed so the pool shrinks to
	 * the requested size as in-use pages are freed.
	 *
	 * Placing pages into surplus state regardless of the overcommit
	 * value can make the surplus pool exceed overcommit; that is
	 * tolerated because alloc_surplus_huge_page() checks the global
	 * counter and will not grow the pool further until the sysctls
	 * change or the surplus pages fall out of use.
	 */
3372 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3373 min_count = max(count, min_count);
3374 try_to_free_low(h, min_count, nodes_allowed);
3375
3376
3377
3378
3379 while (min_count < persistent_huge_pages(h)) {
3380 page = remove_pool_huge_page(h, nodes_allowed, 0);
3381 if (!page)
3382 break;
3383
3384 list_add(&page->lru, &page_list);
3385 }
3386
3387 spin_unlock_irq(&hugetlb_lock);
3388 update_and_free_pages_bulk(h, &page_list);
3389 flush_free_hpage_work(h);
3390 spin_lock_irq(&hugetlb_lock);
3391
3392 while (count < persistent_huge_pages(h)) {
3393 if (!adjust_pool_surplus(h, nodes_allowed, 1))
3394 break;
3395 }
3396out:
3397 h->max_huge_pages = persistent_huge_pages(h);
3398 spin_unlock_irq(&hugetlb_lock);
3399 mutex_unlock(&h->resize_lock);
3400
3401 NODEMASK_FREE(node_alloc_noretry);
3402
3403 return 0;
3404}
3405
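/*
 * Demote a free huge page of hstate @h into free pages of the smaller
 * h->demote_order hstate.  Called with hugetlb_lock held; the lock is
 * dropped and reacquired while the page is split and re-prepped.
 */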
3406static int demote_free_huge_page(struct hstate *h, struct page *page)
3407{
3408 int i, nid = page_to_nid(page);
3409 struct hstate *target_hstate;
3410 int rc = 0;
3411
3412 target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
3413
3414 remove_hugetlb_page_for_demote(h, page, false);
3415 spin_unlock_irq(&hugetlb_lock);
3416
3417 rc = alloc_huge_page_vmemmap(h, page);
3418 if (rc) {
3419
3420 spin_lock_irq(&hugetlb_lock);
3421 set_page_refcounted(page);
3422 add_hugetlb_page(h, page, false);
3423 return rc;
3424 }
3425
	/*
	 * Use destroy_compound_hugetlb_page_for_demote() for all sizes as
	 * it does not manipulate page reference counts.
	 */
3430 destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
3431
	/*
	 * Taking the target hstate's resize_lock synchronizes with
	 * set_max_huge_pages(); without it the pages added below could be
	 * marked surplus.  We already hold h->resize_lock, so to avoid
	 * deadlock always take the larger size hstate's mutex first.
	 */
3440 mutex_lock(&target_hstate->resize_lock);
3441 for (i = 0; i < pages_per_huge_page(h);
3442 i += pages_per_huge_page(target_hstate)) {
3443 if (hstate_is_gigantic(target_hstate))
3444 prep_compound_gigantic_page_for_demote(page + i,
3445 target_hstate->order);
3446 else
3447 prep_compound_page(page + i, target_hstate->order);
3448 set_page_private(page + i, 0);
3449 set_page_refcounted(page + i);
3450 prep_new_huge_page(target_hstate, page + i, nid);
3451 put_page(page + i);
3452 }
3453 mutex_unlock(&target_hstate->resize_lock);
3454
3455 spin_lock_irq(&hugetlb_lock);
3456
3457
3458
3459
3460
3461 h->max_huge_pages--;
3462 target_hstate->max_huge_pages += pages_per_huge_page(h);
3463
3464 return rc;
3465}
3466
3467static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
3468 __must_hold(&hugetlb_lock)
3469{
3470 int nr_nodes, node;
3471 struct page *page;
3472 int rc = 0;
3473
3474 lockdep_assert_held(&hugetlb_lock);
3475
3476
3477 if (!h->demote_order) {
3478 pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
3479 return -EINVAL;
3480 }
3481
3482 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
3483 if (!list_empty(&h->hugepage_freelists[node])) {
3484 page = list_entry(h->hugepage_freelists[node].next,
3485 struct page, lru);
3486 rc = demote_free_huge_page(h, page);
3487 break;
3488 }
3489 }
3490
3491 return rc;
3492}
3493
3494#define HSTATE_ATTR_RO(_name) \
3495 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
3496
3497#define HSTATE_ATTR_WO(_name) \
3498 static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
3499
3500#define HSTATE_ATTR(_name) \
3501 static struct kobj_attribute _name##_attr = \
3502 __ATTR(_name, 0644, _name##_show, _name##_store)
3503
3504static struct kobject *hugepages_kobj;
3505static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3506
3507static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
3508
3509static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
3510{
3511 int i;
3512
3513 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3514 if (hstate_kobjs[i] == kobj) {
3515 if (nidp)
3516 *nidp = NUMA_NO_NODE;
3517 return &hstates[i];
3518 }
3519
3520 return kobj_to_node_hstate(kobj, nidp);
3521}
3522
3523static ssize_t nr_hugepages_show_common(struct kobject *kobj,
3524 struct kobj_attribute *attr, char *buf)
3525{
3526 struct hstate *h;
3527 unsigned long nr_huge_pages;
3528 int nid;
3529
3530 h = kobj_to_hstate(kobj, &nid);
3531 if (nid == NUMA_NO_NODE)
3532 nr_huge_pages = h->nr_huge_pages;
3533 else
3534 nr_huge_pages = h->nr_huge_pages_node[nid];
3535
3536 return sysfs_emit(buf, "%lu\n", nr_huge_pages);
3537}
3538
3539static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
3540 struct hstate *h, int nid,
3541 unsigned long count, size_t len)
3542{
3543 int err;
3544 nodemask_t nodes_allowed, *n_mask;
3545
3546 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3547 return -EINVAL;
3548
3549 if (nid == NUMA_NO_NODE) {
3550
3551
3552
3553 if (!(obey_mempolicy &&
3554 init_nodemask_of_mempolicy(&nodes_allowed)))
3555 n_mask = &node_states[N_MEMORY];
3556 else
3557 n_mask = &nodes_allowed;
3558 } else {
		/*
		 * Node specific request: build a nodemask containing only
		 * this node.  The count adjustment itself happens in
		 * set_max_huge_pages() after taking hugetlb_lock.
		 */
3563 init_nodemask_of_node(&nodes_allowed, nid);
3564 n_mask = &nodes_allowed;
3565 }
3566
3567 err = set_max_huge_pages(h, count, nid, n_mask);
3568
3569 return err ? err : len;
3570}
3571
3572static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
3573 struct kobject *kobj, const char *buf,
3574 size_t len)
3575{
3576 struct hstate *h;
3577 unsigned long count;
3578 int nid;
3579 int err;
3580
3581 err = kstrtoul(buf, 10, &count);
3582 if (err)
3583 return err;
3584
3585 h = kobj_to_hstate(kobj, &nid);
3586 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
3587}
3588
3589static ssize_t nr_hugepages_show(struct kobject *kobj,
3590 struct kobj_attribute *attr, char *buf)
3591{
3592 return nr_hugepages_show_common(kobj, attr, buf);
3593}
3594
3595static ssize_t nr_hugepages_store(struct kobject *kobj,
3596 struct kobj_attribute *attr, const char *buf, size_t len)
3597{
3598 return nr_hugepages_store_common(false, kobj, buf, len);
3599}
3600HSTATE_ATTR(nr_hugepages);
3601
3602#ifdef CONFIG_NUMA
3603
3604
3605
3606
3607
3608static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
3609 struct kobj_attribute *attr,
3610 char *buf)
3611{
3612 return nr_hugepages_show_common(kobj, attr, buf);
3613}
3614
3615static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
3616 struct kobj_attribute *attr, const char *buf, size_t len)
3617{
3618 return nr_hugepages_store_common(true, kobj, buf, len);
3619}
3620HSTATE_ATTR(nr_hugepages_mempolicy);
3621#endif
3622
3623
3624static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
3625 struct kobj_attribute *attr, char *buf)
3626{
3627 struct hstate *h = kobj_to_hstate(kobj, NULL);
3628 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
3629}
3630
3631static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
3632 struct kobj_attribute *attr, const char *buf, size_t count)
3633{
3634 int err;
3635 unsigned long input;
3636 struct hstate *h = kobj_to_hstate(kobj, NULL);
3637
3638 if (hstate_is_gigantic(h))
3639 return -EINVAL;
3640
3641 err = kstrtoul(buf, 10, &input);
3642 if (err)
3643 return err;
3644
3645 spin_lock_irq(&hugetlb_lock);
3646 h->nr_overcommit_huge_pages = input;
3647 spin_unlock_irq(&hugetlb_lock);
3648
3649 return count;
3650}
3651HSTATE_ATTR(nr_overcommit_hugepages);
3652
3653static ssize_t free_hugepages_show(struct kobject *kobj,
3654 struct kobj_attribute *attr, char *buf)
3655{
3656 struct hstate *h;
3657 unsigned long free_huge_pages;
3658 int nid;
3659
3660 h = kobj_to_hstate(kobj, &nid);
3661 if (nid == NUMA_NO_NODE)
3662 free_huge_pages = h->free_huge_pages;
3663 else
3664 free_huge_pages = h->free_huge_pages_node[nid];
3665
3666 return sysfs_emit(buf, "%lu\n", free_huge_pages);
3667}
3668HSTATE_ATTR_RO(free_hugepages);
3669
3670static ssize_t resv_hugepages_show(struct kobject *kobj,
3671 struct kobj_attribute *attr, char *buf)
3672{
3673 struct hstate *h = kobj_to_hstate(kobj, NULL);
3674 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
3675}
3676HSTATE_ATTR_RO(resv_hugepages);
3677
3678static ssize_t surplus_hugepages_show(struct kobject *kobj,
3679 struct kobj_attribute *attr, char *buf)
3680{
3681 struct hstate *h;
3682 unsigned long surplus_huge_pages;
3683 int nid;
3684
3685 h = kobj_to_hstate(kobj, &nid);
3686 if (nid == NUMA_NO_NODE)
3687 surplus_huge_pages = h->surplus_huge_pages;
3688 else
3689 surplus_huge_pages = h->surplus_huge_pages_node[nid];
3690
3691 return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
3692}
3693HSTATE_ATTR_RO(surplus_hugepages);
3694
3695static ssize_t demote_store(struct kobject *kobj,
3696 struct kobj_attribute *attr, const char *buf, size_t len)
3697{
3698 unsigned long nr_demote;
3699 unsigned long nr_available;
3700 nodemask_t nodes_allowed, *n_mask;
3701 struct hstate *h;
3702 int err = 0;
3703 int nid;
3704
3705 err = kstrtoul(buf, 10, &nr_demote);
3706 if (err)
3707 return err;
3708 h = kobj_to_hstate(kobj, &nid);
3709
3710 if (nid != NUMA_NO_NODE) {
3711 init_nodemask_of_node(&nodes_allowed, nid);
3712 n_mask = &nodes_allowed;
3713 } else {
3714 n_mask = &node_states[N_MEMORY];
3715 }
3716
3717
3718 mutex_lock(&h->resize_lock);
3719 spin_lock_irq(&hugetlb_lock);
3720
3721 while (nr_demote) {
3722
3723
3724
3725
3726 if (nid != NUMA_NO_NODE)
3727 nr_available = h->free_huge_pages_node[nid];
3728 else
3729 nr_available = h->free_huge_pages;
3730 nr_available -= h->resv_huge_pages;
3731 if (!nr_available)
3732 break;
3733
3734 err = demote_pool_huge_page(h, n_mask);
3735 if (err)
3736 break;
3737
3738 nr_demote--;
3739 }
3740
3741 spin_unlock_irq(&hugetlb_lock);
3742 mutex_unlock(&h->resize_lock);
3743
3744 if (err)
3745 return err;
3746 return len;
3747}
3748HSTATE_ATTR_WO(demote);
3749
3750static ssize_t demote_size_show(struct kobject *kobj,
3751 struct kobj_attribute *attr, char *buf)
3752{
3753 int nid;
3754 struct hstate *h = kobj_to_hstate(kobj, &nid);
3755 unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
3756
3757 return sysfs_emit(buf, "%lukB\n", demote_size);
3758}
3759
3760static ssize_t demote_size_store(struct kobject *kobj,
3761 struct kobj_attribute *attr,
3762 const char *buf, size_t count)
3763{
3764 struct hstate *h, *demote_hstate;
3765 unsigned long demote_size;
3766 unsigned int demote_order;
3767 int nid;
3768
3769 demote_size = (unsigned long)memparse(buf, NULL);
3770
3771 demote_hstate = size_to_hstate(demote_size);
3772 if (!demote_hstate)
3773 return -EINVAL;
3774 demote_order = demote_hstate->order;
3775 if (demote_order < HUGETLB_PAGE_ORDER)
3776 return -EINVAL;
3777
3778
3779 h = kobj_to_hstate(kobj, &nid);
3780 if (demote_order >= h->order)
3781 return -EINVAL;
3782
3783
3784 mutex_lock(&h->resize_lock);
3785 h->demote_order = demote_order;
3786 mutex_unlock(&h->resize_lock);
3787
3788 return count;
3789}
3790HSTATE_ATTR(demote_size);
3791
3792static struct attribute *hstate_attrs[] = {
3793 &nr_hugepages_attr.attr,
3794 &nr_overcommit_hugepages_attr.attr,
3795 &free_hugepages_attr.attr,
3796 &resv_hugepages_attr.attr,
3797 &surplus_hugepages_attr.attr,
3798#ifdef CONFIG_NUMA
3799 &nr_hugepages_mempolicy_attr.attr,
3800#endif
3801 NULL,
3802};
3803
3804static const struct attribute_group hstate_attr_group = {
3805 .attrs = hstate_attrs,
3806};
3807
3808static struct attribute *hstate_demote_attrs[] = {
3809 &demote_size_attr.attr,
3810 &demote_attr.attr,
3811 NULL,
3812};
3813
3814static const struct attribute_group hstate_demote_attr_group = {
3815 .attrs = hstate_demote_attrs,
3816};
3817
3818static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
3819 struct kobject **hstate_kobjs,
3820 const struct attribute_group *hstate_attr_group)
3821{
3822 int retval;
3823 int hi = hstate_index(h);
3824
3825 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
3826 if (!hstate_kobjs[hi])
3827 return -ENOMEM;
3828
3829 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
3830 if (retval) {
3831 kobject_put(hstate_kobjs[hi]);
3832 hstate_kobjs[hi] = NULL;
3833 }
3834
3835 if (h->demote_order) {
3836 if (sysfs_create_group(hstate_kobjs[hi],
3837 &hstate_demote_attr_group))
3838 pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
3839 }
3840
3841 return retval;
3842}
3843
3844static void __init hugetlb_sysfs_init(void)
3845{
3846 struct hstate *h;
3847 int err;
3848
3849 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
3850 if (!hugepages_kobj)
3851 return;
3852
3853 for_each_hstate(h) {
3854 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
3855 hstate_kobjs, &hstate_attr_group);
3856 if (err)
3857 pr_err("HugeTLB: Unable to add hstate %s", h->name);
3858 }
3859}
3860
3861#ifdef CONFIG_NUMA

/*
 * node_hstates[] associates per node hstate attribute kobjects with the
 * node devices in node_devices[] using a parallel array; the array index
 * of a node device or node_hstate is the node id.  This avoids a static
 * dependency of the node device driver on the hugetlb module.
 */
3870struct node_hstate {
3871 struct kobject *hugepages_kobj;
3872 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3873};
3874static struct node_hstate node_hstates[MAX_NUMNODES];
3875
3876
3877
3878
3879static struct attribute *per_node_hstate_attrs[] = {
3880 &nr_hugepages_attr.attr,
3881 &free_hugepages_attr.attr,
3882 &surplus_hugepages_attr.attr,
3883 NULL,
3884};
3885
3886static const struct attribute_group per_node_hstate_attr_group = {
3887 .attrs = per_node_hstate_attrs,
3888};
3889
3890
3891
3892
3893
3894static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3895{
3896 int nid;
3897
3898 for (nid = 0; nid < nr_node_ids; nid++) {
3899 struct node_hstate *nhs = &node_hstates[nid];
3900 int i;
3901 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3902 if (nhs->hstate_kobjs[i] == kobj) {
3903 if (nidp)
3904 *nidp = nid;
3905 return &hstates[i];
3906 }
3907 }
3908
3909 BUG();
3910 return NULL;
3911}
3912
3913
3914
3915
3916
3917static void hugetlb_unregister_node(struct node *node)
3918{
3919 struct hstate *h;
3920 struct node_hstate *nhs = &node_hstates[node->dev.id];
3921
3922 if (!nhs->hugepages_kobj)
3923 return;
3924
3925 for_each_hstate(h) {
3926 int idx = hstate_index(h);
3927 if (nhs->hstate_kobjs[idx]) {
3928 kobject_put(nhs->hstate_kobjs[idx]);
3929 nhs->hstate_kobjs[idx] = NULL;
3930 }
3931 }
3932
3933 kobject_put(nhs->hugepages_kobj);
3934 nhs->hugepages_kobj = NULL;
3935}
3936
3937
3938
3939
3940
3941
3942static void hugetlb_register_node(struct node *node)
3943{
3944 struct hstate *h;
3945 struct node_hstate *nhs = &node_hstates[node->dev.id];
3946 int err;
3947
3948 if (nhs->hugepages_kobj)
3949 return;
3950
3951 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
3952 &node->dev.kobj);
3953 if (!nhs->hugepages_kobj)
3954 return;
3955
3956 for_each_hstate(h) {
3957 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
3958 nhs->hstate_kobjs,
3959 &per_node_hstate_attr_group);
3960 if (err) {
3961 pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
3962 h->name, node->dev.id);
3963 hugetlb_unregister_node(node);
3964 break;
3965 }
3966 }
3967}
3968

/*
 * At hugetlb init time, register hstate attributes for all registered
 * node devices of nodes that have memory.  All on-line nodes should have
 * registered their associated device by this time.
 */
3974static void __init hugetlb_register_all_nodes(void)
3975{
3976 int nid;
3977
3978 for_each_node_state(nid, N_MEMORY) {
3979 struct node *node = node_devices[nid];
3980 if (node->dev.id == nid)
3981 hugetlb_register_node(node);
3982 }
3983
3984
3985
3986
3987
3988 register_hugetlbfs_with_node(hugetlb_register_node,
3989 hugetlb_unregister_node);
3990}
3991#else
3992
3993static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3994{
3995 BUG();
3996 if (nidp)
3997 *nidp = -1;
3998 return NULL;
3999}
4000
4001static void hugetlb_register_all_nodes(void) { }
4002
4003#endif
4004
4005static int __init hugetlb_init(void)
4006{
4007 int i;
4008
4009 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
4010 __NR_HPAGEFLAGS);
4011
4012 if (!hugepages_supported()) {
4013 if (hugetlb_max_hstate || default_hstate_max_huge_pages)
4014 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4015 return 0;
4016 }
4017
	/*
	 * Make sure an hstate for HPAGE_SIZE (HUGETLB_PAGE_ORDER) exists.
	 * Some architectures depend on this being set up here.
	 */
4022 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
4023 if (!parsed_default_hugepagesz) {
		/*
		 * No default huge page size was parsed on the command line,
		 * so point default_hstate_idx at the HPAGE_SIZE hstate and,
		 * if a bare hugepages= count was given, apply it to that
		 * hstate along with any per-node counts.
		 */
4032 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
4033 if (default_hstate_max_huge_pages) {
4034 if (default_hstate.max_huge_pages) {
4035 char buf[32];
4036
4037 string_get_size(huge_page_size(&default_hstate),
4038 1, STRING_UNITS_2, buf, 32);
4039 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
4040 default_hstate.max_huge_pages, buf);
4041 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
4042 default_hstate_max_huge_pages);
4043 }
4044 default_hstate.max_huge_pages =
4045 default_hstate_max_huge_pages;
4046
4047 for (i = 0; i < nr_online_nodes; i++)
4048 default_hstate.max_huge_pages_node[i] =
4049 default_hugepages_in_node[i];
4050 }
4051 }
4052
4053 hugetlb_cma_check();
4054 hugetlb_init_hstates();
4055 gather_bootmem_prealloc();
4056 report_hugepages();
4057
4058 hugetlb_sysfs_init();
4059 hugetlb_register_all_nodes();
4060 hugetlb_cgroup_file_init();
4061
4062#ifdef CONFIG_SMP
4063 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
4064#else
4065 num_fault_mutexes = 1;
4066#endif
4067 hugetlb_fault_mutex_table =
4068 kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
4069 GFP_KERNEL);
4070 BUG_ON(!hugetlb_fault_mutex_table);
4071
4072 for (i = 0; i < num_fault_mutexes; i++)
4073 mutex_init(&hugetlb_fault_mutex_table[i]);
4074 return 0;
4075}
4076subsys_initcall(hugetlb_init);
4077
4078
4079bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
4080{
4081 return size == HPAGE_SIZE;
4082}
4083
4084void __init hugetlb_add_hstate(unsigned int order)
4085{
4086 struct hstate *h;
4087 unsigned long i;
4088
4089 if (size_to_hstate(PAGE_SIZE << order)) {
4090 return;
4091 }
4092 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
4093 BUG_ON(order == 0);
4094 h = &hstates[hugetlb_max_hstate++];
4095 mutex_init(&h->resize_lock);
4096 h->order = order;
4097 h->mask = ~(huge_page_size(h) - 1);
4098 for (i = 0; i < MAX_NUMNODES; ++i)
4099 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4100 INIT_LIST_HEAD(&h->hugepage_activelist);
4101 h->next_nid_to_alloc = first_memory_node;
4102 h->next_nid_to_free = first_memory_node;
4103 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4104 huge_page_size(h)/1024);
4105 hugetlb_vmemmap_init(h);
4106
4107 parsed_hstate = h;
4108}
4109
4110bool __init __weak hugetlb_node_alloc_supported(void)
4111{
4112 return true;
4113}
4114
/*
 * hugepages command line processing.
 * hugepages= normally follows a valid hugepagesz= or default_hugepagesz=
 * specification; if not, the value is ignored.  hugepages= may also be the
 * first hugetlb option on the command line, in which case it implicitly
 * sets the number of huge pages for the default size.
 */
4121static int __init hugepages_setup(char *s)
4122{
4123 unsigned long *mhp;
4124 static unsigned long *last_mhp;
4125 int node = NUMA_NO_NODE;
4126 int count;
4127 unsigned long tmp;
4128 char *p = s;
4129
4130 if (!parsed_valid_hugepagesz) {
4131 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
4132 parsed_valid_hugepagesz = true;
4133 return 0;
4134 }
4135
	/*
	 * !hugetlb_max_hstate means no hugepagesz= has been parsed yet, so
	 * this hugepages= value applies to the default hstate.
	 */
4142 else if (!hugetlb_max_hstate)
4143 mhp = &default_hstate_max_huge_pages;
4144 else
4145 mhp = &parsed_hstate->max_huge_pages;
4146
4147 if (mhp == last_mhp) {
4148 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
4149 return 0;
4150 }
4151
4152 while (*p) {
4153 count = 0;
4154 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4155 goto invalid;
4156
4157 if (p[count] == ':') {
4158 if (!hugetlb_node_alloc_supported()) {
4159 pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
4160 return 0;
4161 }
4162 if (tmp >= nr_online_nodes)
4163 goto invalid;
4164 node = tmp;
4165 p += count + 1;
4166
4167 if (sscanf(p, "%lu%n", &tmp, &count) != 1)
4168 goto invalid;
4169 if (!hugetlb_max_hstate)
4170 default_hugepages_in_node[node] = tmp;
4171 else
4172 parsed_hstate->max_huge_pages_node[node] = tmp;
4173 *mhp += tmp;
4174
4175 if (p[count] == ',')
4176 p += count + 1;
4177 else
4178 break;
4179 } else {
4180 if (p != s)
4181 goto invalid;
4182 *mhp = tmp;
4183 break;
4184 }
4185 }
4186
	/*
	 * Global state is initialized later in hugetlb_init(), but gigantic
	 * hstates must allocate their pages here, early enough to still use
	 * the bootmem allocator.
	 */
4192 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
4193 hugetlb_hstate_alloc_pages(parsed_hstate);
4194
4195 last_mhp = mhp;
4196
4197 return 1;
4198
4199invalid:
4200 pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
4201 return 0;
4202}
4203__setup("hugepages=", hugepages_setup);
4204
/*
 * hugepagesz command line processing.
 * A specific huge page size may only be specified once with hugepagesz=,
 * and is normally followed by hugepages=.  parsed_valid_hugepagesz records
 * whether a subsequent hugepages= parameter is associated with a valid
 * hugepagesz= parameter.
 */
4212static int __init hugepagesz_setup(char *s)
4213{
4214 unsigned long size;
4215 struct hstate *h;
4216
4217 parsed_valid_hugepagesz = false;
4218 size = (unsigned long)memparse(s, NULL);
4219
4220 if (!arch_hugetlb_valid_size(size)) {
4221 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
4222 return 0;
4223 }
4224
4225 h = size_to_hstate(size);
4226 if (h) {
		/*
		 * An hstate for this size already exists.  That is normally
		 * an error, but is allowed when the existing hstate is the
		 * default hstate and no huge page count has been set for it
		 * yet.
		 */
4234 if (!parsed_default_hugepagesz || h != &default_hstate ||
4235 default_hstate.max_huge_pages) {
4236 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
4237 return 0;
4238 }
4239
4240
4241
4242
4243
4244
4245 parsed_hstate = h;
4246 parsed_valid_hugepagesz = true;
4247 return 1;
4248 }
4249
4250 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4251 parsed_valid_hugepagesz = true;
4252 return 1;
4253}
4254__setup("hugepagesz=", hugepagesz_setup);
4255
/*
 * default_hugepagesz command line input.
 * Only one instance of default_hugepagesz is allowed on the command line.
 */
4260static int __init default_hugepagesz_setup(char *s)
4261{
4262 unsigned long size;
4263 int i;
4264
4265 parsed_valid_hugepagesz = false;
4266 if (parsed_default_hugepagesz) {
4267 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
4268 return 0;
4269 }
4270
4271 size = (unsigned long)memparse(s, NULL);
4272
4273 if (!arch_hugetlb_valid_size(size)) {
4274 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
4275 return 0;
4276 }
4277
4278 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4279 parsed_valid_hugepagesz = true;
4280 parsed_default_hugepagesz = true;
4281 default_hstate_idx = hstate_index(size_to_hstate(size));
4282
4283
4284
4285
4286
4287
4288
4289
4290 if (default_hstate_max_huge_pages) {
4291 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
4292 for (i = 0; i < nr_online_nodes; i++)
4293 default_hstate.max_huge_pages_node[i] =
4294 default_hugepages_in_node[i];
4295 if (hstate_is_gigantic(&default_hstate))
4296 hugetlb_hstate_alloc_pages(&default_hstate);
4297 default_hstate_max_huge_pages = 0;
4298 }
4299
4300 return 1;
4301}
4302__setup("default_hugepagesz=", default_hugepagesz_setup);
4303
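/*
 * Number of free huge pages on the nodes allowed by the current task's
 * cpuset and memory policy; used when accounting new reservations.
 */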
4304static unsigned int allowed_mems_nr(struct hstate *h)
4305{
4306 int node;
4307 unsigned int nr = 0;
4308 nodemask_t *mpol_allowed;
4309 unsigned int *array = h->free_huge_pages_node;
4310 gfp_t gfp_mask = htlb_alloc_mask(h);
4311
4312 mpol_allowed = policy_nodemask_current(gfp_mask);
4313
4314 for_each_node_mask(node, cpuset_current_mems_allowed) {
4315 if (!mpol_allowed || node_isset(node, *mpol_allowed))
4316 nr += array[node];
4317 }
4318
4319 return nr;
4320}
4321
4322#ifdef CONFIG_SYSCTL
4323static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
4324 void *buffer, size_t *length,
4325 loff_t *ppos, unsigned long *out)
4326{
4327 struct ctl_table dup_table;
4328
4329
4330
4331
4332
4333 dup_table = *table;
4334 dup_table.data = out;
4335
4336 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
4337}
4338
4339static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
4340 struct ctl_table *table, int write,
4341 void *buffer, size_t *length, loff_t *ppos)
4342{
4343 struct hstate *h = &default_hstate;
4344 unsigned long tmp = h->max_huge_pages;
4345 int ret;
4346
4347 if (!hugepages_supported())
4348 return -EOPNOTSUPP;
4349
4350 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4351 &tmp);
4352 if (ret)
4353 goto out;
4354
4355 if (write)
4356 ret = __nr_hugepages_store_common(obey_mempolicy, h,
4357 NUMA_NO_NODE, tmp, *length);
4358out:
4359 return ret;
4360}
4361
4362int hugetlb_sysctl_handler(struct ctl_table *table, int write,
4363 void *buffer, size_t *length, loff_t *ppos)
4364{
4365
4366 return hugetlb_sysctl_handler_common(false, table, write,
4367 buffer, length, ppos);
4368}
4369
4370#ifdef CONFIG_NUMA
4371int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
4372 void *buffer, size_t *length, loff_t *ppos)
4373{
4374 return hugetlb_sysctl_handler_common(true, table, write,
4375 buffer, length, ppos);
4376}
4377#endif
4378
4379int hugetlb_overcommit_handler(struct ctl_table *table, int write,
4380 void *buffer, size_t *length, loff_t *ppos)
4381{
4382 struct hstate *h = &default_hstate;
4383 unsigned long tmp;
4384 int ret;
4385
4386 if (!hugepages_supported())
4387 return -EOPNOTSUPP;
4388
4389 tmp = h->nr_overcommit_huge_pages;
4390
4391 if (write && hstate_is_gigantic(h))
4392 return -EINVAL;
4393
4394 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
4395 &tmp);
4396 if (ret)
4397 goto out;
4398
4399 if (write) {
4400 spin_lock_irq(&hugetlb_lock);
4401 h->nr_overcommit_huge_pages = tmp;
4402 spin_unlock_irq(&hugetlb_lock);
4403 }
4404out:
4405 return ret;
4406}
4407
4408#endif
4409
4410void hugetlb_report_meminfo(struct seq_file *m)
4411{
4412 struct hstate *h;
4413 unsigned long total = 0;
4414
4415 if (!hugepages_supported())
4416 return;
4417
4418 for_each_hstate(h) {
4419 unsigned long count = h->nr_huge_pages;
4420
4421 total += huge_page_size(h) * count;
4422
4423 if (h == &default_hstate)
4424 seq_printf(m,
4425 "HugePages_Total: %5lu\n"
4426 "HugePages_Free: %5lu\n"
4427 "HugePages_Rsvd: %5lu\n"
4428 "HugePages_Surp: %5lu\n"
4429 "Hugepagesize: %8lu kB\n",
4430 count,
4431 h->free_huge_pages,
4432 h->resv_huge_pages,
4433 h->surplus_huge_pages,
4434 huge_page_size(h) / SZ_1K);
4435 }
4436
4437 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
4438}
4439
4440int hugetlb_report_node_meminfo(char *buf, int len, int nid)
4441{
4442 struct hstate *h = &default_hstate;
4443
4444 if (!hugepages_supported())
4445 return 0;
4446
4447 return sysfs_emit_at(buf, len,
4448 "Node %d HugePages_Total: %5u\n"
4449 "Node %d HugePages_Free: %5u\n"
4450 "Node %d HugePages_Surp: %5u\n",
4451 nid, h->nr_huge_pages_node[nid],
4452 nid, h->free_huge_pages_node[nid],
4453 nid, h->surplus_huge_pages_node[nid]);
4454}
4455
4456void hugetlb_show_meminfo(void)
4457{
4458 struct hstate *h;
4459 int nid;
4460
4461 if (!hugepages_supported())
4462 return;
4463
4464 for_each_node_state(nid, N_MEMORY)
4465 for_each_hstate(h)
4466 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
4467 nid,
4468 h->nr_huge_pages_node[nid],
4469 h->free_huge_pages_node[nid],
4470 h->surplus_huge_pages_node[nid],
4471 huge_page_size(h) / SZ_1K);
4472}
4473
4474void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
4475{
4476 seq_printf(m, "HugetlbPages:\t%8lu kB\n",
4477 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
4478}
4479
4480
4481unsigned long hugetlb_total_pages(void)
4482{
4483 struct hstate *h;
4484 unsigned long nr_total_pages = 0;
4485
4486 for_each_hstate(h)
4487 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
4488 return nr_total_pages;
4489}
4490
4491static int hugetlb_acct_memory(struct hstate *h, long delta)
4492{
4493 int ret = -ENOMEM;
4494
4495 if (!delta)
4496 return 0;
4497
4498 spin_lock_irq(&hugetlb_lock);
	/*
	 * When cpuset is configured it weakens strict hugetlb reservation
	 * accounting, which is done against a global counter: the
	 * reservation is never checked against page availability in the
	 * current cpuset, so a task can still be OOM killed for lack of
	 * free huge pages in its cpuset.  Enforcing strict accounting per
	 * cpuset is impractical because tasks and memory nodes can move
	 * between cpusets at any time.
	 *
	 * Rather than change the semantics of shared hugetlb mappings, fall
	 * back to a best effort check against the pages currently free on
	 * the nodes allowed by the task's cpuset and memory policy
	 * (allowed_mems_nr() below).
	 */
4522 if (delta > 0) {
4523 if (gather_surplus_pages(h, delta) < 0)
4524 goto out;
4525
4526 if (delta > allowed_mems_nr(h)) {
4527 return_unused_surplus_pages(h, delta);
4528 goto out;
4529 }
4530 }
4531
4532 ret = 0;
4533 if (delta < 0)
4534 return_unused_surplus_pages(h, (unsigned long) -delta);
4535
4536out:
4537 spin_unlock_irq(&hugetlb_lock);
4538 return ret;
4539}
4540
4541static void hugetlb_vm_op_open(struct vm_area_struct *vma)
4542{
4543 struct resv_map *resv = vma_resv_map(vma);
	/*
	 * A new VMA shares its sibling's reservation map when present.  A
	 * valid reservation map pointer only exists while the VMA is being
	 * copied from a still existing VMA, and that VMA's reference keeps
	 * the map alive until this open call completes, so taking another
	 * reference here needs no extra locking.
	 */
4553 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
4554 resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
4555 kref_get(&resv->refs);
4556 }
4557}
4558
4559static void hugetlb_vm_op_close(struct vm_area_struct *vma)
4560{
4561 struct hstate *h = hstate_vma(vma);
4562 struct resv_map *resv = vma_resv_map(vma);
4563 struct hugepage_subpool *spool = subpool_vma(vma);
4564 unsigned long reserve, start, end;
4565 long gbl_reserve;
4566
4567 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4568 return;
4569
4570 start = vma_hugecache_offset(h, vma, vma->vm_start);
4571 end = vma_hugecache_offset(h, vma, vma->vm_end);
4572
4573 reserve = (end - start) - region_count(resv, start, end);
4574 hugetlb_cgroup_uncharge_counter(resv, start, end);
4575 if (reserve) {
4576
4577
4578
4579
4580 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
4581 hugetlb_acct_memory(h, -gbl_reserve);
4582 }
4583
4584 kref_put(&resv->refs, resv_map_release);
4585}
4586
4587static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
4588{
4589 if (addr & ~(huge_page_mask(hstate_vma(vma))))
4590 return -EINVAL;
4591 return 0;
4592}
4593
4594static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
4595{
4596 return huge_page_size(hstate_vma(vma));
4597}
4598
/*
 * Page faults against hugetlb pages cannot be handled here at all; they
 * would make handle_mm_fault() try to instantiate regular-sized pages in
 * the huge page VMA.  do_page_fault() is supposed to trap this, so BUG if
 * we get this far.
 */
4605static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
4606{
4607 BUG();
4608 return 0;
4609}
4610
4611
4612
4613
4614
4615
4616
4617
4618const struct vm_operations_struct hugetlb_vm_ops = {
4619 .fault = hugetlb_vm_op_fault,
4620 .open = hugetlb_vm_op_open,
4621 .close = hugetlb_vm_op_close,
4622 .may_split = hugetlb_vm_op_split,
4623 .pagesize = hugetlb_vm_op_pagesize,
4624};
4625
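/*
 * Build the huge pte for mapping @page into @vma, with write permission
 * set according to @writable and any arch specific encoding applied.
 */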
4626static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
4627 int writable)
4628{
4629 pte_t entry;
4630 unsigned int shift = huge_page_shift(hstate_vma(vma));
4631
4632 if (writable) {
4633 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
4634 vma->vm_page_prot)));
4635 } else {
4636 entry = huge_pte_wrprotect(mk_huge_pte(page,
4637 vma->vm_page_prot));
4638 }
4639 entry = pte_mkyoung(entry);
4640 entry = pte_mkhuge(entry);
4641 entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
4642
4643 return entry;
4644}
4645
4646static void set_huge_ptep_writable(struct vm_area_struct *vma,
4647 unsigned long address, pte_t *ptep)
4648{
4649 pte_t entry;
4650
4651 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
4652 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
4653 update_mmu_cache(vma, address, ptep);
4654}
4655
4656bool is_hugetlb_entry_migration(pte_t pte)
4657{
4658 swp_entry_t swp;
4659
4660 if (huge_pte_none(pte) || pte_present(pte))
4661 return false;
4662 swp = pte_to_swp_entry(pte);
4663 if (is_migration_entry(swp))
4664 return true;
4665 else
4666 return false;
4667}
4668
4669static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
4670{
4671 swp_entry_t swp;
4672
4673 if (huge_pte_none(pte) || pte_present(pte))
4674 return false;
4675 swp = pte_to_swp_entry(pte);
4676 if (is_hwpoison_entry(swp))
4677 return true;
4678 else
4679 return false;
4680}
4681
4682static void
4683hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
4684 struct page *new_page)
4685{
4686 __SetPageUptodate(new_page);
4687 hugepage_add_new_anon_rmap(new_page, vma, addr);
4688 set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
4689 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
4690 ClearHPageRestoreReserve(new_page);
4691 SetHPageMigratable(new_page);
4692}
4693
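/*
 * Copy the hugetlb page range of @vma from the parent mm @src to the child
 * mm @dst at fork time.  Shared page tables are reused, migration and
 * hwpoison entries are copied as-is, pinned pages are copied eagerly, and
 * for COW mappings both parent and child ptes end up write protected.
 */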
4694int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
4695 struct vm_area_struct *vma)
4696{
4697 pte_t *src_pte, *dst_pte, entry, dst_entry;
4698 struct page *ptepage;
4699 unsigned long addr;
4700 bool cow = is_cow_mapping(vma->vm_flags);
4701 struct hstate *h = hstate_vma(vma);
4702 unsigned long sz = huge_page_size(h);
4703 unsigned long npages = pages_per_huge_page(h);
4704 struct address_space *mapping = vma->vm_file->f_mapping;
4705 struct mmu_notifier_range range;
4706 int ret = 0;
4707
4708 if (cow) {
4709 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
4710 vma->vm_start,
4711 vma->vm_end);
4712 mmu_notifier_invalidate_range_start(&range);
4713 } else {
		/*
		 * For shared mappings, i_mmap_rwsem must be held while
		 * calling huge_pte_alloc; otherwise the returned ptep could
		 * go away if it is part of a shared pmd and another thread
		 * calls huge_pmd_unshare.
		 */
4720 i_mmap_lock_read(mapping);
4721 }
4722
4723 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
4724 spinlock_t *src_ptl, *dst_ptl;
4725 src_pte = huge_pte_offset(src, addr, sz);
4726 if (!src_pte)
4727 continue;
4728 dst_pte = huge_pte_alloc(dst, vma, addr, sz);
4729 if (!dst_pte) {
4730 ret = -ENOMEM;
4731 break;
4732 }
4733
		/*
		 * If the page tables are shared, do not copy or take
		 * references.  dst_pte == src_pte is the common sharing
		 * case, but src may have unshared while dst still shares
		 * with another vma; a non-none dst_entry also implies
		 * sharing.  Check before taking the page table locks, and
		 * again after taking them below.
		 */
4743 dst_entry = huge_ptep_get(dst_pte);
4744 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
4745 continue;
4746
4747 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4748 src_ptl = huge_pte_lockptr(h, src, src_pte);
4749 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4750 entry = huge_ptep_get(src_pte);
4751 dst_entry = huge_ptep_get(dst_pte);
4752again:
4753 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
4754
4755
4756
4757
4758
4759 ;
4760 } else if (unlikely(is_hugetlb_entry_migration(entry) ||
4761 is_hugetlb_entry_hwpoisoned(entry))) {
4762 swp_entry_t swp_entry = pte_to_swp_entry(entry);
4763
4764 if (is_writable_migration_entry(swp_entry) && cow) {
4765
4766
4767
4768
4769 swp_entry = make_readable_migration_entry(
4770 swp_offset(swp_entry));
4771 entry = swp_entry_to_pte(swp_entry);
4772 set_huge_swap_pte_at(src, addr, src_pte,
4773 entry, sz);
4774 }
4775 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
4776 } else {
4777 entry = huge_ptep_get(src_pte);
4778 ptepage = pte_page(entry);
4779 get_page(ptepage);
			/*
			 * Rare case: the hugetlb page is pinned while still
			 * subject to COW, so the copy must happen eagerly,
			 * during fork.  Pre-allocating the new page and
			 * copying the data must be done without the page
			 * table locks held since both may sleep.
			 */
4790 if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
4791 pte_t src_pte_old = entry;
4792 struct page *new;
4793
4794 spin_unlock(src_ptl);
4795 spin_unlock(dst_ptl);
4796
4797 new = alloc_huge_page(vma, addr, 1);
4798 if (IS_ERR(new)) {
4799 put_page(ptepage);
4800 ret = PTR_ERR(new);
4801 break;
4802 }
4803 copy_user_huge_page(new, ptepage, addr, vma,
4804 npages);
4805 put_page(ptepage);
4806
4807
4808 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4809 src_ptl = huge_pte_lockptr(h, src, src_pte);
4810 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4811 entry = huge_ptep_get(src_pte);
4812 if (!pte_same(src_pte_old, entry)) {
4813 restore_reserve_on_error(h, vma, addr,
4814 new);
4815 put_page(new);
4816
4817 goto again;
4818 }
4819 hugetlb_install_page(vma, dst_pte, addr, new);
4820 spin_unlock(src_ptl);
4821 spin_unlock(dst_ptl);
4822 continue;
4823 }
4824
4825 if (cow) {
4826
4827
4828
4829
4830
4831
4832
4833 huge_ptep_set_wrprotect(src, addr, src_pte);
4834 entry = huge_pte_wrprotect(entry);
4835 }
4836
4837 page_dup_rmap(ptepage, true);
4838 set_huge_pte_at(dst, addr, dst_pte, entry);
4839 hugetlb_count_add(npages, dst);
4840 }
4841 spin_unlock(src_ptl);
4842 spin_unlock(dst_ptl);
4843 }
4844
4845 if (cow)
4846 mmu_notifier_invalidate_range_end(&range);
4847 else
4848 i_mmap_unlock_read(mapping);
4849
4850 return ret;
4851}
4852
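/*
 * Move a single huge pte from @old_addr to @new_addr for mremap(), taking
 * both page table locks when they differ.
 */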
4853static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
4854 unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
4855{
4856 struct hstate *h = hstate_vma(vma);
4857 struct mm_struct *mm = vma->vm_mm;
4858 spinlock_t *src_ptl, *dst_ptl;
4859 pte_t pte;
4860
4861 dst_ptl = huge_pte_lock(h, mm, dst_pte);
4862 src_ptl = huge_pte_lockptr(h, mm, src_pte);
4863
	/*
	 * No lock ordering concern between the src and dst ptlocks: the
	 * exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
	 */
4868 if (src_ptl != dst_ptl)
4869 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4870
4871 pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
4872 set_huge_pte_at(mm, new_addr, dst_pte, pte);
4873
4874 if (src_ptl != dst_ptl)
4875 spin_unlock(src_ptl);
4876 spin_unlock(dst_ptl);
4877}
4878
4879int move_hugetlb_page_tables(struct vm_area_struct *vma,
4880 struct vm_area_struct *new_vma,
4881 unsigned long old_addr, unsigned long new_addr,
4882 unsigned long len)
4883{
4884 struct hstate *h = hstate_vma(vma);
4885 struct address_space *mapping = vma->vm_file->f_mapping;
4886 unsigned long sz = huge_page_size(h);
4887 struct mm_struct *mm = vma->vm_mm;
4888 unsigned long old_end = old_addr + len;
4889 unsigned long old_addr_copy;
4890 pte_t *src_pte, *dst_pte;
4891 struct mmu_notifier_range range;
4892
4893 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
4894 old_end);
4895 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4896 mmu_notifier_invalidate_range_start(&range);
4897
4898 i_mmap_lock_write(mapping);
4899 for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
4900 src_pte = huge_pte_offset(mm, old_addr, sz);
4901 if (!src_pte)
4902 continue;
4903 if (huge_pte_none(huge_ptep_get(src_pte)))
4904 continue;
4905
4906
4907
4908
4909
4910 old_addr_copy = old_addr;
4911
4912 if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
4913 continue;
4914
4915 dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
4916 if (!dst_pte)
4917 break;
4918
4919 move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
4920 }
4921 flush_tlb_range(vma, old_end - len, old_end);
4922 mmu_notifier_invalidate_range_end(&range);
4923 i_mmap_unlock_write(mapping);
4924
4925 return len + old_addr - old_end;
4926}
4927
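/*
 * Unmap the hugetlb ptes in [start, end) of @vma.  If @ref_page is
 * non-NULL only that specific page is unmapped and the walk stops once it
 * has been found.
 */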
4928static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
4929 unsigned long start, unsigned long end,
4930 struct page *ref_page)
4931{
4932 struct mm_struct *mm = vma->vm_mm;
4933 unsigned long address;
4934 pte_t *ptep;
4935 pte_t pte;
4936 spinlock_t *ptl;
4937 struct page *page;
4938 struct hstate *h = hstate_vma(vma);
4939 unsigned long sz = huge_page_size(h);
4940 struct mmu_notifier_range range;
4941 bool force_flush = false;
4942
4943 WARN_ON(!is_vm_hugetlb_page(vma));
4944 BUG_ON(start & ~huge_page_mask(h));
4945 BUG_ON(end & ~huge_page_mask(h));
4946
	/*
	 * This is a hugetlb vma; every present pte entry should point to a
	 * huge page.
	 */
4951 tlb_change_page_size(tlb, sz);
4952 tlb_start_vma(tlb, vma);
4953
4954
4955
4956
4957 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
4958 end);
4959 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4960 mmu_notifier_invalidate_range_start(&range);
4961 address = start;
4962 for (; address < end; address += sz) {
4963 ptep = huge_pte_offset(mm, address, sz);
4964 if (!ptep)
4965 continue;
4966
4967 ptl = huge_pte_lock(h, mm, ptep);
4968 if (huge_pmd_unshare(mm, vma, &address, ptep)) {
4969 spin_unlock(ptl);
4970 tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
4971 force_flush = true;
4972 continue;
4973 }
4974
4975 pte = huge_ptep_get(ptep);
4976 if (huge_pte_none(pte)) {
4977 spin_unlock(ptl);
4978 continue;
4979 }
4980
4981
4982
4983
4984
4985 if (unlikely(!pte_present(pte))) {
4986 huge_pte_clear(mm, address, ptep, sz);
4987 spin_unlock(ptl);
4988 continue;
4989 }
4990
4991 page = pte_page(pte);
4992
4993
4994
4995
4996
4997 if (ref_page) {
4998 if (page != ref_page) {
4999 spin_unlock(ptl);
5000 continue;
5001 }
5002
5003
5004
5005
5006
5007 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
5008 }
5009
5010 pte = huge_ptep_get_and_clear(mm, address, ptep);
5011 tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
5012 if (huge_pte_dirty(pte))
5013 set_page_dirty(page);
5014
5015 hugetlb_count_sub(pages_per_huge_page(h), mm);
5016 page_remove_rmap(page, true);
5017
5018 spin_unlock(ptl);
5019 tlb_remove_page_size(tlb, page, huge_page_size(h));
5020
5021
5022
5023 if (ref_page)
5024 break;
5025 }
5026 mmu_notifier_invalidate_range_end(&range);
5027 tlb_end_vma(tlb, vma);
5028
	/*
	 * If we unshared PMDs, the TLB flush was not recorded in the
	 * mmu_gather.  We cannot defer it past this point: once i_mmap_rwsem
	 * is dropped, the last reference to the shared PMD page may be
	 * dropped as well.  Freeing the PMD page cannot be deferred either,
	 * because huge_pmd_unshare() relies on the exact page_count of the
	 * PMD page to detect sharing, so flush now.
	 */
5042 if (force_flush)
5043 tlb_flush_mmu_tlbonly(tlb);
5044}
5045
5046void __unmap_hugepage_range_final(struct mmu_gather *tlb,
5047 struct vm_area_struct *vma, unsigned long start,
5048 unsigned long end, struct page *ref_page)
5049{
5050 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
5051
	/*
	 * Clear VM_MAYSHARE so that x86's huge_pmd_share() page table
	 * sharing test fails on a vma being torn down and does not grab a
	 * page table on its way out.  It only needs to be cleared before
	 * i_mmap_rwsem is released; here the VMA is about to be destroyed
	 * and i_mmap_rwsem is held, so this is safe.
	 */
5062 vma->vm_flags &= ~VM_MAYSHARE;
5063}
5064
5065void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
5066 unsigned long end, struct page *ref_page)
5067{
5068 struct mmu_gather tlb;
5069
5070 tlb_gather_mmu(&tlb, vma->vm_mm);
5071 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
5072 tlb_finish_mmu(&tlb);
5073}
5074
/*
 * Called when the original mapper of a MAP_PRIVATE mapping fails to COW a
 * page it owns the reserve for.  Unmap the page from the other VMAs so the
 * owner can use its reserve; children that later fault the same region get
 * SIGKILLed instead.
 */
5081static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
5082 struct page *page, unsigned long address)
5083{
5084 struct hstate *h = hstate_vma(vma);
5085 struct vm_area_struct *iter_vma;
5086 struct address_space *mapping;
5087 pgoff_t pgoff;
5088
5089
5090
5091
5092
5093 address = address & huge_page_mask(h);
5094 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5095 vma->vm_pgoff;
5096 mapping = vma->vm_file->f_mapping;
5097
5098
5099
5100
5101
5102
5103 i_mmap_lock_write(mapping);
5104 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
5105
5106 if (iter_vma == vma)
5107 continue;
5108
5109
5110
5111
5112
5113
5114 if (iter_vma->vm_flags & VM_MAYSHARE)
5115 continue;
5116
		/*
		 * Unmap the page from VMAs without their own reserves; they
		 * will be SIGKILLed if they later fault here, because a
		 * subsequent no-page fault would insert a zeroed page rather
		 * than the data that existed at fork time, which would look
		 * like data corruption.
		 */
5124 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
5125 unmap_hugepage_range(iter_vma, address,
5126 address + huge_page_size(h), page);
5127 }
5128 i_mmap_unlock_write(mapping);
5129}
5130
/*
 * hugetlb_cow() must be called with the lock of the original hugepage held.
 * It is also called with hugetlb_fault_mutex_table held and pte_page locked,
 * so it cannot race with other handlers or page migration; the pte_same
 * checks are kept anyway to ease a future transition away from the mutex.
 */
5137static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
5138 unsigned long address, pte_t *ptep,
5139 struct page *pagecache_page, spinlock_t *ptl)
5140{
5141 pte_t pte;
5142 struct hstate *h = hstate_vma(vma);
5143 struct page *old_page, *new_page;
5144 int outside_reserve = 0;
5145 vm_fault_t ret = 0;
5146 unsigned long haddr = address & huge_page_mask(h);
5147 struct mmu_notifier_range range;
5148
5149 pte = huge_ptep_get(ptep);
5150 old_page = pte_page(pte);
5151
5152retry_avoidcopy:
5153
5154
5155 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
5156 page_move_anon_rmap(old_page, vma);
5157 set_huge_ptep_writable(vma, haddr, ptep);
5158 return 0;
5159 }
5160
	/*
	 * If the process that created a MAP_PRIVATE mapping is about to COW
	 * because of a shared page count, try to satisfy the allocation
	 * without using the existing reserves.  The pagecache page is used
	 * to decide whether the reserve at this address was already
	 * consumed; if it was, a partially faulted mapping at fork() time
	 * could otherwise consume its reserves on COW instead of covering
	 * the full address range.
	 */
5170 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
5171 old_page != pagecache_page)
5172 outside_reserve = 1;
5173
5174 get_page(old_page);
5175
5176
5177
5178
5179
5180 spin_unlock(ptl);
5181 new_page = alloc_huge_page(vma, haddr, outside_reserve);
5182
5183 if (IS_ERR(new_page)) {
		/*
		 * If the owner of a MAP_PRIVATE mapping fails to COW because
		 * of references held by a child and an exhausted huge page
		 * pool, unmap the page from the child processes to keep the
		 * original mapping reliable; the child may get SIGKILLed if
		 * it faults the page later.
		 */
5191 if (outside_reserve) {
5192 struct address_space *mapping = vma->vm_file->f_mapping;
5193 pgoff_t idx;
5194 u32 hash;
5195
5196 put_page(old_page);
5197 BUG_ON(huge_pte_none(pte));
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207 idx = vma_hugecache_offset(h, vma, haddr);
5208 hash = hugetlb_fault_mutex_hash(mapping, idx);
5209 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5210 i_mmap_unlock_read(mapping);
5211
5212 unmap_ref_private(mm, vma, old_page, haddr);
5213
5214 i_mmap_lock_read(mapping);
5215 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5216 spin_lock(ptl);
5217 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5218 if (likely(ptep &&
5219 pte_same(huge_ptep_get(ptep), pte)))
5220 goto retry_avoidcopy;
5221
5222
5223
5224
5225 return 0;
5226 }
5227
5228 ret = vmf_error(PTR_ERR(new_page));
5229 goto out_release_old;
5230 }
5231
5232
5233
5234
5235
5236 if (unlikely(anon_vma_prepare(vma))) {
5237 ret = VM_FAULT_OOM;
5238 goto out_release_all;
5239 }
5240
5241 copy_user_huge_page(new_page, old_page, address, vma,
5242 pages_per_huge_page(h));
5243 __SetPageUptodate(new_page);
5244
5245 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
5246 haddr + huge_page_size(h));
5247 mmu_notifier_invalidate_range_start(&range);
5248
5249
5250
5251
5252
5253 spin_lock(ptl);
5254 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5255 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
5256 ClearHPageRestoreReserve(new_page);
5257
5258
5259 huge_ptep_clear_flush(vma, haddr, ptep);
5260 mmu_notifier_invalidate_range(mm, range.start, range.end);
5261 page_remove_rmap(old_page, true);
5262 hugepage_add_new_anon_rmap(new_page, vma, haddr);
5263 set_huge_pte_at(mm, haddr, ptep,
5264 make_huge_pte(vma, new_page, 1));
5265 SetHPageMigratable(new_page);
5266
5267 new_page = old_page;
5268 }
5269 spin_unlock(ptl);
5270 mmu_notifier_invalidate_range_end(&range);
5271out_release_all:
5272
5273 if (new_page != old_page)
5274 restore_reserve_on_error(h, vma, haddr, new_page);
5275 put_page(new_page);
5276out_release_old:
5277 put_page(old_page);
5278
5279 spin_lock(ptl);
5280 return ret;
5281}
5282
5283
5284static struct page *hugetlbfs_pagecache_page(struct hstate *h,
5285 struct vm_area_struct *vma, unsigned long address)
5286{
5287 struct address_space *mapping;
5288 pgoff_t idx;
5289
5290 mapping = vma->vm_file->f_mapping;
5291 idx = vma_hugecache_offset(h, vma, address);
5292
5293 return find_lock_page(mapping, idx);
5294}
5295
5296
5297
5298
5299
5300static bool hugetlbfs_pagecache_present(struct hstate *h,
5301 struct vm_area_struct *vma, unsigned long address)
5302{
5303 struct address_space *mapping;
5304 pgoff_t idx;
5305 struct page *page;
5306
5307 mapping = vma->vm_file->f_mapping;
5308 idx = vma_hugecache_offset(h, vma, address);
5309
5310 page = find_get_page(mapping, idx);
5311 if (page)
5312 put_page(page);
5313 return page != NULL;
5314}
5315
5316int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
5317 pgoff_t idx)
5318{
5319 struct inode *inode = mapping->host;
5320 struct hstate *h = hstate_inode(inode);
5321 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
5322
5323 if (err)
5324 return err;
5325 ClearHPageRestoreReserve(page);
5326
5327
5328
5329
5330
5331 set_page_dirty(page);
5332
5333 spin_lock(&inode->i_lock);
5334 inode->i_blocks += blocks_per_huge_page(h);
5335 spin_unlock(&inode->i_lock);
5336 return 0;
5337}
5338
5339static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
5340 struct address_space *mapping,
5341 pgoff_t idx,
5342 unsigned int flags,
5343 unsigned long haddr,
5344 unsigned long reason)
5345{
5346 vm_fault_t ret;
5347 u32 hash;
5348 struct vm_fault vmf = {
5349 .vma = vma,
5350 .address = haddr,
5351 .flags = flags,

		/*
		 * The remaining vm_fault fields are deliberately left
		 * uninitialized; hard to debug if a callee ends up assuming
		 * something about them (same as in mm/memory.c).
		 */
5360 };
5361
	/*
	 * hugetlb_fault_mutex and i_mmap_rwsem must be dropped before
	 * handling the userfault, and are reacquired afterwards to keep the
	 * calling code simple.
	 */
5367 hash = hugetlb_fault_mutex_hash(mapping, idx);
5368 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5369 i_mmap_unlock_read(mapping);
5370 ret = handle_userfault(&vmf, reason);
5371 i_mmap_lock_read(mapping);
5372 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5373
5374 return ret;
5375}
5376
5377static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
5378 struct vm_area_struct *vma,
5379 struct address_space *mapping, pgoff_t idx,
5380 unsigned long address, pte_t *ptep, unsigned int flags)
5381{
5382 struct hstate *h = hstate_vma(vma);
5383 vm_fault_t ret = VM_FAULT_SIGBUS;
5384 int anon_rmap = 0;
5385 unsigned long size;
5386 struct page *page;
5387 pte_t new_pte;
5388 spinlock_t *ptl;
5389 unsigned long haddr = address & huge_page_mask(h);
5390 bool new_page, new_pagecache_page = false;
5391
5392
5393
5394
5395
5396
5397 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
5398 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
5399 current->pid);
5400 return ret;
5401 }
5402
5403
5404
5405
5406
5407
5408 size = i_size_read(mapping->host) >> huge_page_shift(h);
5409 if (idx >= size)
5410 goto out;
5411
5412retry:
5413 new_page = false;
5414 page = find_lock_page(mapping, idx);
5415 if (!page) {
5416
5417 if (userfaultfd_missing(vma)) {
5418 ret = hugetlb_handle_userfault(vma, mapping, idx,
5419 flags, haddr,
5420 VM_UFFD_MISSING);
5421 goto out;
5422 }
5423
5424 page = alloc_huge_page(vma, haddr, 0);
5425 if (IS_ERR(page)) {
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
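/*
 * An allocation failure normally means SIGBUS for the faulting task,
 * but page migration may transiently clear the pte without holding
 * the fault mutex.  Only report the error if the pte is still none
 * under the page table lock; otherwise return 0 and let the fault be
 * retried.
 */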
5438 ptl = huge_pte_lock(h, mm, ptep);
5439 ret = 0;
5440 if (huge_pte_none(huge_ptep_get(ptep)))
5441 ret = vmf_error(PTR_ERR(page));
5442 spin_unlock(ptl);
5443 goto out;
5444 }
5445 clear_huge_page(page, address, pages_per_huge_page(h));
5446 __SetPageUptodate(page);
5447 new_page = true;
5448
5449 if (vma->vm_flags & VM_MAYSHARE) {
5450 int err = huge_add_to_page_cache(page, mapping, idx);
5451 if (err) {
5452 put_page(page);
5453 if (err == -EEXIST)
5454 goto retry;
5455 goto out;
5456 }
5457 new_pagecache_page = true;
5458 } else {
5459 lock_page(page);
5460 if (unlikely(anon_vma_prepare(vma))) {
5461 ret = VM_FAULT_OOM;
5462 goto backout_unlocked;
5463 }
5464 anon_rmap = 1;
5465 }
5466 } else {
5467
5468
5469
5470
5471
5472 if (unlikely(PageHWPoison(page))) {
5473 ret = VM_FAULT_HWPOISON_LARGE |
5474 VM_FAULT_SET_HINDEX(hstate_index(h));
5475 goto backout_unlocked;
5476 }
5477
5478
5479 if (userfaultfd_minor(vma)) {
5480 unlock_page(page);
5481 put_page(page);
5482 ret = hugetlb_handle_userfault(vma, mapping, idx,
5483 flags, haddr,
5484 VM_UFFD_MINOR);
5485 goto out;
5486 }
5487 }
5488
5489
5490
5491
5492
5493
5494
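/*
 * If this private mapping may be COWed later, settle the reservation
 * bookkeeping now so that any allocation it requires happens outside
 * the page table spinlock.
 */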
5495 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5496 if (vma_needs_reservation(h, vma, haddr) < 0) {
5497 ret = VM_FAULT_OOM;
5498 goto backout_unlocked;
5499 }
5500
5501 vma_end_reservation(h, vma, haddr);
5502 }
5503
5504 ptl = huge_pte_lock(h, mm, ptep);
5505 ret = 0;
5506 if (!huge_pte_none(huge_ptep_get(ptep)))
5507 goto backout;
5508
5509 if (anon_rmap) {
5510 ClearHPageRestoreReserve(page);
5511 hugepage_add_new_anon_rmap(page, vma, haddr);
5512 } else
5513 page_dup_rmap(page, true);
5514 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
5515 && (vma->vm_flags & VM_SHARED)));
5516 set_huge_pte_at(mm, haddr, ptep, new_pte);
5517
5518 hugetlb_count_add(pages_per_huge_page(h), mm);
5519 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
5520
5521 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
5522 }
5523
5524 spin_unlock(ptl);
5525
5526
5527
5528
5529
5530
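/*
 * Only newly allocated pages are marked migratable here; a page found
 * in the page cache may already have been isolated for migration.
 */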
5531 if (new_page)
5532 SetHPageMigratable(page);
5533
5534 unlock_page(page);
5535out:
5536 return ret;
5537
5538backout:
5539 spin_unlock(ptl);
5540backout_unlocked:
5541 unlock_page(page);
5542
5543 if (new_page && !new_pagecache_page)
5544 restore_reserve_on_error(h, vma, haddr, page);
5545 put_page(page);
5546 goto out;
5547}
5548
5549#ifdef CONFIG_SMP
5550u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5551{
5552 unsigned long key[2];
5553 u32 hash;
5554
5555 key[0] = (unsigned long) mapping;
5556 key[1] = idx;
5557
5558 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
5559
5560 return hash & (num_fault_mutexes - 1);
5561}
5562#else
5563
5564
5565
5566
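/*
 * For uniprocessor systems a single fault mutex is used, so always
 * hash to slot 0 and avoid the hashing overhead.
 */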
5567u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
5568{
5569 return 0;
5570}
5571 #endif /* CONFIG_SMP */
5572
5573vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
5574 unsigned long address, unsigned int flags)
5575{
5576 pte_t *ptep, entry;
5577 spinlock_t *ptl;
5578 vm_fault_t ret;
5579 u32 hash;
5580 pgoff_t idx;
5581 struct page *page = NULL;
5582 struct page *pagecache_page = NULL;
5583 struct hstate *h = hstate_vma(vma);
5584 struct address_space *mapping;
5585 int need_wait_lock = 0;
5586 unsigned long haddr = address & huge_page_mask(h);
5587
5588 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
5589 if (ptep) {
5590
5591
5592
5593
5594
5595 entry = huge_ptep_get(ptep);
5596 if (unlikely(is_hugetlb_entry_migration(entry))) {
5597 migration_entry_wait_huge(vma, mm, ptep);
5598 return 0;
5599 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
5600 return VM_FAULT_HWPOISON_LARGE |
5601 VM_FAULT_SET_HINDEX(hstate_index(h));
5602 }
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
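/*
 * Acquire i_mmap_rwsem before calling huge_pte_alloc() and hold it
 * until we are done with ptep; this prevents huge_pmd_unshare() from
 * being called elsewhere and invalidating ptep underneath us.
 */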
5615 mapping = vma->vm_file->f_mapping;
5616 i_mmap_lock_read(mapping);
5617 ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
5618 if (!ptep) {
5619 i_mmap_unlock_read(mapping);
5620 return VM_FAULT_OOM;
5621 }
5622
5623
5624
5625
5626
5627
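/*
 * Serialize hugepage allocation and instantiation through the
 * per-mapping/index fault mutex, so that racing faults on the same
 * page do not cause spurious allocation failures.
 */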
5628 idx = vma_hugecache_offset(h, vma, haddr);
5629 hash = hugetlb_fault_mutex_hash(mapping, idx);
5630 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5631
5632 entry = huge_ptep_get(ptep);
5633 if (huge_pte_none(entry)) {
5634 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
5635 goto out_mutex;
5636 }
5637
5638 ret = 0;
5639
5640
5641
5642
5643
5644
5645
5646
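/*
 * The entry may be a migration or hwpoison entry at this point; bail
 * out and let the next fault run into the is_hugetlb_entry_migration()
 * and is_hugetlb_entry_hwpoisoned() checks above.
 */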
5647 if (!pte_present(entry))
5648 goto out_mutex;
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
5659 if (vma_needs_reservation(h, vma, haddr) < 0) {
5660 ret = VM_FAULT_OOM;
5661 goto out_mutex;
5662 }
5663
5664 vma_end_reservation(h, vma, haddr);
5665
5666 if (!(vma->vm_flags & VM_MAYSHARE))
5667 pagecache_page = hugetlbfs_pagecache_page(h,
5668 vma, haddr);
5669 }
5670
5671 ptl = huge_pte_lock(h, mm, ptep);
5672
5673
5674 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
5675 goto out_ptl;
5676
5677
5678
5679
5680
5681
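/*
 * hugetlb_cow() requires the page locks of pte_page(entry) and
 * pagecache_page, so take the former here when it is not the
 * pagecache page (which is already locked).
 */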
5682 page = pte_page(entry);
5683 if (page != pagecache_page)
5684 if (!trylock_page(page)) {
5685 need_wait_lock = 1;
5686 goto out_ptl;
5687 }
5688
5689 get_page(page);
5690
5691 if (flags & FAULT_FLAG_WRITE) {
5692 if (!huge_pte_write(entry)) {
5693 ret = hugetlb_cow(mm, vma, address, ptep,
5694 pagecache_page, ptl);
5695 goto out_put_page;
5696 }
5697 entry = huge_pte_mkdirty(entry);
5698 }
5699 entry = pte_mkyoung(entry);
5700 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
5701 flags & FAULT_FLAG_WRITE))
5702 update_mmu_cache(vma, haddr, ptep);
5703out_put_page:
5704 if (page != pagecache_page)
5705 unlock_page(page);
5706 put_page(page);
5707out_ptl:
5708 spin_unlock(ptl);
5709
5710 if (pagecache_page) {
5711 unlock_page(pagecache_page);
5712 put_page(pagecache_page);
5713 }
5714out_mutex:
5715 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5716 i_mmap_unlock_read(mapping);
5717
5718
5719
5720
5721
5722
5723
5724 if (need_wait_lock)
5725 wait_on_page_locked(page);
5726 return ret;
5727}
5728
5729#ifdef CONFIG_USERFAULTFD
5730
5731
5732
5733
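/*
 * Used by userfaultfd UFFDIO_COPY (and UFFDIO_CONTINUE) to atomically
 * install a huge page at dst_addr in the target mm.
 */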
5734int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
5735 pte_t *dst_pte,
5736 struct vm_area_struct *dst_vma,
5737 unsigned long dst_addr,
5738 unsigned long src_addr,
5739 enum mcopy_atomic_mode mode,
5740 struct page **pagep)
5741{
5742 bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
5743 struct hstate *h = hstate_vma(dst_vma);
5744 struct address_space *mapping = dst_vma->vm_file->f_mapping;
5745 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
5746 unsigned long size;
5747 int vm_shared = dst_vma->vm_flags & VM_SHARED;
5748 pte_t _dst_pte;
5749 spinlock_t *ptl;
5750 int ret = -ENOMEM;
5751 struct page *page;
5752 int writable;
5753 bool page_in_pagecache = false;
5754
5755 if (is_continue) {
5756 ret = -EFAULT;
5757 page = find_lock_page(mapping, idx);
5758 if (!page)
5759 goto out;
5760 page_in_pagecache = true;
5761 } else if (!*pagep) {
5762
5763
5764
5765 if (vm_shared &&
5766 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5767 ret = -EEXIST;
5768 goto out;
5769 }
5770
5771 page = alloc_huge_page(dst_vma, dst_addr, 0);
5772 if (IS_ERR(page)) {
5773 ret = -ENOMEM;
5774 goto out;
5775 }
5776
5777 ret = copy_huge_page_from_user(page,
5778 (const void __user *) src_addr,
5779 pages_per_huge_page(h), false);
5780
5781
5782 if (unlikely(ret)) {
5783 ret = -ENOENT;
5784
5785
5786
5787 restore_reserve_on_error(h, dst_vma, dst_addr, page);
5788 put_page(page);
5789
5790
5791
5792
5793 page = alloc_huge_page_vma(h, dst_vma, dst_addr);
5794 if (!page) {
5795 ret = -ENOMEM;
5796 goto out;
5797 }
5798 *pagep = page;
5799
5800
5801
5802
5803 goto out;
5804 }
5805 } else {
5806 if (vm_shared &&
5807 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5808 put_page(*pagep);
5809 ret = -EEXIST;
5810 *pagep = NULL;
5811 goto out;
5812 }
5813
5814 page = alloc_huge_page(dst_vma, dst_addr, 0);
5815 if (IS_ERR(page)) {
5816 ret = -ENOMEM;
5817 *pagep = NULL;
5818 goto out;
5819 }
5820 folio_copy(page_folio(page), page_folio(*pagep));
5821 put_page(*pagep);
5822 *pagep = NULL;
5823 }
5824
5825
5826
5827
5828
5829
5830 __SetPageUptodate(page);
5831
5832
5833 if (vm_shared && !is_continue) {
5834 size = i_size_read(mapping->host) >> huge_page_shift(h);
5835 ret = -EFAULT;
5836 if (idx >= size)
5837 goto out_release_nounlock;
5838
5839
5840
5841
5842
5843
5844
5845 ret = huge_add_to_page_cache(page, mapping, idx);
5846 if (ret)
5847 goto out_release_nounlock;
5848 page_in_pagecache = true;
5849 }
5850
5851 ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
5852 spin_lock(ptl);
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
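/*
 * Recheck i_size while holding the page table lock so that we never
 * leave a page mapped beyond the end of the file after a racing
 * truncation.
 */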
5863 size = i_size_read(mapping->host) >> huge_page_shift(h);
5864 ret = -EFAULT;
5865 if (idx >= size)
5866 goto out_release_unlock;
5867
5868 ret = -EEXIST;
5869 if (!huge_pte_none(huge_ptep_get(dst_pte)))
5870 goto out_release_unlock;
5871
5872 if (vm_shared) {
5873 page_dup_rmap(page, true);
5874 } else {
5875 ClearHPageRestoreReserve(page);
5876 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
5877 }
5878
5879
5880 if (is_continue && !vm_shared)
5881 writable = 0;
5882 else
5883 writable = dst_vma->vm_flags & VM_WRITE;
5884
5885 _dst_pte = make_huge_pte(dst_vma, page, writable);
5886 if (writable)
5887 _dst_pte = huge_pte_mkdirty(_dst_pte);
5888 _dst_pte = pte_mkyoung(_dst_pte);
5889
5890 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
5891
5892 (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
5893 dst_vma->vm_flags & VM_WRITE);
5894 hugetlb_count_add(pages_per_huge_page(h), dst_mm);
5895
5896
5897 update_mmu_cache(dst_vma, dst_addr, dst_pte);
5898
5899 spin_unlock(ptl);
5900 if (!is_continue)
5901 SetHPageMigratable(page);
5902 if (vm_shared || is_continue)
5903 unlock_page(page);
5904 ret = 0;
5905out:
5906 return ret;
5907out_release_unlock:
5908 spin_unlock(ptl);
5909 if (vm_shared || is_continue)
5910 unlock_page(page);
5911out_release_nounlock:
5912 if (!page_in_pagecache)
5913 restore_reserve_on_error(h, dst_vma, dst_addr, page);
5914 put_page(page);
5915 goto out;
5916}
5917 #endif /* CONFIG_USERFAULTFD */
5918
5919static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
5920 int refs, struct page **pages,
5921 struct vm_area_struct **vmas)
5922{
5923 int nr;
5924
5925 for (nr = 0; nr < refs; nr++) {
5926 if (likely(pages))
5927 pages[nr] = mem_map_offset(page, nr);
5928 if (vmas)
5929 vmas[nr] = vma;
5930 }
5931}
5932
5933long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
5934 struct page **pages, struct vm_area_struct **vmas,
5935 unsigned long *position, unsigned long *nr_pages,
5936 long i, unsigned int flags, int *locked)
5937{
5938 unsigned long pfn_offset;
5939 unsigned long vaddr = *position;
5940 unsigned long remainder = *nr_pages;
5941 struct hstate *h = hstate_vma(vma);
5942 int err = -EFAULT, refs;
5943
5944 while (vaddr < vma->vm_end && remainder) {
5945 pte_t *pte;
5946 spinlock_t *ptl = NULL;
5947 int absent;
5948 struct page *page;
5949
5950
5951
5952
5953
5954 if (fatal_signal_pending(current)) {
5955 remainder = 0;
5956 break;
5957 }
5958
5959
5960
5961
5962
5963
5964
5965
5966 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
5967 huge_page_size(h));
5968 if (pte)
5969 ptl = huge_pte_lock(h, mm, pte);
5970 absent = !pte || huge_pte_none(huge_ptep_get(pte));
5971
5972
5973
5974
5975
5976
5977
5978
5979 if (absent && (flags & FOLL_DUMP) &&
5980 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
5981 if (pte)
5982 spin_unlock(ptl);
5983 remainder = 0;
5984 break;
5985 }
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
5998 ((flags & FOLL_WRITE) &&
5999 !huge_pte_write(huge_ptep_get(pte)))) {
6000 vm_fault_t ret;
6001 unsigned int fault_flags = 0;
6002
6003 if (pte)
6004 spin_unlock(ptl);
6005 if (flags & FOLL_WRITE)
6006 fault_flags |= FAULT_FLAG_WRITE;
6007 if (locked)
6008 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6009 FAULT_FLAG_KILLABLE;
6010 if (flags & FOLL_NOWAIT)
6011 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
6012 FAULT_FLAG_RETRY_NOWAIT;
6013 if (flags & FOLL_TRIED) {
6014
6015
6016
6017
6018 fault_flags |= FAULT_FLAG_TRIED;
6019 }
6020 ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
6021 if (ret & VM_FAULT_ERROR) {
6022 err = vm_fault_to_errno(ret, flags);
6023 remainder = 0;
6024 break;
6025 }
6026 if (ret & VM_FAULT_RETRY) {
6027 if (locked &&
6028 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
6029 *locked = 0;
6030 *nr_pages = 0;
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040 return i;
6041 }
6042 continue;
6043 }
6044
6045 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
6046 page = pte_page(huge_ptep_get(pte));
6047
6048
6049
6050
6051
6052 if (!pages && !vmas && !pfn_offset &&
6053 (vaddr + huge_page_size(h) < vma->vm_end) &&
6054 (remainder >= pages_per_huge_page(h))) {
6055 vaddr += huge_page_size(h);
6056 remainder -= pages_per_huge_page(h);
6057 i += pages_per_huge_page(h);
6058 spin_unlock(ptl);
6059 continue;
6060 }
6061
6062
6063 refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
6064 (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
6065
6066 if (pages || vmas)
6067 record_subpages_vmas(mem_map_offset(page, pfn_offset),
6068 vma, refs,
6069 likely(pages) ? pages + i : NULL,
6070 vmas ? vmas + i : NULL);
6071
6072 if (pages) {
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
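/*
 * With the page table lock held and the huge page known to be
 * present, try_grab_compound_head() is expected to succeed; a failure
 * here is unexpected (hence the WARN) and is reported as -ENOMEM.
 */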
6083 if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
6084 refs,
6085 flags))) {
6086 spin_unlock(ptl);
6087 remainder = 0;
6088 err = -ENOMEM;
6089 break;
6090 }
6091 }
6092
6093 vaddr += (refs << PAGE_SHIFT);
6094 remainder -= refs;
6095 i += refs;
6096
6097 spin_unlock(ptl);
6098 }
6099 *nr_pages = remainder;
6100
6101
6102
6103
6104
6105 *position = vaddr;
6106
6107 return i ? i : err;
6108}
6109
6110unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
6111 unsigned long address, unsigned long end, pgprot_t newprot)
6112{
6113 struct mm_struct *mm = vma->vm_mm;
6114 unsigned long start = address;
6115 pte_t *ptep;
6116 pte_t pte;
6117 struct hstate *h = hstate_vma(vma);
6118 unsigned long pages = 0;
6119 bool shared_pmd = false;
6120 struct mmu_notifier_range range;
6121
6122
6123
6124
6125
6126
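/*
 * With shared PMDs the range that must be flushed can extend beyond
 * [start, end), so widen the notifier range to the span that PMD
 * sharing could affect.
 */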
6127 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
6128 0, vma, mm, start, end);
6129 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
6130
6131 BUG_ON(address >= end);
6132 flush_cache_range(vma, range.start, range.end);
6133
6134 mmu_notifier_invalidate_range_start(&range);
6135 i_mmap_lock_write(vma->vm_file->f_mapping);
6136 for (; address < end; address += huge_page_size(h)) {
6137 spinlock_t *ptl;
6138 ptep = huge_pte_offset(mm, address, huge_page_size(h));
6139 if (!ptep)
6140 continue;
6141 ptl = huge_pte_lock(h, mm, ptep);
6142 if (huge_pmd_unshare(mm, vma, &address, ptep)) {
6143 pages++;
6144 spin_unlock(ptl);
6145 shared_pmd = true;
6146 continue;
6147 }
6148 pte = huge_ptep_get(ptep);
6149 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
6150 spin_unlock(ptl);
6151 continue;
6152 }
6153 if (unlikely(is_hugetlb_entry_migration(pte))) {
6154 swp_entry_t entry = pte_to_swp_entry(pte);
6155
6156 if (is_writable_migration_entry(entry)) {
6157 pte_t newpte;
6158
6159 entry = make_readable_migration_entry(
6160 swp_offset(entry));
6161 newpte = swp_entry_to_pte(entry);
6162 set_huge_swap_pte_at(mm, address, ptep,
6163 newpte, huge_page_size(h));
6164 pages++;
6165 }
6166 spin_unlock(ptl);
6167 continue;
6168 }
6169 if (!huge_pte_none(pte)) {
6170 pte_t old_pte;
6171 unsigned int shift = huge_page_shift(hstate_vma(vma));
6172
6173 old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
6174 pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
6175 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
6176 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
6177 pages++;
6178 }
6179 spin_unlock(ptl);
6180 }
6181
6182
6183
6184
6185
6186
6187
6188 if (shared_pmd)
6189 flush_hugetlb_tlb_range(vma, range.start, range.end);
6190 else
6191 flush_hugetlb_tlb_range(vma, start, end);
6192
6193
6194
6195
6196
6197
6198 i_mmap_unlock_write(vma->vm_file->f_mapping);
6199 mmu_notifier_invalidate_range_end(&range);
6200
6201 return pages << h->order;
6202}
6203
6204
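/* Returns true if the reservation was successfully set up, false otherwise. */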
6205bool hugetlb_reserve_pages(struct inode *inode,
6206 long from, long to,
6207 struct vm_area_struct *vma,
6208 vm_flags_t vm_flags)
6209{
6210 long chg, add = -1;
6211 struct hstate *h = hstate_inode(inode);
6212 struct hugepage_subpool *spool = subpool_inode(inode);
6213 struct resv_map *resv_map;
6214 struct hugetlb_cgroup *h_cg = NULL;
6215 long gbl_reserve, regions_needed = 0;
6216
6217
6218 if (from > to) {
6219 VM_WARN(1, "%s called with a negative range\n", __func__);
6220 return false;
6221 }
6222
6223
6224
6225
6226
6227
6228 if (vm_flags & VM_NORESERVE)
6229 return true;
6230
6231
6232
6233
6234
6235
6236
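/*
 * Shared mappings only charge the parts of the range not already
 * reserved in the inode's resv_map; private mappings get their own
 * resv_map and must reserve the entire range.
 */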
6237 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6238
6239
6240
6241
6242
6243 resv_map = inode_resv_map(inode);
6244
6245 chg = region_chg(resv_map, from, to, &regions_needed);
6246
6247 } else {
6248
6249 resv_map = resv_map_alloc();
6250 if (!resv_map)
6251 return false;
6252
6253 chg = to - from;
6254
6255 set_vma_resv_map(vma, resv_map);
6256 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
6257 }
6258
6259 if (chg < 0)
6260 goto out_err;
6261
6262 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
6263 chg * pages_per_huge_page(h), &h_cg) < 0)
6264 goto out_err;
6265
6266 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
6267
6268
6269
6270 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
6271 }
6272
6273
6274
6275
6276
6277
6278 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
6279 if (gbl_reserve < 0)
6280 goto out_uncharge_cgroup;
6281
6282
6283
6284
6285
6286 if (hugetlb_acct_memory(h, gbl_reserve) < 0)
6287 goto out_put_pages;
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6301 add = region_add(resv_map, from, to, regions_needed, h, h_cg);
6302
6303 if (unlikely(add < 0)) {
6304 hugetlb_acct_memory(h, -gbl_reserve);
6305 goto out_put_pages;
6306 } else if (unlikely(chg > add)) {
6307
6308
6309
6310
6311
6312
6313
6314 long rsv_adjust;
6315
6316
6317
6318
6319
6320 hugetlb_cgroup_uncharge_cgroup_rsvd(
6321 hstate_index(h),
6322 (chg - add) * pages_per_huge_page(h), h_cg);
6323
6324 rsv_adjust = hugepage_subpool_put_pages(spool,
6325 chg - add);
6326 hugetlb_acct_memory(h, -rsv_adjust);
6327 } else if (h_cg) {
6328
6329
6330
6331
6332
6333
6334 hugetlb_cgroup_put_rsvd_cgroup(h_cg);
6335 }
6336 }
6337 return true;
6338
6339out_put_pages:
6340
6341 (void)hugepage_subpool_put_pages(spool, chg);
6342out_uncharge_cgroup:
6343 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
6344 chg * pages_per_huge_page(h), h_cg);
6345out_err:
6346 if (!vma || vma->vm_flags & VM_MAYSHARE)
6347
6348
6349
6350 if (chg >= 0 && add < 0)
6351 region_abort(resv_map, from, to, regions_needed);
6352 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
6353 kref_put(&resv_map->refs, resv_map_release);
6354 return false;
6355}
6356
6357long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
6358 long freed)
6359{
6360 struct hstate *h = hstate_inode(inode);
6361 struct resv_map *resv_map = inode_resv_map(inode);
6362 long chg = 0;
6363 struct hugepage_subpool *spool = subpool_inode(inode);
6364 long gbl_reserve;
6365
6366
6367
6368
6369
6370 if (resv_map) {
6371 chg = region_del(resv_map, start, end);
6372
6373
6374
6375
6376
6377 if (chg < 0)
6378 return chg;
6379 }
6380
6381 spin_lock(&inode->i_lock);
6382 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
6383 spin_unlock(&inode->i_lock);
6384
6385
6386
6387
6388
6389
6390
6391
6392 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
6393 hugetlb_acct_memory(h, -gbl_reserve);
6394
6395 return 0;
6396}
6397
6398#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
6399static unsigned long page_table_shareable(struct vm_area_struct *svma,
6400 struct vm_area_struct *vma,
6401 unsigned long addr, pgoff_t idx)
6402{
6403 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
6404 svma->vm_start;
6405 unsigned long sbase = saddr & PUD_MASK;
6406 unsigned long s_end = sbase + PUD_SIZE;
6407
6408
6409 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
6410 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
6411
6412
6413
6414
6415
6416 if (pmd_index(addr) != pmd_index(saddr) ||
6417 vm_flags != svm_flags ||
6418 !range_in_vma(svma, sbase, s_end))
6419 return 0;
6420
6421 return saddr;
6422}
6423
6424static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
6425{
6426 unsigned long base = addr & PUD_MASK;
6427 unsigned long end = base + PUD_SIZE;
6428
6429
6430
6431
6432 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
6433 return true;
6434 return false;
6435}
6436
6437bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6438{
6439#ifdef CONFIG_USERFAULTFD
6440 if (uffd_disable_huge_pmd_share(vma))
6441 return false;
6442#endif
6443 return vma_shareable(vma, addr);
6444}
6445
6446
6447
6448
6449
6450
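/*
 * If the range [start, end) within this VMA could be mapped by a
 * shared PMD, widen *start and *end to the PUD-aligned span that the
 * sharing would cover.
 */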
6451void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6452 unsigned long *start, unsigned long *end)
6453{
6454 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
6455 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6456
6457
6458
6459
6460
6461 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
6462 (*end <= v_start) || (*start >= v_end))
6463 return;
6464
6465
6466 if (*start > v_start)
6467 *start = ALIGN_DOWN(*start, PUD_SIZE);
6468
6469 if (*end < v_end)
6470 *end = ALIGN(*end, PUD_SIZE);
6471}
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
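/*
 * Search the other mappings of the same file range for a PMD page
 * that can be shared.  If one is found, take a reference on it and
 * hook it into our PUD; otherwise fall back to pmd_alloc().  The
 * caller must hold i_mmap_rwsem (asserted below).
 */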
6484pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6485 unsigned long addr, pud_t *pud)
6486{
6487 struct address_space *mapping = vma->vm_file->f_mapping;
6488 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
6489 vma->vm_pgoff;
6490 struct vm_area_struct *svma;
6491 unsigned long saddr;
6492 pte_t *spte = NULL;
6493 pte_t *pte;
6494 spinlock_t *ptl;
6495
6496 i_mmap_assert_locked(mapping);
6497 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
6498 if (svma == vma)
6499 continue;
6500
6501 saddr = page_table_shareable(svma, vma, addr, idx);
6502 if (saddr) {
6503 spte = huge_pte_offset(svma->vm_mm, saddr,
6504 vma_mmu_pagesize(svma));
6505 if (spte) {
6506 get_page(virt_to_page(spte));
6507 break;
6508 }
6509 }
6510 }
6511
6512 if (!spte)
6513 goto out;
6514
6515 ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
6516 if (pud_none(*pud)) {
6517 pud_populate(mm, pud,
6518 (pmd_t *)((unsigned long)spte & PAGE_MASK));
6519 mm_inc_nr_pmds(mm);
6520 } else {
6521 put_page(virt_to_page(spte));
6522 }
6523 spin_unlock(ptl);
6524out:
6525 pte = (pte_t *)pmd_alloc(mm, pud, addr);
6526 return pte;
6527}
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
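/*
 * Unshare a PMD mapping hugetlb pages: drop our reference on the
 * shared PMD page and clear the PUD.  Returns 1 if the PMD was
 * unshared, 0 if it was not shared at all.  Called with the page
 * table lock held and i_mmap_rwsem held for write (asserted below).
 * On success *addr is advanced so callers skip the rest of the
 * shared range.
 */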
6541int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6542 unsigned long *addr, pte_t *ptep)
6543{
6544 pgd_t *pgd = pgd_offset(mm, *addr);
6545 p4d_t *p4d = p4d_offset(pgd, *addr);
6546 pud_t *pud = pud_offset(p4d, *addr);
6547
6548 i_mmap_assert_write_locked(vma->vm_file->f_mapping);
6549 BUG_ON(page_count(virt_to_page(ptep)) == 0);
6550 if (page_count(virt_to_page(ptep)) == 1)
6551 return 0;
6552
6553 pud_clear(pud);
6554 put_page(virt_to_page(ptep));
6555 mm_dec_nr_pmds(mm);
6556 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
6557 return 1;
6558}
6559
6560#else
6561pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
6562 unsigned long addr, pud_t *pud)
6563{
6564 return NULL;
6565}
6566
6567int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
6568 unsigned long *addr, pte_t *ptep)
6569{
6570 return 0;
6571}
6572
6573void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
6574 unsigned long *start, unsigned long *end)
6575{
6576}
6577
6578bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
6579{
6580 return false;
6581}
6582 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
6583
6584#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
6585pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
6586 unsigned long addr, unsigned long sz)
6587{
6588 pgd_t *pgd;
6589 p4d_t *p4d;
6590 pud_t *pud;
6591 pte_t *pte = NULL;
6592
6593 pgd = pgd_offset(mm, addr);
6594 p4d = p4d_alloc(mm, pgd, addr);
6595 if (!p4d)
6596 return NULL;
6597 pud = pud_alloc(mm, p4d, addr);
6598 if (pud) {
6599 if (sz == PUD_SIZE) {
6600 pte = (pte_t *)pud;
6601 } else {
6602 BUG_ON(sz != PMD_SIZE);
6603 if (want_pmd_share(vma, addr) && pud_none(*pud))
6604 pte = huge_pmd_share(mm, vma, addr, pud);
6605 else
6606 pte = (pte_t *)pmd_alloc(mm, pud, addr);
6607 }
6608 }
6609 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
6610
6611 return pte;
6612}
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
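/*
 * Walk the page table for the given address and huge page size and
 * return a pointer to the PUD or PMD entry, or NULL if an upper level
 * is not present.  The returned entry may itself be none, present, or
 * a swap/migration entry.
 */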
6623pte_t *huge_pte_offset(struct mm_struct *mm,
6624 unsigned long addr, unsigned long sz)
6625{
6626 pgd_t *pgd;
6627 p4d_t *p4d;
6628 pud_t *pud;
6629 pmd_t *pmd;
6630
6631 pgd = pgd_offset(mm, addr);
6632 if (!pgd_present(*pgd))
6633 return NULL;
6634 p4d = p4d_offset(pgd, addr);
6635 if (!p4d_present(*p4d))
6636 return NULL;
6637
6638 pud = pud_offset(p4d, addr);
6639 if (sz == PUD_SIZE)
6640
6641 return (pte_t *)pud;
6642 if (!pud_present(*pud))
6643 return NULL;
6644
6645
6646 pmd = pmd_offset(pud, addr);
6647
6648 return (pte_t *)pmd;
6649}
6650
6651 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
6652
6653
6654
6655
6656
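/*
 * Default (weak) implementations of the follow_huge_* helpers;
 * architectures with unusual huge page table layouts can override
 * them.
 */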
6657struct page * __weak
6658follow_huge_addr(struct mm_struct *mm, unsigned long address,
6659 int write)
6660{
6661 return ERR_PTR(-EINVAL);
6662}
6663
6664struct page * __weak
6665follow_huge_pd(struct vm_area_struct *vma,
6666 unsigned long address, hugepd_t hpd, int flags, int pdshift)
6667{
6668 WARN(1, "hugepd follow called with no support for hugepage directory format\n");
6669 return NULL;
6670}
6671
6672struct page * __weak
6673follow_huge_pmd(struct mm_struct *mm, unsigned long address,
6674 pmd_t *pmd, int flags)
6675{
6676 struct page *page = NULL;
6677 spinlock_t *ptl;
6678 pte_t pte;
6679
6680
6681 if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
6682 (FOLL_PIN | FOLL_GET)))
6683 return NULL;
6684
6685retry:
6686 ptl = pmd_lockptr(mm, pmd);
6687 spin_lock(ptl);
6688
6689
6690
6691
6692 if (!pmd_huge(*pmd))
6693 goto out;
6694 pte = huge_ptep_get((pte_t *)pmd);
6695 if (pte_present(pte)) {
6696 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
6697
6698
6699
6700
6701
6702
6703
6704
6705 if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
6706 page = NULL;
6707 goto out;
6708 }
6709 } else {
6710 if (is_hugetlb_entry_migration(pte)) {
6711 spin_unlock(ptl);
6712 __migration_entry_wait(mm, (pte_t *)pmd, ptl);
6713 goto retry;
6714 }
6715
6716
6717
6718
6719 }
6720out:
6721 spin_unlock(ptl);
6722 return page;
6723}
6724
6725struct page * __weak
6726follow_huge_pud(struct mm_struct *mm, unsigned long address,
6727 pud_t *pud, int flags)
6728{
6729 if (flags & (FOLL_GET | FOLL_PIN))
6730 return NULL;
6731
6732 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
6733}
6734
6735struct page * __weak
6736follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
6737{
6738 if (flags & (FOLL_GET | FOLL_PIN))
6739 return NULL;
6740
6741 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
6742}
6743
6744bool isolate_huge_page(struct page *page, struct list_head *list)
6745{
6746 bool ret = true;
6747
6748 spin_lock_irq(&hugetlb_lock);
6749 if (!PageHeadHuge(page) ||
6750 !HPageMigratable(page) ||
6751 !get_page_unless_zero(page)) {
6752 ret = false;
6753 goto unlock;
6754 }
6755 ClearHPageMigratable(page);
6756 list_move_tail(&page->lru, list);
6757unlock:
6758 spin_unlock_irq(&hugetlb_lock);
6759 return ret;
6760}
6761
6762int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
6763{
6764 int ret = 0;
6765
6766 *hugetlb = false;
6767 spin_lock_irq(&hugetlb_lock);
6768 if (PageHeadHuge(page)) {
6769 *hugetlb = true;
6770 if (HPageFreed(page) || HPageMigratable(page))
6771 ret = get_page_unless_zero(page);
6772 else
6773 ret = -EBUSY;
6774 }
6775 spin_unlock_irq(&hugetlb_lock);
6776 return ret;
6777}
6778
6779void putback_active_hugepage(struct page *page)
6780{
6781 spin_lock_irq(&hugetlb_lock);
6782 SetHPageMigratable(page);
6783 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
6784 spin_unlock_irq(&hugetlb_lock);
6785 put_page(page);
6786}
6787
6788void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
6789{
6790 struct hstate *h = page_hstate(oldpage);
6791
6792 hugetlb_cgroup_migrate(oldpage, newpage);
6793 set_page_owner_migrate_reason(newpage, reason);
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805 if (HPageTemporary(newpage)) {
6806 int old_nid = page_to_nid(oldpage);
6807 int new_nid = page_to_nid(newpage);
6808
6809 SetHPageTemporary(oldpage);
6810 ClearHPageTemporary(newpage);
6811
6812
6813
6814
6815
6816 if (new_nid == old_nid)
6817 return;
6818 spin_lock_irq(&hugetlb_lock);
6819 if (h->surplus_huge_pages_node[old_nid]) {
6820 h->surplus_huge_pages_node[old_nid]--;
6821 h->surplus_huge_pages_node[new_nid]++;
6822 }
6823 spin_unlock_irq(&hugetlb_lock);
6824 }
6825}
6826
6827
6828
6829
6830
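/*
 * Unconditionally unshare all shared PMDs within the PUD-aligned
 * portion of this VMA.
 */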
6831void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
6832{
6833 struct hstate *h = hstate_vma(vma);
6834 unsigned long sz = huge_page_size(h);
6835 struct mm_struct *mm = vma->vm_mm;
6836 struct mmu_notifier_range range;
6837 unsigned long address, start, end;
6838 spinlock_t *ptl;
6839 pte_t *ptep;
6840
6841 if (!(vma->vm_flags & VM_MAYSHARE))
6842 return;
6843
6844 start = ALIGN(vma->vm_start, PUD_SIZE);
6845 end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6846
6847 if (start >= end)
6848 return;
6849
6850
6851
6852
6853
6854 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
6855 start, end);
6856 mmu_notifier_invalidate_range_start(&range);
6857 i_mmap_lock_write(vma->vm_file->f_mapping);
6858 for (address = start; address < end; address += PUD_SIZE) {
6859 unsigned long tmp = address;
6860
6861 ptep = huge_pte_offset(mm, address, sz);
6862 if (!ptep)
6863 continue;
6864 ptl = huge_pte_lock(h, mm, ptep);
6865
6866 huge_pmd_unshare(mm, vma, &tmp, ptep);
6867 spin_unlock(ptl);
6868 }
6869 flush_hugetlb_tlb_range(vma, start, end);
6870 i_mmap_unlock_write(vma->vm_file->f_mapping);
6871
6872
6873
6874
6875 mmu_notifier_invalidate_range_end(&range);
6876}
6877
6878#ifdef CONFIG_CMA
6879static bool cma_reserve_called __initdata;
6880
6881static int __init cmdline_parse_hugetlb_cma(char *p)
6882{
6883 int nid, count = 0;
6884 unsigned long tmp;
6885 char *s = p;
6886
6887 while (*s) {
6888 if (sscanf(s, "%lu%n", &tmp, &count) != 1)
6889 break;
6890
6891 if (s[count] == ':') {
6892 nid = tmp;
6893 if (nid < 0 || nid >= MAX_NUMNODES)
6894 break;
6895
6896 s += count + 1;
6897 tmp = memparse(s, &s);
6898 hugetlb_cma_size_in_node[nid] = tmp;
6899 hugetlb_cma_size += tmp;
6900
6901
6902
6903
6904
6905 if (*s == ',')
6906 s++;
6907 else
6908 break;
6909 } else {
6910 hugetlb_cma_size = memparse(p, &p);
6911 break;
6912 }
6913 }
6914
6915 return 0;
6916}
6917
6918early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
6919
6920void __init hugetlb_cma_reserve(int order)
6921{
6922 unsigned long size, reserved, per_node;
6923 bool node_specific_cma_alloc = false;
6924 int nid;
6925
6926 cma_reserve_called = true;
6927
6928 if (!hugetlb_cma_size)
6929 return;
6930
6931 for (nid = 0; nid < MAX_NUMNODES; nid++) {
6932 if (hugetlb_cma_size_in_node[nid] == 0)
6933 continue;
6934
6935 if (!node_state(nid, N_ONLINE)) {
6936 pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
6937 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
6938 hugetlb_cma_size_in_node[nid] = 0;
6939 continue;
6940 }
6941
6942 if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
6943 pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
6944 nid, (PAGE_SIZE << order) / SZ_1M);
6945 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
6946 hugetlb_cma_size_in_node[nid] = 0;
6947 } else {
6948 node_specific_cma_alloc = true;
6949 }
6950 }
6951
6952
6953 if (!hugetlb_cma_size)
6954 return;
6955
6956 if (hugetlb_cma_size < (PAGE_SIZE << order)) {
6957 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
6958 (PAGE_SIZE << order) / SZ_1M);
6959 hugetlb_cma_size = 0;
6960 return;
6961 }
6962
6963 if (!node_specific_cma_alloc) {
6964
6965
6966
6967
6968 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
6969 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
6970 hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
6971 }
6972
6973 reserved = 0;
6974 for_each_node_state(nid, N_ONLINE) {
6975 int res;
6976 char name[CMA_MAX_NAME];
6977
6978 if (node_specific_cma_alloc) {
6979 if (hugetlb_cma_size_in_node[nid] == 0)
6980 continue;
6981
6982 size = hugetlb_cma_size_in_node[nid];
6983 } else {
6984 size = min(per_node, hugetlb_cma_size - reserved);
6985 }
6986
6987 size = round_up(size, PAGE_SIZE << order);
6988
6989 snprintf(name, sizeof(name), "hugetlb%d", nid);
6990
6991
6992
6993
6994
6995 res = cma_declare_contiguous_nid(0, size, 0,
6996 PAGE_SIZE << HUGETLB_PAGE_ORDER,
6997 0, false, name,
6998 &hugetlb_cma[nid], nid);
6999 if (res) {
7000 pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
7001 res, nid);
7002 continue;
7003 }
7004
7005 reserved += size;
7006 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
7007 size / SZ_1M, nid);
7008
7009 if (reserved >= hugetlb_cma_size)
7010 break;
7011 }
7012
7013 if (!reserved)
7014
7015
7016
7017
7018 hugetlb_cma_size = 0;
7019}
7020
7021void __init hugetlb_cma_check(void)
7022{
7023 if (!hugetlb_cma_size || cma_reserve_called)
7024 return;
7025
7026 pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
7027}
7028
7029 #endif /* CONFIG_CMA */
7030