// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
#endif
static unsigned long hugetlb_cma_size __initdata;

/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
85
86static inline bool subpool_is_free(struct hugepage_subpool *spool)
87{
88 if (spool->count)
89 return false;
90 if (spool->max_hpages != -1)
91 return spool->used_hpages == 0;
92 if (spool->min_hpages != -1)
93 return spool->rsv_hpages == spool->min_hpages;
94
95 return true;
96}
97
98static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
99 unsigned long irq_flags)
100{
101 spin_unlock_irqrestore(&spool->lock, irq_flags);
102
103
104
105
106 if (subpool_is_free(spool)) {
107 if (spool->min_hpages != -1)
108 hugetlb_acct_memory(spool->hstate,
109 -spool->min_hpages);
110 kfree(spool);
111 }
112}
113
114struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
115 long min_hpages)
116{
117 struct hugepage_subpool *spool;
118
119 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
120 if (!spool)
121 return NULL;
122
123 spin_lock_init(&spool->lock);
124 spool->count = 1;
125 spool->max_hpages = max_hpages;
126 spool->hstate = h;
127 spool->min_hpages = min_hpages;
128
129 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
130 kfree(spool);
131 return NULL;
132 }
133 spool->rsv_hpages = min_hpages;
134
135 return spool;
136}
137
138void hugepage_put_subpool(struct hugepage_subpool *spool)
139{
140 unsigned long flags;
141
142 spin_lock_irqsave(&spool->lock, flags);
143 BUG_ON(!spool->count);
144 spool->count--;
145 unlock_or_release_subpool(spool, flags);
146}
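
/*
 * A minimal usage sketch for the two helpers above (illustrative only, not
 * part of the original source; hugetlbfs mount/unmount is one such caller,
 * and 'h', 'max_hpages' and 'min_hpages' are placeholder values here):
 *
 *	struct hugepage_subpool *spool;
 *
 *	spool = hugepage_new_subpool(h, max_hpages, min_hpages);
 *	if (!spool)
 *		return -ENOMEM;
 *	...
 *	hugepage_put_subpool(spool);
 *
 * The final put frees the subpool, and returns any minimum-size reservation
 * to the global pool via hugetlb_acct_memory(), once the usage and
 * reservation counts allow it.
 */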
147
/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
156static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
157 long delta)
158{
159 long ret = delta;
160
161 if (!spool)
162 return ret;
163
164 spin_lock_irq(&spool->lock);
165
166 if (spool->max_hpages != -1) {
167 if ((spool->used_hpages + delta) <= spool->max_hpages)
168 spool->used_hpages += delta;
169 else {
170 ret = -ENOMEM;
171 goto unlock_ret;
172 }
173 }
174
175
176 if (spool->min_hpages != -1 && spool->rsv_hpages) {
177 if (delta > spool->rsv_hpages) {
178
179
180
181
182 ret = delta - spool->rsv_hpages;
183 spool->rsv_hpages = 0;
184 } else {
185 ret = 0;
186 spool->rsv_hpages -= delta;
187 }
188 }
189
190unlock_ret:
191 spin_unlock_irq(&spool->lock);
192 return ret;
193}
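
/*
 * Worked example for the return value above (illustrative numbers only):
 * with min_hpages = 4 and rsv_hpages = 1, a request of delta = 3 pages is
 * covered by 1 reserved page, so the function returns 3 - 1 = 2, the number
 * of pages the caller must still charge against the global pools via
 * hugetlb_acct_memory().  A request fully covered by rsv_hpages returns 0.
 */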
194
/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
201static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
202 long delta)
203{
204 long ret = delta;
205 unsigned long flags;
206
207 if (!spool)
208 return delta;
209
210 spin_lock_irqsave(&spool->lock, flags);
211
212 if (spool->max_hpages != -1)
213 spool->used_hpages -= delta;
214
215
216 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
217 if (spool->rsv_hpages + delta <= spool->min_hpages)
218 ret = 0;
219 else
220 ret = spool->rsv_hpages + delta - spool->min_hpages;
221
222 spool->rsv_hpages += delta;
223 if (spool->rsv_hpages > spool->min_hpages)
224 spool->rsv_hpages = spool->min_hpages;
225 }
226
227
228
229
230
231 unlock_or_release_subpool(spool, flags);
232
233 return ret;
234}
235
236static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
237{
238 return HUGETLBFS_SB(inode->i_sb)->spool;
239}
240
241static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
242{
243 return subpool_inode(file_inode(vma->vm_file));
244}
245
/*
 * Helper that removes a struct file_region from the cache and returns it
 * for use by the caller.
 */
249static struct file_region *
250get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
251{
252 struct file_region *nrg = NULL;
253
254 VM_BUG_ON(resv->region_cache_count <= 0);
255
256 resv->region_cache_count--;
257 nrg = list_first_entry(&resv->region_cache, struct file_region, link);
258 list_del(&nrg->link);
259
260 nrg->from = from;
261 nrg->to = to;
262
263 return nrg;
264}
265
266static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
267 struct file_region *rg)
268{
269#ifdef CONFIG_CGROUP_HUGETLB
270 nrg->reservation_counter = rg->reservation_counter;
271 nrg->css = rg->css;
272 if (rg->css)
273 css_get(rg->css);
274#endif
275}
276
277
278static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
279 struct hstate *h,
280 struct resv_map *resv,
281 struct file_region *nrg)
282{
283#ifdef CONFIG_CGROUP_HUGETLB
284 if (h_cg) {
285 nrg->reservation_counter =
286 &h_cg->rsvd_hugepage[hstate_index(h)];
287 nrg->css = &h_cg->css;
		/*
		 * The caller holds exactly one h_cg->css reference for the
		 * whole contiguous reservation region, but that region may
		 * already be split across several file_regions.  Take a css
		 * reference for each file_region added here so that every
		 * file_region holds exactly one reference, leaving the
		 * caller's reference untouched.
		 */
298 css_get(&h_cg->css);
299 if (!resv->pages_per_hpage)
300 resv->pages_per_hpage = pages_per_huge_page(h);
301
302
303
304 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
305 } else {
306 nrg->reservation_counter = NULL;
307 nrg->css = NULL;
308 }
309#endif
310}
311
312static void put_uncharge_info(struct file_region *rg)
313{
314#ifdef CONFIG_CGROUP_HUGETLB
315 if (rg->css)
316 css_put(rg->css);
317#endif
318}
319
320static bool has_same_uncharge_info(struct file_region *rg,
321 struct file_region *org)
322{
323#ifdef CONFIG_CGROUP_HUGETLB
324 return rg && org &&
325 rg->reservation_counter == org->reservation_counter &&
326 rg->css == org->css;
327
328#else
329 return true;
330#endif
331}
332
333static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
334{
335 struct file_region *nrg = NULL, *prg = NULL;
336
337 prg = list_prev_entry(rg, link);
338 if (&prg->link != &resv->regions && prg->to == rg->from &&
339 has_same_uncharge_info(prg, rg)) {
340 prg->to = rg->to;
341
342 list_del(&rg->link);
343 put_uncharge_info(rg);
344 kfree(rg);
345
346 rg = prg;
347 }
348
349 nrg = list_next_entry(rg, link);
350 if (&nrg->link != &resv->regions && nrg->from == rg->to &&
351 has_same_uncharge_info(nrg, rg)) {
352 nrg->from = rg->from;
353
354 list_del(&rg->link);
355 put_uncharge_info(rg);
356 kfree(rg);
357 }
358}
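
/*
 * Illustrative example for coalesce_file_region() (not from the original
 * source): if the reserve map already contains [0, 2) and [4, 6) with the
 * same cgroup uncharge info, adding [2, 4) first merges the new entry into
 * its predecessor, giving [0, 4), and then into its successor, leaving a
 * single [0, 6) entry; the absorbed entries are unlinked and kfree()d.
 */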
359
360static inline long
361hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
362 long to, struct hstate *h, struct hugetlb_cgroup *cg,
363 long *regions_needed)
364{
365 struct file_region *nrg;
366
367 if (!regions_needed) {
368 nrg = get_file_region_entry_from_cache(map, from, to);
369 record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
370 list_add(&nrg->link, rg->link.prev);
371 coalesce_file_region(map, nrg);
372 } else
373 *regions_needed += 1;
374
375 return to - from;
376}
377
/*
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list. And regions_needed will
 * indicate the number of file_regions needed in the cache to carry out to add
 * the regions for this range.
 */
386static long add_reservation_in_range(struct resv_map *resv, long f, long t,
387 struct hugetlb_cgroup *h_cg,
388 struct hstate *h, long *regions_needed)
389{
390 long add = 0;
391 struct list_head *head = &resv->regions;
392 long last_accounted_offset = f;
393 struct file_region *rg = NULL, *trg = NULL;
394
395 if (regions_needed)
396 *regions_needed = 0;
397
398
399
400
401
402 list_for_each_entry_safe(rg, trg, head, link) {
403
404 if (rg->from < f) {
405
406
407
408 if (rg->to > last_accounted_offset)
409 last_accounted_offset = rg->to;
410 continue;
411 }
412
413
414
415
416 if (rg->from >= t)
417 break;
418
419
420
421
422 if (rg->from > last_accounted_offset)
423 add += hugetlb_resv_map_add(resv, rg,
424 last_accounted_offset,
425 rg->from, h, h_cg,
426 regions_needed);
427
428 last_accounted_offset = rg->to;
429 }
430
431
432
433
434 if (last_accounted_offset < t)
435 add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
436 t, h, h_cg, regions_needed);
437
438 VM_BUG_ON(add < 0);
439 return add;
440}
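
/*
 * Worked example for add_reservation_in_range() (illustrative): with existing
 * regions [2, 4) and [7, 9) in the map, a call for [f, t) = [0, 10) adds the
 * three gaps [0, 2), [4, 7) and [9, 10) and returns 6, the number of pages
 * newly covered.  In the counting pass (regions_needed != NULL) nothing is
 * modified and *regions_needed is set to 3.
 */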
441
/*
 * Must be called with resv->lock acquired. Will drop lock to allocate entries.
 */
444static int allocate_file_region_entries(struct resv_map *resv,
445 int regions_needed)
446 __must_hold(&resv->lock)
447{
448 struct list_head allocated_regions;
449 int to_allocate = 0, i = 0;
450 struct file_region *trg = NULL, *rg = NULL;
451
452 VM_BUG_ON(regions_needed < 0);
453
454 INIT_LIST_HEAD(&allocated_regions);
455
456
457
458
459
460
461
462
463
464
465 while (resv->region_cache_count <
466 (resv->adds_in_progress + regions_needed)) {
467 to_allocate = resv->adds_in_progress + regions_needed -
468 resv->region_cache_count;
469
470
471
472
473
474 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
475
476 spin_unlock(&resv->lock);
477 for (i = 0; i < to_allocate; i++) {
478 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
479 if (!trg)
480 goto out_of_memory;
481 list_add(&trg->link, &allocated_regions);
482 }
483
484 spin_lock(&resv->lock);
485
486 list_splice(&allocated_regions, &resv->region_cache);
487 resv->region_cache_count += to_allocate;
488 }
489
490 return 0;
491
492out_of_memory:
493 list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
494 list_del(&rg->link);
495 kfree(rg);
496 }
497 return -ENOMEM;
498}
499
/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will
 * not have sufficient entries due to races with other code doing region_add
 * or region_del.  The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is
 * greater than or equal to zero.  If file_region entries needed to be
 * allocated for this operation and we were not able to allocate, it returns
 * -ENOMEM.  region_add of regions of length 1 never allocates file_regions
 * and cannot fail; region_chg will always allocate at least 1 entry and a
 * region_add for 1 page will only require at most 1 entry.
 */
517static long region_add(struct resv_map *resv, long f, long t,
518 long in_regions_needed, struct hstate *h,
519 struct hugetlb_cgroup *h_cg)
520{
521 long add = 0, actual_regions_needed = 0;
522
523 spin_lock(&resv->lock);
524retry:
525
526
527 add_reservation_in_range(resv, f, t, NULL, NULL,
528 &actual_regions_needed);
529
530
531
532
533
534
535
536
537
538
539 if (actual_regions_needed > in_regions_needed &&
540 resv->region_cache_count <
541 resv->adds_in_progress +
542 (actual_regions_needed - in_regions_needed)) {
543
544
545
546 VM_BUG_ON(t - f <= 1);
547
548 if (allocate_file_region_entries(
549 resv, actual_regions_needed - in_regions_needed)) {
550 return -ENOMEM;
551 }
552
553 goto retry;
554 }
555
556 add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
557
558 resv->adds_in_progress -= in_regions_needed;
559
560 spin_unlock(&resv->lock);
561 return add;
562}
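
/*
 * Sketch of the intended calling protocol (illustrative; hugetlb_reserve_pages
 * and the per-fault reservation helpers later in this file follow this
 * pattern):
 *
 *	chg = region_chg(resv, f, t, &regions_needed);	// count + prealloc
 *	if (chg < 0)
 *		goto fail;
 *	...charge subpool/cgroup for 'chg' pages...
 *	if (charging failed) {
 *		region_abort(resv, f, t, regions_needed);	// undo bookkeeping
 *		goto fail;
 *	}
 *	add = region_add(resv, f, t, regions_needed, h, h_cg);	// commit
 *
 * region_add() may return more than 'chg' if the map changed between the two
 * calls; callers must notice and reconcile that difference themselves.
 */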
563
/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  A number of new file_region structures is added to the cache as a
 * placeholder, for the subsequent region_add call to use.  At least 1
 * file_region structure is added.
 *
 * out_regions_needed is the number of regions added to the
 * resv->adds_in_progress.  This value needs to be provided to a follow up
 * call to region_add or region_abort for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater or equal to
 * zero.  -ENOMEM is returned if a new file_region structure or cache entry
 * is needed and can not be allocated.
 */
584static long region_chg(struct resv_map *resv, long f, long t,
585 long *out_regions_needed)
586{
587 long chg = 0;
588
589 spin_lock(&resv->lock);
590
591
592 chg = add_reservation_in_range(resv, f, t, NULL, NULL,
593 out_regions_needed);
594
595 if (*out_regions_needed == 0)
596 *out_regions_needed = 1;
597
598 if (allocate_file_region_entries(resv, *out_regions_needed))
599 return -ENOMEM;
600
601 resv->adds_in_progress += *out_regions_needed;
602
603 spin_unlock(&resv->lock);
604 return chg;
605}
606
/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter.  regions_needed
 * is the value returned by the region_chg call, it is used to decrement
 * the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * it is always passed the range matching the region/reserve map functions.
 */
620static void region_abort(struct resv_map *resv, long f, long t,
621 long regions_needed)
622{
623 spin_lock(&resv->lock);
624 VM_BUG_ON(!resv->region_cache_count);
625 resv->adds_in_progress -= regions_needed;
626 spin_unlock(&resv->lock);
627}
628
/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split and a new region descriptor
 * must be allocated, the allocation may fail and -ENOMEM will be
 * returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 */
643static long region_del(struct resv_map *resv, long f, long t)
644{
645 struct list_head *head = &resv->regions;
646 struct file_region *rg, *trg;
647 struct file_region *nrg = NULL;
648 long del = 0;
649
650retry:
651 spin_lock(&resv->lock);
652 list_for_each_entry_safe(rg, trg, head, link) {
653
654
655
656
657
658
659
660 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
661 continue;
662
663 if (rg->from >= t)
664 break;
665
666 if (f > rg->from && t < rg->to) {
667
668
669
670
671 if (!nrg &&
672 resv->region_cache_count > resv->adds_in_progress) {
673 nrg = list_first_entry(&resv->region_cache,
674 struct file_region,
675 link);
676 list_del(&nrg->link);
677 resv->region_cache_count--;
678 }
679
680 if (!nrg) {
681 spin_unlock(&resv->lock);
682 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
683 if (!nrg)
684 return -ENOMEM;
685 goto retry;
686 }
687
688 del += t - f;
689 hugetlb_cgroup_uncharge_file_region(
690 resv, rg, t - f, false);
691
692
693 nrg->from = t;
694 nrg->to = rg->to;
695
696 copy_hugetlb_cgroup_uncharge_info(nrg, rg);
697
698 INIT_LIST_HEAD(&nrg->link);
699
700
701 rg->to = f;
702
703 list_add(&nrg->link, &rg->link);
704 nrg = NULL;
705 break;
706 }
707
708 if (f <= rg->from && t >= rg->to) {
709 del += rg->to - rg->from;
710 hugetlb_cgroup_uncharge_file_region(resv, rg,
711 rg->to - rg->from, true);
712 list_del(&rg->link);
713 kfree(rg);
714 continue;
715 }
716
717 if (f <= rg->from) {
718 hugetlb_cgroup_uncharge_file_region(resv, rg,
719 t - rg->from, false);
720
721 del += t - rg->from;
722 rg->from = t;
723 } else {
724 hugetlb_cgroup_uncharge_file_region(resv, rg,
725 rg->to - f, false);
726
727 del += rg->to - f;
728 rg->to = f;
729 }
730 }
731
732 spin_unlock(&resv->lock);
733 kfree(nrg);
734 return del;
735}
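
/*
 * Illustrative example for region_del() (not from the original source):
 * deleting [3, 5) from a map that contains [0, 10) splits that entry into
 * [0, 3) and [5, 10), consuming one spare file_region for the second half
 * and returning 2.  Deleting with t == LONG_MAX only truncates or removes
 * whole regions, so it never needs an allocation and cannot return -ENOMEM.
 */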
736
/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was freed and
 * removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the reserve map entry which could not be deleted will
 * appear as a "reserved" entry instead of simply dangling with incorrect
 * counts.
 */
746void hugetlb_fix_reserve_counts(struct inode *inode)
747{
748 struct hugepage_subpool *spool = subpool_inode(inode);
749 long rsv_adjust;
750 bool reserved = false;
751
752 rsv_adjust = hugepage_subpool_get_pages(spool, 1);
753 if (rsv_adjust > 0) {
754 struct hstate *h = hstate_inode(inode);
755
756 if (!hugetlb_acct_memory(h, 1))
757 reserved = true;
758 } else if (!rsv_adjust) {
759 reserved = true;
760 }
761
762 if (!reserved)
763 pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
764}
765
/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
770static long region_count(struct resv_map *resv, long f, long t)
771{
772 struct list_head *head = &resv->regions;
773 struct file_region *rg;
774 long chg = 0;
775
776 spin_lock(&resv->lock);
777
778 list_for_each_entry(rg, head, link) {
779 long seg_from;
780 long seg_to;
781
782 if (rg->to <= f)
783 continue;
784 if (rg->from >= t)
785 break;
786
787 seg_from = max(rg->from, f);
788 seg_to = min(rg->to, t);
789
790 chg += seg_to - seg_from;
791 }
792 spin_unlock(&resv->lock);
793
794 return chg;
795}
796
/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
801static pgoff_t vma_hugecache_offset(struct hstate *h,
802 struct vm_area_struct *vma, unsigned long address)
803{
804 return ((address - vma->vm_start) >> huge_page_shift(h)) +
805 (vma->vm_pgoff >> huge_page_order(h));
806}
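
/*
 * Illustrative example (assuming 4 KB base pages): for a 2 MB hstate
 * (huge_page_shift == 21, huge_page_order == 9), a VMA with vm_pgoff == 1024
 * (i.e. file offset 4 MB) maps the address vm_start + 6 MB to huge-page
 * index (6 MB >> 21) + (1024 >> 9) = 3 + 2 = 5 within the file.
 */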
807
808pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
809 unsigned long address)
810{
811 return vma_hugecache_offset(hstate_vma(vma), vma, address);
812}
813EXPORT_SYMBOL_GPL(linear_hugepage_index);
814
/*
 * Return the size of the pages allocated when backing a VMA.  For
 * hugetlb VMAs this comes from the vma's vm_ops->pagesize() method.
 */
819unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
820{
821 if (vma->vm_ops && vma->vm_ops->pagesize)
822 return vma->vm_ops->pagesize(vma);
823 return PAGE_SIZE;
824}
825EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
826
/*
 * Return the page size being used by the MMU to back a VMA.  In the
 * majority of cases, the page size used by the kernel matches the MMU
 * size.  On architectures where it differs, an architecture-specific
 * 'strong' version of this symbol is required.
 */
833__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
834{
835 return vma_kernel_pagesize(vma);
836}
837
/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping.  Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at
 * fork(), the reserve counters are updated with the hugetlb_lock held.
 * It is safe to reset the VMA at fork() time as it is not in use yet and
 * there is no chance of the global counters getting corrupted as a result
 * of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and it persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have
 * consumed reservations.
 */
866static unsigned long get_vma_private_data(struct vm_area_struct *vma)
867{
868 return (unsigned long)vma->vm_private_data;
869}
870
871static void set_vma_private_data(struct vm_area_struct *vma,
872 unsigned long value)
873{
874 vma->vm_private_data = (void *)value;
875}
876
877static void
878resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
879 struct hugetlb_cgroup *h_cg,
880 struct hstate *h)
881{
882#ifdef CONFIG_CGROUP_HUGETLB
883 if (!h_cg || !h) {
884 resv_map->reservation_counter = NULL;
885 resv_map->pages_per_hpage = 0;
886 resv_map->css = NULL;
887 } else {
888 resv_map->reservation_counter =
889 &h_cg->rsvd_hugepage[hstate_index(h)];
890 resv_map->pages_per_hpage = pages_per_huge_page(h);
891 resv_map->css = &h_cg->css;
892 }
893#endif
894}
895
896struct resv_map *resv_map_alloc(void)
897{
898 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
899 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
900
901 if (!resv_map || !rg) {
902 kfree(resv_map);
903 kfree(rg);
904 return NULL;
905 }
906
907 kref_init(&resv_map->refs);
908 spin_lock_init(&resv_map->lock);
909 INIT_LIST_HEAD(&resv_map->regions);
910
911 resv_map->adds_in_progress = 0;
912
913
914
915
916
917
918 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
919
920 INIT_LIST_HEAD(&resv_map->region_cache);
921 list_add(&rg->link, &resv_map->region_cache);
922 resv_map->region_cache_count = 1;
923
924 return resv_map;
925}
926
927void resv_map_release(struct kref *ref)
928{
929 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
930 struct list_head *head = &resv_map->region_cache;
931 struct file_region *rg, *trg;
932
933
934 region_del(resv_map, 0, LONG_MAX);
935
936
937 list_for_each_entry_safe(rg, trg, head, link) {
938 list_del(&rg->link);
939 kfree(rg);
940 }
941
942 VM_BUG_ON(resv_map->adds_in_progress);
943
944 kfree(resv_map);
945}
946
947static inline struct resv_map *inode_resv_map(struct inode *inode)
948{
	/*
	 * At inode evict time, i_mapping may not point to the original
	 * address space within the inode.  This original address space
	 * contains the pointer to the resv_map.  So, always use the
	 * address space embedded within the inode.
	 * The VERY common case is inode->mapping == &inode->i_data but,
	 * this may not be true for device special inodes.
	 */
957 return (struct resv_map *)(&inode->i_data)->private_data;
958}
959
960static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
961{
962 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
963 if (vma->vm_flags & VM_MAYSHARE) {
964 struct address_space *mapping = vma->vm_file->f_mapping;
965 struct inode *inode = mapping->host;
966
967 return inode_resv_map(inode);
968
969 } else {
970 return (struct resv_map *)(get_vma_private_data(vma) &
971 ~HPAGE_RESV_MASK);
972 }
973}
974
975static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
976{
977 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
978 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
979
980 set_vma_private_data(vma, (get_vma_private_data(vma) &
981 HPAGE_RESV_MASK) | (unsigned long)map);
982}
983
984static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
985{
986 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
987 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
988
989 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
990}
991
992static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
993{
994 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
995
996 return (get_vma_private_data(vma) & flag) != 0;
997}
998
999
1000void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
1001{
1002 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
1003 if (!(vma->vm_flags & VM_MAYSHARE))
1004 vma->vm_private_data = (void *)0;
1005}
1006
1007
1008static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
1009{
1010 if (vma->vm_flags & VM_NORESERVE) {
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
1021 return true;
1022 else
1023 return false;
1024 }
1025
1026
1027 if (vma->vm_flags & VM_MAYSHARE) {
1028
1029
1030
1031
1032
1033
1034
1035 if (chg)
1036 return false;
1037 else
1038 return true;
1039 }
1040
1041
1042
1043
1044
1045 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061 if (chg)
1062 return false;
1063 else
1064 return true;
1065 }
1066
1067 return false;
1068}
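
/*
 * Summary of the 'chg' convention above (a reading of the code, not from the
 * original source): chg is the value previously returned by
 * vma_needs_reservation(); chg == 0 means a reservation already exists for
 * this offset, chg > 0 means none does.  So a shared mapping "has reserves"
 * only when chg == 0, while a MAP_PRIVATE mapping additionally requires the
 * HPAGE_RESV_OWNER flag to be set.
 */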
1069
1070static void enqueue_huge_page(struct hstate *h, struct page *page)
1071{
1072 int nid = page_to_nid(page);
1073
1074 lockdep_assert_held(&hugetlb_lock);
1075 list_move(&page->lru, &h->hugepage_freelists[nid]);
1076 h->free_huge_pages++;
1077 h->free_huge_pages_node[nid]++;
1078 SetHPageFreed(page);
1079}
1080
1081static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
1082{
1083 struct page *page;
1084 bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1085
1086 lockdep_assert_held(&hugetlb_lock);
1087 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
1088 if (pin && !is_pinnable_page(page))
1089 continue;
1090
1091 if (PageHWPoison(page))
1092 continue;
1093
1094 list_move(&page->lru, &h->hugepage_activelist);
1095 set_page_refcounted(page);
1096 ClearHPageFreed(page);
1097 h->free_huge_pages--;
1098 h->free_huge_pages_node[nid]--;
1099 return page;
1100 }
1101
1102 return NULL;
1103}
1104
1105static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
1106 nodemask_t *nmask)
1107{
1108 unsigned int cpuset_mems_cookie;
1109 struct zonelist *zonelist;
1110 struct zone *zone;
1111 struct zoneref *z;
1112 int node = NUMA_NO_NODE;
1113
1114 zonelist = node_zonelist(nid, gfp_mask);
1115
1116retry_cpuset:
1117 cpuset_mems_cookie = read_mems_allowed_begin();
1118 for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
1119 struct page *page;
1120
1121 if (!cpuset_zone_allowed(zone, gfp_mask))
1122 continue;
1123
1124
1125
1126
1127 if (zone_to_nid(zone) == node)
1128 continue;
1129 node = zone_to_nid(zone);
1130
1131 page = dequeue_huge_page_node_exact(h, node);
1132 if (page)
1133 return page;
1134 }
1135 if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
1136 goto retry_cpuset;
1137
1138 return NULL;
1139}
1140
1141static struct page *dequeue_huge_page_vma(struct hstate *h,
1142 struct vm_area_struct *vma,
1143 unsigned long address, int avoid_reserve,
1144 long chg)
1145{
1146 struct page *page;
1147 struct mempolicy *mpol;
1148 gfp_t gfp_mask;
1149 nodemask_t *nodemask;
1150 int nid;
1151
1152
1153
1154
1155
1156
1157 if (!vma_has_reserves(vma, chg) &&
1158 h->free_huge_pages - h->resv_huge_pages == 0)
1159 goto err;
1160
1161
1162 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
1163 goto err;
1164
1165 gfp_mask = htlb_alloc_mask(h);
1166 nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
1167 page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
1168 if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
1169 SetHPageRestoreReserve(page);
1170 h->resv_huge_pages--;
1171 }
1172
1173 mpol_cond_put(mpol);
1174 return page;
1175
1176err:
1177 return NULL;
1178}
1179
/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
1187static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
1188{
1189 nid = next_node_in(nid, *nodes_allowed);
1190 VM_BUG_ON(nid >= MAX_NUMNODES);
1191
1192 return nid;
1193}
1194
1195static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
1196{
1197 if (!node_isset(nid, *nodes_allowed))
1198 nid = next_node_allowed(nid, nodes_allowed);
1199 return nid;
1200}
1201
/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
1208static int hstate_next_node_to_alloc(struct hstate *h,
1209 nodemask_t *nodes_allowed)
1210{
1211 int nid;
1212
1213 VM_BUG_ON(!nodes_allowed);
1214
1215 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
1216 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1217
1218 return nid;
1219}
1220
/*
 * helper for remove_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
1227static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1228{
1229 int nid;
1230
1231 VM_BUG_ON(!nodes_allowed);
1232
1233 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1234 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1235
1236 return nid;
1237}
1238
1239#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
1240 for (nr_nodes = nodes_weight(*mask); \
1241 nr_nodes > 0 && \
1242 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
1243 nr_nodes--)
1244
1245#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
1246 for (nr_nodes = nodes_weight(*mask); \
1247 nr_nodes > 0 && \
1248 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
1249 nr_nodes--)
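
/*
 * Minimal usage sketch for the iterators above (illustrative): the pool
 * grow/shrink paths walk the allowed nodes round-robin so pages are spread
 * evenly across nodes, e.g.:
 *
 *	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
 *		page = alloc_fresh_huge_page(h, gfp_mask, node,
 *					     nodes_allowed, NULL);
 *		if (page)
 *			break;
 *	}
 *
 * Each iteration advances h->next_nid_to_alloc, so a failure on one node
 * retries the next allowed node, and nr_nodes bounds the walk to one pass.
 */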
1250
1251#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
1252static void destroy_compound_gigantic_page(struct page *page,
1253 unsigned int order)
1254{
1255 int i;
1256 int nr_pages = 1 << order;
1257 struct page *p = page + 1;
1258
1259 atomic_set(compound_mapcount_ptr(page), 0);
1260 atomic_set(compound_pincount_ptr(page), 0);
1261
1262 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1263 clear_compound_head(p);
1264 set_page_refcounted(p);
1265 }
1266
1267 set_compound_order(page, 0);
1268 page[1].compound_nr = 0;
1269 __ClearPageHead(page);
1270}
1271
1272static void free_gigantic_page(struct page *page, unsigned int order)
1273{
1274
1275
1276
1277
1278#ifdef CONFIG_CMA
1279 if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
1280 return;
1281#endif
1282
1283 free_contig_range(page_to_pfn(page), 1 << order);
1284}
1285
1286#ifdef CONFIG_CONTIG_ALLOC
1287static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1288 int nid, nodemask_t *nodemask)
1289{
1290 unsigned long nr_pages = pages_per_huge_page(h);
1291 if (nid == NUMA_NO_NODE)
1292 nid = numa_mem_id();
1293
1294#ifdef CONFIG_CMA
1295 {
1296 struct page *page;
1297 int node;
1298
1299 if (hugetlb_cma[nid]) {
1300 page = cma_alloc(hugetlb_cma[nid], nr_pages,
1301 huge_page_order(h), true);
1302 if (page)
1303 return page;
1304 }
1305
1306 if (!(gfp_mask & __GFP_THISNODE)) {
1307 for_each_node_mask(node, *nodemask) {
1308 if (node == nid || !hugetlb_cma[node])
1309 continue;
1310
1311 page = cma_alloc(hugetlb_cma[node], nr_pages,
1312 huge_page_order(h), true);
1313 if (page)
1314 return page;
1315 }
1316 }
1317 }
1318#endif
1319
1320 return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
1321}
1322
1323#else
1324static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1325 int nid, nodemask_t *nodemask)
1326{
1327 return NULL;
1328}
1329#endif
1330
1331#else
1332static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
1333 int nid, nodemask_t *nodemask)
1334{
1335 return NULL;
1336}
1337static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1338static inline void destroy_compound_gigantic_page(struct page *page,
1339 unsigned int order) { }
1340#endif
1341
1342
1343
1344
1345
1346
1347
1348static void remove_hugetlb_page(struct hstate *h, struct page *page,
1349 bool adjust_surplus)
1350{
1351 int nid = page_to_nid(page);
1352
1353 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1354 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
1355
1356 lockdep_assert_held(&hugetlb_lock);
1357 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1358 return;
1359
1360 list_del(&page->lru);
1361
1362 if (HPageFreed(page)) {
1363 h->free_huge_pages--;
1364 h->free_huge_pages_node[nid]--;
1365 }
1366 if (adjust_surplus) {
1367 h->surplus_huge_pages--;
1368 h->surplus_huge_pages_node[nid]--;
1369 }
1370
1371 set_page_refcounted(page);
1372 set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
1373
1374 h->nr_huge_pages--;
1375 h->nr_huge_pages_node[nid]--;
1376}
1377
1378static void add_hugetlb_page(struct hstate *h, struct page *page,
1379 bool adjust_surplus)
1380{
1381 int zeroed;
1382 int nid = page_to_nid(page);
1383
1384 VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
1385
1386 lockdep_assert_held(&hugetlb_lock);
1387
1388 INIT_LIST_HEAD(&page->lru);
1389 h->nr_huge_pages++;
1390 h->nr_huge_pages_node[nid]++;
1391
1392 if (adjust_surplus) {
1393 h->surplus_huge_pages++;
1394 h->surplus_huge_pages_node[nid]++;
1395 }
1396
1397 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1398 set_page_private(page, 0);
1399 SetHPageVmemmapOptimized(page);
1400
1401
1402
1403
1404
1405 zeroed = put_page_testzero(page);
1406 VM_BUG_ON_PAGE(!zeroed, page);
1407 arch_clear_hugepage_flags(page);
1408 enqueue_huge_page(h, page);
1409}
1410
1411static void __update_and_free_page(struct hstate *h, struct page *page)
1412{
1413 int i;
1414 struct page *subpage = page;
1415
1416 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
1417 return;
1418
1419 if (alloc_huge_page_vmemmap(h, page)) {
1420 spin_lock_irq(&hugetlb_lock);
1421
1422
1423
1424
1425
1426 add_hugetlb_page(h, page, true);
1427 spin_unlock_irq(&hugetlb_lock);
1428 return;
1429 }
1430
1431 for (i = 0; i < pages_per_huge_page(h);
1432 i++, subpage = mem_map_next(subpage, page, i)) {
1433 subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1434 1 << PG_referenced | 1 << PG_dirty |
1435 1 << PG_active | 1 << PG_private |
1436 1 << PG_writeback);
1437 }
1438 if (hstate_is_gigantic(h)) {
1439 destroy_compound_gigantic_page(page, huge_page_order(h));
1440 free_gigantic_page(page, huge_page_order(h));
1441 } else {
1442 __free_pages(page, huge_page_order(h));
1443 }
1444}
1445
/*
 * update_and_free_page() can be called under any context, so we cannot use
 * GFP_KERNEL to allocate vmemmap pages.  Instead, defer the actual freeing
 * to a workqueue so the vmemmap allocation does not have to use GFP_ATOMIC.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to be
 * freed and frees them one-by-one.  As the page->mapping pointer is going
 * to be cleared in free_hpage_workfn() anyway, it is reused as the
 * llist_node of a lockless linked list of huge pages to be freed.
 */
1457static LLIST_HEAD(hpage_freelist);
1458
1459static void free_hpage_workfn(struct work_struct *work)
1460{
1461 struct llist_node *node;
1462
1463 node = llist_del_all(&hpage_freelist);
1464
1465 while (node) {
1466 struct page *page;
1467 struct hstate *h;
1468
1469 page = container_of((struct address_space **)node,
1470 struct page, mapping);
1471 node = node->next;
1472 page->mapping = NULL;
1473
1474
1475
1476
1477
1478
1479 h = size_to_hstate(page_size(page));
1480
1481 __update_and_free_page(h, page);
1482
1483 cond_resched();
1484 }
1485}
1486static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
1487
1488static inline void flush_free_hpage_work(struct hstate *h)
1489{
1490 if (free_vmemmap_pages_per_hpage(h))
1491 flush_work(&free_hpage_work);
1492}
1493
1494static void update_and_free_page(struct hstate *h, struct page *page,
1495 bool atomic)
1496{
1497 if (!HPageVmemmapOptimized(page) || !atomic) {
1498 __update_and_free_page(h, page);
1499 return;
1500 }
1501
1502
1503
1504
1505
1506
1507
1508
1509 if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
1510 schedule_work(&free_hpage_work);
1511}
1512
1513static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
1514{
1515 struct page *page, *t_page;
1516
1517 list_for_each_entry_safe(page, t_page, list, lru) {
1518 update_and_free_page(h, page, false);
1519 cond_resched();
1520 }
1521}
1522
1523struct hstate *size_to_hstate(unsigned long size)
1524{
1525 struct hstate *h;
1526
1527 for_each_hstate(h) {
1528 if (huge_page_size(h) == size)
1529 return h;
1530 }
1531 return NULL;
1532}
1533
1534void free_huge_page(struct page *page)
1535{
1536
1537
1538
1539
1540 struct hstate *h = page_hstate(page);
1541 int nid = page_to_nid(page);
1542 struct hugepage_subpool *spool = hugetlb_page_subpool(page);
1543 bool restore_reserve;
1544 unsigned long flags;
1545
1546 VM_BUG_ON_PAGE(page_count(page), page);
1547 VM_BUG_ON_PAGE(page_mapcount(page), page);
1548
1549 hugetlb_set_page_subpool(page, NULL);
1550 page->mapping = NULL;
1551 restore_reserve = HPageRestoreReserve(page);
1552 ClearHPageRestoreReserve(page);
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562 if (!restore_reserve) {
1563
1564
1565
1566
1567
1568
1569 if (hugepage_subpool_put_pages(spool, 1) == 0)
1570 restore_reserve = true;
1571 }
1572
1573 spin_lock_irqsave(&hugetlb_lock, flags);
1574 ClearHPageMigratable(page);
1575 hugetlb_cgroup_uncharge_page(hstate_index(h),
1576 pages_per_huge_page(h), page);
1577 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
1578 pages_per_huge_page(h), page);
1579 if (restore_reserve)
1580 h->resv_huge_pages++;
1581
1582 if (HPageTemporary(page)) {
1583 remove_hugetlb_page(h, page, false);
1584 spin_unlock_irqrestore(&hugetlb_lock, flags);
1585 update_and_free_page(h, page, true);
1586 } else if (h->surplus_huge_pages_node[nid]) {
1587
1588 remove_hugetlb_page(h, page, true);
1589 spin_unlock_irqrestore(&hugetlb_lock, flags);
1590 update_and_free_page(h, page, true);
1591 } else {
1592 arch_clear_hugepage_flags(page);
1593 enqueue_huge_page(h, page);
1594 spin_unlock_irqrestore(&hugetlb_lock, flags);
1595 }
1596}
1597
1598
1599
1600
1601static void __prep_account_new_huge_page(struct hstate *h, int nid)
1602{
1603 lockdep_assert_held(&hugetlb_lock);
1604 h->nr_huge_pages++;
1605 h->nr_huge_pages_node[nid]++;
1606}
1607
1608static void __prep_new_huge_page(struct hstate *h, struct page *page)
1609{
1610 free_huge_page_vmemmap(h, page);
1611 INIT_LIST_HEAD(&page->lru);
1612 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1613 hugetlb_set_page_subpool(page, NULL);
1614 set_hugetlb_cgroup(page, NULL);
1615 set_hugetlb_cgroup_rsvd(page, NULL);
1616}
1617
1618static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1619{
1620 __prep_new_huge_page(h, page);
1621 spin_lock_irq(&hugetlb_lock);
1622 __prep_account_new_huge_page(h, nid);
1623 spin_unlock_irq(&hugetlb_lock);
1624}
1625
1626static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
1627{
1628 int i, j;
1629 int nr_pages = 1 << order;
1630 struct page *p = page + 1;
1631
1632
1633 set_compound_order(page, order);
1634 __ClearPageReserved(page);
1635 __SetPageHead(page);
1636 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649 __ClearPageReserved(p);
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665 if (!page_ref_freeze(p, 1)) {
1666 pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
1667 synchronize_rcu();
1668 if (!page_ref_freeze(p, 1))
1669 goto out_error;
1670 }
1671 set_page_count(p, 0);
1672 set_compound_head(p, page);
1673 }
1674 atomic_set(compound_mapcount_ptr(page), -1);
1675 atomic_set(compound_pincount_ptr(page), 0);
1676 return true;
1677
1678out_error:
1679
1680 p = page + 1;
1681 for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
1682 clear_compound_head(p);
1683 set_page_refcounted(p);
1684 }
1685
1686 for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
1687 __ClearPageReserved(p);
1688 set_compound_order(page, 0);
1689 page[1].compound_nr = 0;
1690 __ClearPageHead(page);
1691 return false;
1692}
1693
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
1699int PageHuge(struct page *page)
1700{
1701 if (!PageCompound(page))
1702 return 0;
1703
1704 page = compound_head(page);
1705 return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
1706}
1707EXPORT_SYMBOL_GPL(PageHuge);
1708
/*
 * PageHeadHuge() only returns true for hugetlbfs head pages, but not for
 * normal or transparent huge pages.
 */
1713int PageHeadHuge(struct page *page_head)
1714{
1715 if (!PageHead(page_head))
1716 return 0;
1717
1718 return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
1719}
1720
/*
 * Find and lock the address space (mapping) in write mode.
 *
 * Due to lock ordering we can only trylock the i_mmap_rwsem here; if it
 * cannot be taken immediately, or the page has no mapping, return NULL
 * to the caller.
 */
1728struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
1729{
1730 struct address_space *mapping = page_mapping(hpage);
1731
1732 if (!mapping)
1733 return mapping;
1734
1735 if (i_mmap_trylock_write(mapping))
1736 return mapping;
1737
1738 return NULL;
1739}
1740
1741pgoff_t hugetlb_basepage_index(struct page *page)
1742{
1743 struct page *page_head = compound_head(page);
1744 pgoff_t index = page_index(page_head);
1745 unsigned long compound_idx;
1746
1747 if (compound_order(page_head) >= MAX_ORDER)
1748 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
1749 else
1750 compound_idx = page - page_head;
1751
1752 return (index << compound_order(page_head)) + compound_idx;
1753}
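
/*
 * Illustrative example: for a 2 MB huge page (order 9) at file index 3, the
 * base page at offset 17 inside it has basepage index (3 << 9) + 17 = 1553.
 * The pfn-based arithmetic is only used for compound pages of order >=
 * MAX_ORDER, where mem_map may not be virtually contiguous across the whole
 * page, making plain pointer subtraction unsafe.
 */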
1754
1755static struct page *alloc_buddy_huge_page(struct hstate *h,
1756 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1757 nodemask_t *node_alloc_noretry)
1758{
1759 int order = huge_page_order(h);
1760 struct page *page;
1761 bool alloc_try_hard = true;
1762
1763
1764
1765
1766
1767
1768
1769
1770 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1771 alloc_try_hard = false;
1772 gfp_mask |= __GFP_COMP|__GFP_NOWARN;
1773 if (alloc_try_hard)
1774 gfp_mask |= __GFP_RETRY_MAYFAIL;
1775 if (nid == NUMA_NO_NODE)
1776 nid = numa_mem_id();
1777 page = __alloc_pages(gfp_mask, order, nid, nmask);
1778 if (page)
1779 __count_vm_event(HTLB_BUDDY_PGALLOC);
1780 else
1781 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1782
1783
1784
1785
1786
1787
1788 if (node_alloc_noretry && page && !alloc_try_hard)
1789 node_clear(nid, *node_alloc_noretry);
1790
1791
1792
1793
1794
1795
1796 if (node_alloc_noretry && !page && alloc_try_hard)
1797 node_set(nid, *node_alloc_noretry);
1798
1799 return page;
1800}
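
/*
 * Note on node_alloc_noretry (a reading of the logic above, not from the
 * original source): the nodemask acts as a per-call memo of nodes where a
 * __GFP_RETRY_MAYFAIL attempt has already failed.  Subsequent allocations on
 * such a node drop the "try hard" flag so bulk pool allocations fail fast
 * instead of repeatedly triggering reclaim/compaction, and a later success
 * on that node clears the bit again.
 */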
1801
1802
1803
1804
1805
1806static struct page *alloc_fresh_huge_page(struct hstate *h,
1807 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1808 nodemask_t *node_alloc_noretry)
1809{
1810 struct page *page;
1811 bool retry = false;
1812
1813retry:
1814 if (hstate_is_gigantic(h))
1815 page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
1816 else
1817 page = alloc_buddy_huge_page(h, gfp_mask,
1818 nid, nmask, node_alloc_noretry);
1819 if (!page)
1820 return NULL;
1821
1822 if (hstate_is_gigantic(h)) {
1823 if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
1824
1825
1826
1827
1828 free_gigantic_page(page, huge_page_order(h));
1829 if (!retry) {
1830 retry = true;
1831 goto retry;
1832 }
1833 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
1834 return NULL;
1835 }
1836 }
1837 prep_new_huge_page(h, page, page_to_nid(page));
1838
1839 return page;
1840}
1841
1842
1843
1844
1845
1846static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
1847 nodemask_t *node_alloc_noretry)
1848{
1849 struct page *page;
1850 int nr_nodes, node;
1851 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
1852
1853 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1854 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
1855 node_alloc_noretry);
1856 if (page)
1857 break;
1858 }
1859
1860 if (!page)
1861 return 0;
1862
1863 put_page(page);
1864
1865 return 1;
1866}
1867
1868
1869
1870
1871
1872
1873
1874
1875static struct page *remove_pool_huge_page(struct hstate *h,
1876 nodemask_t *nodes_allowed,
1877 bool acct_surplus)
1878{
1879 int nr_nodes, node;
1880 struct page *page = NULL;
1881
1882 lockdep_assert_held(&hugetlb_lock);
1883 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1884
1885
1886
1887
1888 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
1889 !list_empty(&h->hugepage_freelists[node])) {
1890 page = list_entry(h->hugepage_freelists[node].next,
1891 struct page, lru);
1892 remove_hugetlb_page(h, page, acct_surplus);
1893 break;
1894 }
1895 }
1896
1897 return page;
1898}
1899
/*
 * Dissolve a given free hugepage into free buddy pages.  This function
 * does nothing for in-use hugepages and non-hugepages.
 * This function returns values like below:
 *
 *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
 *           (allocated or reserved.)
 *       0:  successfully dissolved free hugepages or the page is not a
 *           hugepage (considered as already dissolved)
 *  -EBUSY:  failed to dissolve free hugepages or the hugepage is in-use
 *           (about to be put back to the hugepage freelist)
 */
1914int dissolve_free_huge_page(struct page *page)
1915{
1916 int rc = -EBUSY;
1917
1918retry:
1919
1920 if (!PageHuge(page))
1921 return 0;
1922
1923 spin_lock_irq(&hugetlb_lock);
1924 if (!PageHuge(page)) {
1925 rc = 0;
1926 goto out;
1927 }
1928
1929 if (!page_count(page)) {
1930 struct page *head = compound_head(page);
1931 struct hstate *h = page_hstate(head);
1932 if (h->free_huge_pages - h->resv_huge_pages == 0)
1933 goto out;
1934
1935
1936
1937
1938
1939 if (unlikely(!HPageFreed(head))) {
1940 spin_unlock_irq(&hugetlb_lock);
1941 cond_resched();
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951 goto retry;
1952 }
1953
1954 remove_hugetlb_page(h, head, false);
1955 h->max_huge_pages--;
1956 spin_unlock_irq(&hugetlb_lock);
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966 rc = alloc_huge_page_vmemmap(h, head);
1967 if (!rc) {
1968
1969
1970
1971
1972
1973 if (PageHWPoison(head) && page != head) {
1974 SetPageHWPoison(page);
1975 ClearPageHWPoison(head);
1976 }
1977 update_and_free_page(h, head, false);
1978 } else {
1979 spin_lock_irq(&hugetlb_lock);
1980 add_hugetlb_page(h, head, false);
1981 h->max_huge_pages++;
1982 spin_unlock_irq(&hugetlb_lock);
1983 }
1984
1985 return rc;
1986 }
1987out:
1988 spin_unlock_irq(&hugetlb_lock);
1989 return rc;
1990}
1991
/*
 * Dissolve free hugepages in a given pfn range.  Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_huge_page() returns with an error, all
 * free hugepages that were dissolved before that error are lost.
 */
2000int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
2001{
2002 unsigned long pfn;
2003 struct page *page;
2004 int rc = 0;
2005
2006 if (!hugepages_supported())
2007 return rc;
2008
2009 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
2010 page = pfn_to_page(pfn);
2011 rc = dissolve_free_huge_page(page);
2012 if (rc)
2013 break;
2014 }
2015
2016 return rc;
2017}
2018
2019
2020
2021
2022static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
2023 int nid, nodemask_t *nmask)
2024{
2025 struct page *page = NULL;
2026
2027 if (hstate_is_gigantic(h))
2028 return NULL;
2029
2030 spin_lock_irq(&hugetlb_lock);
2031 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2032 goto out_unlock;
2033 spin_unlock_irq(&hugetlb_lock);
2034
2035 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2036 if (!page)
2037 return NULL;
2038
2039 spin_lock_irq(&hugetlb_lock);
2040
2041
2042
2043
2044
2045
2046
2047 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2048 SetHPageTemporary(page);
2049 spin_unlock_irq(&hugetlb_lock);
2050 put_page(page);
2051 return NULL;
2052 } else {
2053 h->surplus_huge_pages++;
2054 h->surplus_huge_pages_node[page_to_nid(page)]++;
2055 }
2056
2057out_unlock:
2058 spin_unlock_irq(&hugetlb_lock);
2059
2060 return page;
2061}
2062
2063static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
2064 int nid, nodemask_t *nmask)
2065{
2066 struct page *page;
2067
2068 if (hstate_is_gigantic(h))
2069 return NULL;
2070
2071 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
2072 if (!page)
2073 return NULL;
2074
2075
2076
2077
2078
2079 SetHPageTemporary(page);
2080
2081 return page;
2082}
2083
2084
2085
2086
2087static
2088struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
2089 struct vm_area_struct *vma, unsigned long addr)
2090{
2091 struct page *page;
2092 struct mempolicy *mpol;
2093 gfp_t gfp_mask = htlb_alloc_mask(h);
2094 int nid;
2095 nodemask_t *nodemask;
2096
2097 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
2098 page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
2099 mpol_cond_put(mpol);
2100
2101 return page;
2102}
2103
2104
2105struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
2106 nodemask_t *nmask, gfp_t gfp_mask)
2107{
2108 spin_lock_irq(&hugetlb_lock);
2109 if (h->free_huge_pages - h->resv_huge_pages > 0) {
2110 struct page *page;
2111
2112 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
2113 if (page) {
2114 spin_unlock_irq(&hugetlb_lock);
2115 return page;
2116 }
2117 }
2118 spin_unlock_irq(&hugetlb_lock);
2119
2120 return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
2121}
2122
2123
2124struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
2125 unsigned long address)
2126{
2127 struct mempolicy *mpol;
2128 nodemask_t *nodemask;
2129 struct page *page;
2130 gfp_t gfp_mask;
2131 int node;
2132
2133 gfp_mask = htlb_alloc_mask(h);
2134 node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
2135 page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
2136 mpol_cond_put(mpol);
2137
2138 return page;
2139}
2140
2141
2142
2143
2144
2145static int gather_surplus_pages(struct hstate *h, long delta)
2146 __must_hold(&hugetlb_lock)
2147{
2148 struct list_head surplus_list;
2149 struct page *page, *tmp;
2150 int ret;
2151 long i;
2152 long needed, allocated;
2153 bool alloc_ok = true;
2154
2155 lockdep_assert_held(&hugetlb_lock);
2156 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2157 if (needed <= 0) {
2158 h->resv_huge_pages += delta;
2159 return 0;
2160 }
2161
2162 allocated = 0;
2163 INIT_LIST_HEAD(&surplus_list);
2164
2165 ret = -ENOMEM;
2166retry:
2167 spin_unlock_irq(&hugetlb_lock);
2168 for (i = 0; i < needed; i++) {
2169 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
2170 NUMA_NO_NODE, NULL);
2171 if (!page) {
2172 alloc_ok = false;
2173 break;
2174 }
2175 list_add(&page->lru, &surplus_list);
2176 cond_resched();
2177 }
2178 allocated += i;
2179
2180
2181
2182
2183
2184 spin_lock_irq(&hugetlb_lock);
2185 needed = (h->resv_huge_pages + delta) -
2186 (h->free_huge_pages + allocated);
2187 if (needed > 0) {
2188 if (alloc_ok)
2189 goto retry;
2190
2191
2192
2193
2194
2195 goto free;
2196 }
2197
2198
2199
2200
2201
2202
2203
2204
2205 needed += allocated;
2206 h->resv_huge_pages += delta;
2207 ret = 0;
2208
2209
2210 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
2211 int zeroed;
2212
2213 if ((--needed) < 0)
2214 break;
2215
2216
2217
2218
2219 zeroed = put_page_testzero(page);
2220 VM_BUG_ON_PAGE(!zeroed, page);
2221 enqueue_huge_page(h, page);
2222 }
2223free:
2224 spin_unlock_irq(&hugetlb_lock);
2225
2226
2227 list_for_each_entry_safe(page, tmp, &surplus_list, lru)
2228 put_page(page);
2229 spin_lock_irq(&hugetlb_lock);
2230
2231 return ret;
2232}
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242static void return_unused_surplus_pages(struct hstate *h,
2243 unsigned long unused_resv_pages)
2244{
2245 unsigned long nr_pages;
2246 struct page *page;
2247 LIST_HEAD(page_list);
2248
2249 lockdep_assert_held(&hugetlb_lock);
2250
2251 h->resv_huge_pages -= unused_resv_pages;
2252
2253
2254 if (hstate_is_gigantic(h))
2255 goto out;
2256
2257
2258
2259
2260
2261 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271 while (nr_pages--) {
2272 page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
2273 if (!page)
2274 goto out;
2275
2276 list_add(&page->lru, &page_list);
2277 }
2278
2279out:
2280 spin_unlock_irq(&hugetlb_lock);
2281 update_and_free_pages_bulk(h, &page_list);
2282 spin_lock_irq(&hugetlb_lock);
2283}
2284
/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
 *
 * vma_needs_reservation is called to determine if the huge page at addr
 * within the vma has an associated reservation.  If a reservation is
 * needed, the value 1 is returned.  The caller is then responsible for
 * managing the global reservation and subpool usage counts.  After
 * the huge page has been allocated, vma_commit_reservation is called
 * to add the page to the reservation map.  If the page allocation fails,
 * the reservation must be ended instead of committed.  vma_end_reservation
 * is called in such cases.
 *
 * In the normal case, vma_commit_reservation returns the same value
 * as the preceding vma_needs_reservation call.  The only time this
 * is not the case is if a reserve map was changed between calls.  It
 * is the responsibility of the caller to notice the difference and
 * take appropriate action.
 *
 * vma_add_reservation is used in error paths where a reservation must
 * be restored when a newly allocated huge page must be freed.  It is
 * to be called after calling vma_needs_reservation to determine if a
 * reservation exists.
 *
 * vma_del_reservation is used in error paths where an entry in the reserve
 * map was created during huge page allocation and must be removed.  It is
 * to be called after calling vma_needs_reservation to determine if a
 * reservation exists.
 */
2315enum vma_resv_mode {
2316 VMA_NEEDS_RESV,
2317 VMA_COMMIT_RESV,
2318 VMA_END_RESV,
2319 VMA_ADD_RESV,
2320 VMA_DEL_RESV,
2321};
2322static long __vma_reservation_common(struct hstate *h,
2323 struct vm_area_struct *vma, unsigned long addr,
2324 enum vma_resv_mode mode)
2325{
2326 struct resv_map *resv;
2327 pgoff_t idx;
2328 long ret;
2329 long dummy_out_regions_needed;
2330
2331 resv = vma_resv_map(vma);
2332 if (!resv)
2333 return 1;
2334
2335 idx = vma_hugecache_offset(h, vma, addr);
2336 switch (mode) {
2337 case VMA_NEEDS_RESV:
2338 ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
2339
2340
2341
2342
2343 VM_BUG_ON(dummy_out_regions_needed != 1);
2344 break;
2345 case VMA_COMMIT_RESV:
2346 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2347
2348 VM_BUG_ON(ret < 0);
2349 break;
2350 case VMA_END_RESV:
2351 region_abort(resv, idx, idx + 1, 1);
2352 ret = 0;
2353 break;
2354 case VMA_ADD_RESV:
2355 if (vma->vm_flags & VM_MAYSHARE) {
2356 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2357
2358 VM_BUG_ON(ret < 0);
2359 } else {
2360 region_abort(resv, idx, idx + 1, 1);
2361 ret = region_del(resv, idx, idx + 1);
2362 }
2363 break;
2364 case VMA_DEL_RESV:
2365 if (vma->vm_flags & VM_MAYSHARE) {
2366 region_abort(resv, idx, idx + 1, 1);
2367 ret = region_del(resv, idx, idx + 1);
2368 } else {
2369 ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
2370
2371 VM_BUG_ON(ret < 0);
2372 }
2373 break;
2374 default:
2375 BUG();
2376 }
2377
2378 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2379 return ret;
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395 if (ret > 0)
2396 return 0;
2397 if (ret == 0)
2398 return 1;
2399 return ret;
2400}
2401
2402static long vma_needs_reservation(struct hstate *h,
2403 struct vm_area_struct *vma, unsigned long addr)
2404{
2405 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
2406}
2407
2408static long vma_commit_reservation(struct hstate *h,
2409 struct vm_area_struct *vma, unsigned long addr)
2410{
2411 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
2412}
2413
2414static void vma_end_reservation(struct hstate *h,
2415 struct vm_area_struct *vma, unsigned long addr)
2416{
2417 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
2418}
2419
2420static long vma_add_reservation(struct hstate *h,
2421 struct vm_area_struct *vma, unsigned long addr)
2422{
2423 return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
2424}
2425
2426static long vma_del_reservation(struct hstate *h,
2427 struct vm_area_struct *vma, unsigned long addr)
2428{
2429 return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
2430}
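
/*
 * Quick reference (summary of the wrappers above, not from the original
 * source): the five helpers map one huge-page offset onto the region_*
 * primitives: vma_needs_reservation -> region_chg,
 * vma_commit_reservation -> region_add, vma_end_reservation -> region_abort,
 * while vma_add_reservation and vma_del_reservation adjust the map on error
 * paths, adding or deleting the entry depending on whether the mapping is
 * shared (VM_MAYSHARE) or private.
 */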
2431
/*
 * This routine is called to restore reservation information on error paths.
 * It should ONLY be called for pages allocated via alloc_huge_page(), and
 * the hugetlb mutex should remain held when calling this routine.
 *
 * It handles two specific cases:
 * 1) A reservation was in place and the page consumed the reservation.
 *    HPageRestoreReserve is set in the page.
 * 2) No reservation was in place for the page, so HPageRestoreReserve is
 *    not set.  However, alloc_huge_page always updates the reserve map.
 *
 * In case 1, free_huge_page later in the error path will increment the
 * global reserve count.  But, free_huge_page does not have enough context
 * to adjust the reservation map.  Adjust the reserve map here so that it
 * stays consistent with the global reserve count adjustments made by
 * free_huge_page.  In case 2, simply undo the reserve map modifications
 * done by alloc_huge_page.
 */
2452void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
2453 unsigned long address, struct page *page)
2454{
2455 long rc = vma_needs_reservation(h, vma, address);
2456
2457 if (HPageRestoreReserve(page)) {
2458 if (unlikely(rc < 0))
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470 ClearHPageRestoreReserve(page);
2471 else if (rc)
2472 (void)vma_add_reservation(h, vma, address);
2473 else
2474 vma_end_reservation(h, vma, address);
2475 } else {
2476 if (!rc) {
2477
2478
2479
2480
2481
2482
2483
2484
2485 rc = vma_del_reservation(h, vma, address);
2486 if (rc < 0)
2487
2488
2489
2490
2491
2492
2493
2494
2495 SetHPageRestoreReserve(page);
2496 } else if (rc < 0) {
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507 if (!(vma->vm_flags & VM_MAYSHARE))
2508
2509
2510
2511
2512
2513
2514
2515
2516 SetHPageRestoreReserve(page);
2517 } else
2518
2519
2520
2521 vma_end_reservation(h, vma, address);
2522 }
2523}
2524
/*
 * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
 * @h: struct hstate old page belongs to
 * @old_page: Old page to dissolve
 * @list: List to isolate the page in case we need to
 * Returns 0 on success, otherwise negated error.
 */
2532static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
2533 struct list_head *list)
2534{
2535 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
2536 int nid = page_to_nid(old_page);
2537 struct page *new_page;
2538 int ret = 0;
2539
2540
2541
2542
2543
2544
2545
2546
2547 new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
2548 if (!new_page)
2549 return -ENOMEM;
2550 __prep_new_huge_page(h, new_page);
2551
2552retry:
2553 spin_lock_irq(&hugetlb_lock);
2554 if (!PageHuge(old_page)) {
2555
2556
2557
2558 goto free_new;
2559 } else if (page_count(old_page)) {
2560
2561
2562
2563
2564 spin_unlock_irq(&hugetlb_lock);
2565 if (!isolate_huge_page(old_page, list))
2566 ret = -EBUSY;
2567 spin_lock_irq(&hugetlb_lock);
2568 goto free_new;
2569 } else if (!HPageFreed(old_page)) {
2570
2571
2572
2573
2574
2575 spin_unlock_irq(&hugetlb_lock);
2576 cond_resched();
2577 goto retry;
2578 } else {
2579
2580
2581
2582
2583
2584
2585
2586 remove_hugetlb_page(h, old_page, false);
2587
2588
2589
2590
2591
2592 __prep_account_new_huge_page(h, nid);
2593 page_ref_dec(new_page);
2594 enqueue_huge_page(h, new_page);
2595
2596
2597
2598
2599 spin_unlock_irq(&hugetlb_lock);
2600 update_and_free_page(h, old_page, false);
2601 }
2602
2603 return ret;
2604
2605free_new:
2606 spin_unlock_irq(&hugetlb_lock);
2607 update_and_free_page(h, new_page, false);
2608
2609 return ret;
2610}
2611
2612int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
2613{
2614 struct hstate *h;
2615 struct page *head;
2616 int ret = -EBUSY;
2617
2618
2619
2620
2621
2622
2623 spin_lock_irq(&hugetlb_lock);
2624 if (PageHuge(page)) {
2625 head = compound_head(page);
2626 h = page_hstate(head);
2627 } else {
2628 spin_unlock_irq(&hugetlb_lock);
2629 return 0;
2630 }
2631 spin_unlock_irq(&hugetlb_lock);
2632
2633
2634
2635
2636
2637
2638 if (hstate_is_gigantic(h))
2639 return -ENOMEM;
2640
2641 if (page_count(head) && isolate_huge_page(head, list))
2642 ret = 0;
2643 else if (!page_count(head))
2644 ret = alloc_and_dissolve_huge_page(h, head, list);
2645
2646 return ret;
2647}
2648
2649struct page *alloc_huge_page(struct vm_area_struct *vma,
2650 unsigned long addr, int avoid_reserve)
2651{
2652 struct hugepage_subpool *spool = subpool_vma(vma);
2653 struct hstate *h = hstate_vma(vma);
2654 struct page *page;
2655 long map_chg, map_commit;
2656 long gbl_chg;
2657 int ret, idx;
2658 struct hugetlb_cgroup *h_cg;
2659 bool deferred_reserve;
2660
2661 idx = hstate_index(h);
2662
2663
2664
2665
2666
2667 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
2668 if (map_chg < 0)
2669 return ERR_PTR(-ENOMEM);
2670
2671
2672
2673
2674
2675
2676
2677
2678 if (map_chg || avoid_reserve) {
2679 gbl_chg = hugepage_subpool_get_pages(spool, 1);
2680 if (gbl_chg < 0) {
2681 vma_end_reservation(h, vma, addr);
2682 return ERR_PTR(-ENOSPC);
2683 }
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693 if (avoid_reserve)
2694 gbl_chg = 1;
2695 }
2696
2697
2698
2699 deferred_reserve = map_chg || avoid_reserve;
2700 if (deferred_reserve) {
2701 ret = hugetlb_cgroup_charge_cgroup_rsvd(
2702 idx, pages_per_huge_page(h), &h_cg);
2703 if (ret)
2704 goto out_subpool_put;
2705 }
2706
2707 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
2708 if (ret)
2709 goto out_uncharge_cgroup_reservation;
2710
2711 spin_lock_irq(&hugetlb_lock);
2712
2713
2714
2715
2716
2717 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
2718 if (!page) {
2719 spin_unlock_irq(&hugetlb_lock);
2720 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
2721 if (!page)
2722 goto out_uncharge_cgroup;
2723 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
2724 SetHPageRestoreReserve(page);
2725 h->resv_huge_pages--;
2726 }
2727 spin_lock_irq(&hugetlb_lock);
2728 list_add(&page->lru, &h->hugepage_activelist);
2729
2730 }
2731 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
2732
2733
2734
2735 if (deferred_reserve) {
2736 hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
2737 h_cg, page);
2738 }
2739
2740 spin_unlock_irq(&hugetlb_lock);
2741
2742 hugetlb_set_page_subpool(page, spool);
2743
2744 map_commit = vma_commit_reservation(h, vma, addr);
2745 if (unlikely(map_chg > map_commit)) {
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755 long rsv_adjust;
2756
2757 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
2758 hugetlb_acct_memory(h, -rsv_adjust);
2759 if (deferred_reserve)
2760 hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
2761 pages_per_huge_page(h), page);
2762 }
2763 return page;
2764
2765out_uncharge_cgroup:
2766 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
2767out_uncharge_cgroup_reservation:
2768 if (deferred_reserve)
2769 hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
2770 h_cg);
2771out_subpool_put:
2772 if (map_chg || avoid_reserve)
2773 hugepage_subpool_put_pages(spool, 1);
2774 vma_end_reservation(h, vma, addr);
2775 return ERR_PTR(-ENOSPC);
2776}
2777
2778int alloc_bootmem_huge_page(struct hstate *h)
2779 __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
2780int __alloc_bootmem_huge_page(struct hstate *h)
2781{
2782 struct huge_bootmem_page *m;
2783 int nr_nodes, node;
2784
2785 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
2786 void *addr;
2787
2788 addr = memblock_alloc_try_nid_raw(
2789 huge_page_size(h), huge_page_size(h),
2790 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
2791 if (addr) {
2792
2793
2794
2795
2796
2797 m = addr;
2798 goto found;
2799 }
2800 }
2801 return 0;
2802
2803found:
2804 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
2805
2806 INIT_LIST_HEAD(&m->list);
2807 list_add(&m->list, &huge_boot_pages);
2808 m->hstate = h;
2809 return 1;
2810}
2811
/*
 * Put bootmem huge pages into the standard lists after mem_map is up.
 * Note: This only applies to gigantic (order > MAX_ORDER) pages.
 */
2816static void __init gather_bootmem_prealloc(void)
2817{
2818 struct huge_bootmem_page *m;
2819
2820 list_for_each_entry(m, &huge_boot_pages, list) {
2821 struct page *page = virt_to_page(m);
2822 struct hstate *h = m->hstate;
2823
2824 VM_BUG_ON(!hstate_is_gigantic(h));
2825 WARN_ON(page_count(page) != 1);
2826 if (prep_compound_gigantic_page(page, huge_page_order(h))) {
2827 WARN_ON(PageReserved(page));
2828 prep_new_huge_page(h, page, page_to_nid(page));
2829 put_page(page);
2830 } else {
2831 free_gigantic_page(page, huge_page_order(h));
2832 pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
2833 }
2834
2835
2836
2837
2838
2839
2840 adjust_managed_page_count(page, pages_per_huge_page(h));
2841 cond_resched();
2842 }
2843}
2844
2845static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
2846{
2847 unsigned long i;
2848 nodemask_t *node_alloc_noretry;
2849
2850 if (!hstate_is_gigantic(h)) {
		/*
		 * Bit mask controlling how hard we retry per-node
		 * allocations.  Allocation failure is tolerated; lower level
		 * routines handle node_alloc_noretry == NULL.
		 */
2857 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
2858 GFP_KERNEL);
2859 } else {
2860
2861 node_alloc_noretry = NULL;
2862 }
2863
2864
2865 if (node_alloc_noretry)
2866 nodes_clear(*node_alloc_noretry);
2867
2868 for (i = 0; i < h->max_huge_pages; ++i) {
2869 if (hstate_is_gigantic(h)) {
2870 if (hugetlb_cma_size) {
2871 pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
2872 goto free;
2873 }
2874 if (!alloc_bootmem_huge_page(h))
2875 break;
2876 } else if (!alloc_pool_huge_page(h,
2877 &node_states[N_MEMORY],
2878 node_alloc_noretry))
2879 break;
2880 cond_resched();
2881 }
2882 if (i < h->max_huge_pages) {
2883 char buf[32];
2884
2885 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
2886 pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
2887 h->max_huge_pages, buf, i);
2888 h->max_huge_pages = i;
2889 }
2890free:
2891 kfree(node_alloc_noretry);
2892}
2893
2894static void __init hugetlb_init_hstates(void)
2895{
2896 struct hstate *h;
2897
2898 for_each_hstate(h) {
2899 if (minimum_order > huge_page_order(h))
2900 minimum_order = huge_page_order(h);
2901
2902
2903 if (!hstate_is_gigantic(h))
2904 hugetlb_hstate_alloc_pages(h);
2905 }
2906 VM_BUG_ON(minimum_order == UINT_MAX);
2907}
2908
2909static void __init report_hugepages(void)
2910{
2911 struct hstate *h;
2912
2913 for_each_hstate(h) {
2914 char buf[32];
2915
2916 string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
2917 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
2918 buf, h->free_huge_pages);
2919 }
2920}
2921
2922#ifdef CONFIG_HIGHMEM
2923static void try_to_free_low(struct hstate *h, unsigned long count,
2924 nodemask_t *nodes_allowed)
2925{
2926 int i;
2927 LIST_HEAD(page_list);
2928
2929 lockdep_assert_held(&hugetlb_lock);
2930 if (hstate_is_gigantic(h))
2931 return;
2932
	/*
	 * Collect pages to be freed on a list, and free them after
	 * dropping the lock.
	 */
2936 for_each_node_mask(i, *nodes_allowed) {
2937 struct page *page, *next;
2938 struct list_head *freel = &h->hugepage_freelists[i];
2939 list_for_each_entry_safe(page, next, freel, lru) {
2940 if (count >= h->nr_huge_pages)
2941 goto out;
2942 if (PageHighMem(page))
2943 continue;
2944 remove_hugetlb_page(h, page, false);
2945 list_add(&page->lru, &page_list);
2946 }
2947 }
2948
2949out:
2950 spin_unlock_irq(&hugetlb_lock);
2951 update_and_free_pages_bulk(h, &page_list);
2952 spin_lock_irq(&hugetlb_lock);
2953}
2954#else
2955static inline void try_to_free_low(struct hstate *h, unsigned long count,
2956 nodemask_t *nodes_allowed)
2957{
2958}
2959#endif
2960
2961
2962
2963
2964
2965
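/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made, otherwise 0.
 */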
2966static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
2967 int delta)
2968{
2969 int nr_nodes, node;
2970
2971 lockdep_assert_held(&hugetlb_lock);
2972 VM_BUG_ON(delta != -1 && delta != 1);
2973
2974 if (delta < 0) {
2975 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
2976 if (h->surplus_huge_pages_node[node])
2977 goto found;
2978 }
2979 } else {
2980 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2981 if (h->surplus_huge_pages_node[node] <
2982 h->nr_huge_pages_node[node])
2983 goto found;
2984 }
2985 }
2986 return 0;
2987
2988found:
2989 h->surplus_huge_pages += delta;
2990 h->surplus_huge_pages_node[node] += delta;
2991 return 1;
2992}
2993
2994#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
2995static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
2996 nodemask_t *nodes_allowed)
2997{
2998 unsigned long min_count, ret;
2999 struct page *page;
3000 LIST_HEAD(page_list);
3001 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
3002
	/*
	 * Bit mask controlling how hard we retry per-node allocations.
	 * If we can not allocate the bit mask, do not attempt to allocate
	 * the requested huge pages.
	 */
3008 if (node_alloc_noretry)
3009 nodes_clear(*node_alloc_noretry);
3010 else
3011 return -ENOMEM;
3012
3013
3014
3015
3016
3017 mutex_lock(&h->resize_lock);
3018 flush_free_hpage_work(h);
3019 spin_lock_irq(&hugetlb_lock);
3020
	/*
	 * Check for a node specific request.
	 * Changing the node specific huge page count may require a
	 * corresponding change to the global count.  In any case, the passed
	 * node mask (nodes_allowed) restricts alloc/free to that node.
	 */
3027 if (nid != NUMA_NO_NODE) {
3028 unsigned long old_count = count;
3029
3030 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
3031
		/*
		 * The user may have specified a count so large that the
		 * addition above overflowed.  In that case they wanted as
		 * many pages as possible, so use the largest possible value.
		 */
3037 if (count < old_count)
3038 count = ULONG_MAX;
3039 }
3040
	/*
	 * Gigantic pages runtime allocation depends on CONFIG_CONTIG_ALLOC.
	 * Without it, return an error when the user tries to grow the pool,
	 * but still allow freeing of boot time allocated gigantic pages.
	 */
3048 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
3049 if (count > persistent_huge_pages(h)) {
3050 spin_unlock_irq(&hugetlb_lock);
3051 mutex_unlock(&h->resize_lock);
3052 NODEMASK_FREE(node_alloc_noretry);
3053 return -EINVAL;
3054 }
3055
3056 }
3057
	/*
	 * Increase the pool size.  First take pages out of surplus state,
	 * then make up the remaining difference by allocating fresh huge
	 * pages.  A race with alloc_surplus_huge_page() is harmless here;
	 * such pages can simply be switched back to surplus later.
	 */
3069 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3070 if (!adjust_pool_surplus(h, nodes_allowed, -1))
3071 break;
3072 }
3073
3074 while (count > persistent_huge_pages(h)) {
3075
3076
3077
3078
3079
3080 spin_unlock_irq(&hugetlb_lock);
3081
3082
3083 cond_resched();
3084
3085 ret = alloc_pool_huge_page(h, nodes_allowed,
3086 node_alloc_noretry);
3087 spin_lock_irq(&hugetlb_lock);
3088 if (!ret)
3089 goto out;
3090
3091
3092 if (signal_pending(current))
3093 goto out;
3094 }
3095
	/*
	 * Decrease the pool size.  First return free pages to the buddy
	 * allocator (keeping enough around to satisfy reservations), then
	 * place pages into surplus state as needed so the pool shrinks to
	 * the new size as pages become free.  The surplus count may
	 * temporarily exceed the overcommit value; such pages are reclaimed
	 * first under memory pressure and freed once no longer needed.
	 */
3111 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3112 min_count = max(count, min_count);
3113 try_to_free_low(h, min_count, nodes_allowed);
3114
3115
3116
3117
3118 while (min_count < persistent_huge_pages(h)) {
3119 page = remove_pool_huge_page(h, nodes_allowed, 0);
3120 if (!page)
3121 break;
3122
3123 list_add(&page->lru, &page_list);
3124 }
3125
3126 spin_unlock_irq(&hugetlb_lock);
3127 update_and_free_pages_bulk(h, &page_list);
3128 flush_free_hpage_work(h);
3129 spin_lock_irq(&hugetlb_lock);
3130
3131 while (count < persistent_huge_pages(h)) {
3132 if (!adjust_pool_surplus(h, nodes_allowed, 1))
3133 break;
3134 }
3135out:
3136 h->max_huge_pages = persistent_huge_pages(h);
3137 spin_unlock_irq(&hugetlb_lock);
3138 mutex_unlock(&h->resize_lock);
3139
3140 NODEMASK_FREE(node_alloc_noretry);
3141
3142 return 0;
3143}
3144
3145#define HSTATE_ATTR_RO(_name) \
3146 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
3147
3148#define HSTATE_ATTR(_name) \
3149 static struct kobj_attribute _name##_attr = \
3150 __ATTR(_name, 0644, _name##_show, _name##_store)
3151
3152static struct kobject *hugepages_kobj;
3153static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3154
3155static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
3156
3157static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
3158{
3159 int i;
3160
3161 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3162 if (hstate_kobjs[i] == kobj) {
3163 if (nidp)
3164 *nidp = NUMA_NO_NODE;
3165 return &hstates[i];
3166 }
3167
3168 return kobj_to_node_hstate(kobj, nidp);
3169}
3170
3171static ssize_t nr_hugepages_show_common(struct kobject *kobj,
3172 struct kobj_attribute *attr, char *buf)
3173{
3174 struct hstate *h;
3175 unsigned long nr_huge_pages;
3176 int nid;
3177
3178 h = kobj_to_hstate(kobj, &nid);
3179 if (nid == NUMA_NO_NODE)
3180 nr_huge_pages = h->nr_huge_pages;
3181 else
3182 nr_huge_pages = h->nr_huge_pages_node[nid];
3183
3184 return sysfs_emit(buf, "%lu\n", nr_huge_pages);
3185}
3186
3187static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
3188 struct hstate *h, int nid,
3189 unsigned long count, size_t len)
3190{
3191 int err;
3192 nodemask_t nodes_allowed, *n_mask;
3193
3194 if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
3195 return -EINVAL;
3196
3197 if (nid == NUMA_NO_NODE) {
3198
3199
3200
3201 if (!(obey_mempolicy &&
3202 init_nodemask_of_mempolicy(&nodes_allowed)))
3203 n_mask = &node_states[N_MEMORY];
3204 else
3205 n_mask = &nodes_allowed;
3206 } else {
3207
3208
3209
3210
3211 init_nodemask_of_node(&nodes_allowed, nid);
3212 n_mask = &nodes_allowed;
3213 }
3214
3215 err = set_max_huge_pages(h, count, nid, n_mask);
3216
3217 return err ? err : len;
3218}
3219
3220static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
3221 struct kobject *kobj, const char *buf,
3222 size_t len)
3223{
3224 struct hstate *h;
3225 unsigned long count;
3226 int nid;
3227 int err;
3228
3229 err = kstrtoul(buf, 10, &count);
3230 if (err)
3231 return err;
3232
3233 h = kobj_to_hstate(kobj, &nid);
3234 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
3235}
3236
3237static ssize_t nr_hugepages_show(struct kobject *kobj,
3238 struct kobj_attribute *attr, char *buf)
3239{
3240 return nr_hugepages_show_common(kobj, attr, buf);
3241}
3242
3243static ssize_t nr_hugepages_store(struct kobject *kobj,
3244 struct kobj_attribute *attr, const char *buf, size_t len)
3245{
3246 return nr_hugepages_store_common(false, kobj, buf, len);
3247}
3248HSTATE_ATTR(nr_hugepages);
3249
3250#ifdef CONFIG_NUMA
3251
3252
3253
3254
3255
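/*
 * hstate attribute for optionally mempolicy-based constraint on persistent
 * huge page alloc/free.  For example, running
 *   numactl -m 0 sh -c 'echo 64 > nr_hugepages_mempolicy'
 * adjusts the pool only on node 0.
 */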
3256static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
3257 struct kobj_attribute *attr,
3258 char *buf)
3259{
3260 return nr_hugepages_show_common(kobj, attr, buf);
3261}
3262
3263static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
3264 struct kobj_attribute *attr, const char *buf, size_t len)
3265{
3266 return nr_hugepages_store_common(true, kobj, buf, len);
3267}
3268HSTATE_ATTR(nr_hugepages_mempolicy);
3269#endif
3270
3271
3272static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
3273 struct kobj_attribute *attr, char *buf)
3274{
3275 struct hstate *h = kobj_to_hstate(kobj, NULL);
3276 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
3277}
3278
3279static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
3280 struct kobj_attribute *attr, const char *buf, size_t count)
3281{
3282 int err;
3283 unsigned long input;
3284 struct hstate *h = kobj_to_hstate(kobj, NULL);
3285
3286 if (hstate_is_gigantic(h))
3287 return -EINVAL;
3288
3289 err = kstrtoul(buf, 10, &input);
3290 if (err)
3291 return err;
3292
3293 spin_lock_irq(&hugetlb_lock);
3294 h->nr_overcommit_huge_pages = input;
3295 spin_unlock_irq(&hugetlb_lock);
3296
3297 return count;
3298}
3299HSTATE_ATTR(nr_overcommit_hugepages);
3300
3301static ssize_t free_hugepages_show(struct kobject *kobj,
3302 struct kobj_attribute *attr, char *buf)
3303{
3304 struct hstate *h;
3305 unsigned long free_huge_pages;
3306 int nid;
3307
3308 h = kobj_to_hstate(kobj, &nid);
3309 if (nid == NUMA_NO_NODE)
3310 free_huge_pages = h->free_huge_pages;
3311 else
3312 free_huge_pages = h->free_huge_pages_node[nid];
3313
3314 return sysfs_emit(buf, "%lu\n", free_huge_pages);
3315}
3316HSTATE_ATTR_RO(free_hugepages);
3317
3318static ssize_t resv_hugepages_show(struct kobject *kobj,
3319 struct kobj_attribute *attr, char *buf)
3320{
3321 struct hstate *h = kobj_to_hstate(kobj, NULL);
3322 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
3323}
3324HSTATE_ATTR_RO(resv_hugepages);
3325
3326static ssize_t surplus_hugepages_show(struct kobject *kobj,
3327 struct kobj_attribute *attr, char *buf)
3328{
3329 struct hstate *h;
3330 unsigned long surplus_huge_pages;
3331 int nid;
3332
3333 h = kobj_to_hstate(kobj, &nid);
3334 if (nid == NUMA_NO_NODE)
3335 surplus_huge_pages = h->surplus_huge_pages;
3336 else
3337 surplus_huge_pages = h->surplus_huge_pages_node[nid];
3338
3339 return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
3340}
3341HSTATE_ATTR_RO(surplus_hugepages);
3342
3343static struct attribute *hstate_attrs[] = {
3344 &nr_hugepages_attr.attr,
3345 &nr_overcommit_hugepages_attr.attr,
3346 &free_hugepages_attr.attr,
3347 &resv_hugepages_attr.attr,
3348 &surplus_hugepages_attr.attr,
3349#ifdef CONFIG_NUMA
3350 &nr_hugepages_mempolicy_attr.attr,
3351#endif
3352 NULL,
3353};
3354
3355static const struct attribute_group hstate_attr_group = {
3356 .attrs = hstate_attrs,
3357};
3358
3359static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
3360 struct kobject **hstate_kobjs,
3361 const struct attribute_group *hstate_attr_group)
3362{
3363 int retval;
3364 int hi = hstate_index(h);
3365
3366 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
3367 if (!hstate_kobjs[hi])
3368 return -ENOMEM;
3369
3370 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
3371 if (retval) {
3372 kobject_put(hstate_kobjs[hi]);
3373 hstate_kobjs[hi] = NULL;
3374 }
3375
3376 return retval;
3377}
3378
3379static void __init hugetlb_sysfs_init(void)
3380{
3381 struct hstate *h;
3382 int err;
3383
3384 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
3385 if (!hugepages_kobj)
3386 return;
3387
3388 for_each_hstate(h) {
3389 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
3390 hstate_kobjs, &hstate_attr_group);
3391 if (err)
3392 pr_err("HugeTLB: Unable to add hstate %s", h->name);
3393 }
3394}
3395
3396#ifdef CONFIG_NUMA
3397
3398
3399
3400
3401
3402
3403
3404
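/*
 * node_hstate/s - associate per node hstate attributes, via their kobjects,
 * with node devices in node_devices[] using a parallel array.  The array
 * index of a node device is its nid.
 */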
3405struct node_hstate {
3406 struct kobject *hugepages_kobj;
3407 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
3408};
3409static struct node_hstate node_hstates[MAX_NUMNODES];
3410
3411
3412
3413
3414static struct attribute *per_node_hstate_attrs[] = {
3415 &nr_hugepages_attr.attr,
3416 &free_hugepages_attr.attr,
3417 &surplus_hugepages_attr.attr,
3418 NULL,
3419};
3420
3421static const struct attribute_group per_node_hstate_attr_group = {
3422 .attrs = per_node_hstate_attrs,
3423};
3424
3425
3426
3427
3428
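/*
 * kobj_to_node_hstate - lookup global hstate for node device hstate attr
 * kobj.  Returns the node id via non-NULL nidp.
 */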
3429static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3430{
3431 int nid;
3432
3433 for (nid = 0; nid < nr_node_ids; nid++) {
3434 struct node_hstate *nhs = &node_hstates[nid];
3435 int i;
3436 for (i = 0; i < HUGE_MAX_HSTATE; i++)
3437 if (nhs->hstate_kobjs[i] == kobj) {
3438 if (nidp)
3439 *nidp = nid;
3440 return &hstates[i];
3441 }
3442 }
3443
3444 BUG();
3445 return NULL;
3446}
3447
3448
3449
3450
3451
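/*
 * Unregister hstate attributes from a single node device.
 * No-op if no hstate attributes are attached.
 */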
3452static void hugetlb_unregister_node(struct node *node)
3453{
3454 struct hstate *h;
3455 struct node_hstate *nhs = &node_hstates[node->dev.id];
3456
3457 if (!nhs->hugepages_kobj)
3458 return;
3459
3460 for_each_hstate(h) {
3461 int idx = hstate_index(h);
3462 if (nhs->hstate_kobjs[idx]) {
3463 kobject_put(nhs->hstate_kobjs[idx]);
3464 nhs->hstate_kobjs[idx] = NULL;
3465 }
3466 }
3467
3468 kobject_put(nhs->hugepages_kobj);
3469 nhs->hugepages_kobj = NULL;
3470}
3471
3472
3473
3474
3475
3476
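/*
 * Register hstate attributes for a single node device.
 * No-op if the attributes are already registered.
 */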
3477static void hugetlb_register_node(struct node *node)
3478{
3479 struct hstate *h;
3480 struct node_hstate *nhs = &node_hstates[node->dev.id];
3481 int err;
3482
3483 if (nhs->hugepages_kobj)
3484 return;
3485
3486 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
3487 &node->dev.kobj);
3488 if (!nhs->hugepages_kobj)
3489 return;
3490
3491 for_each_hstate(h) {
3492 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
3493 nhs->hstate_kobjs,
3494 &per_node_hstate_attr_group);
3495 if (err) {
3496 pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
3497 h->name, node->dev.id);
3498 hugetlb_unregister_node(node);
3499 break;
3500 }
3501 }
3502}
3503
3504
3505
3506
3507
3508
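/*
 * hugetlb init time: register hstate attributes for all registered node
 * devices of nodes that have memory.
 */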
3509static void __init hugetlb_register_all_nodes(void)
3510{
3511 int nid;
3512
3513 for_each_node_state(nid, N_MEMORY) {
3514 struct node *node = node_devices[nid];
3515 if (node->dev.id == nid)
3516 hugetlb_register_node(node);
3517 }
3518
3519
3520
3521
3522
3523 register_hugetlbfs_with_node(hugetlb_register_node,
3524 hugetlb_unregister_node);
3525}
3526#else
3527
3528static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
3529{
3530 BUG();
3531 if (nidp)
3532 *nidp = -1;
3533 return NULL;
3534}
3535
3536static void hugetlb_register_all_nodes(void) { }
3537
3538#endif
3539
3540static int __init hugetlb_init(void)
3541{
3542 int i;
3543
3544 BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
3545 __NR_HPAGEFLAGS);
3546
3547 if (!hugepages_supported()) {
3548 if (hugetlb_max_hstate || default_hstate_max_huge_pages)
3549 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
3550 return 0;
3551 }
3552
	/*
	 * Make sure the HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists.  Some
	 * architectures depend on setup being done here.
	 */
3557 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
3558 if (!parsed_default_hugepagesz) {
		/*
		 * No default_hugepagesz= was specified, so the default hstate
		 * is the HPAGE_SIZE one.  If a bare hugepages= count was seen
		 * before any hugepagesz=, apply it to this default hstate,
		 * warning if it overrides an explicit setting.
		 */
3567 default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
3568 if (default_hstate_max_huge_pages) {
3569 if (default_hstate.max_huge_pages) {
3570 char buf[32];
3571
3572 string_get_size(huge_page_size(&default_hstate),
3573 1, STRING_UNITS_2, buf, 32);
3574 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
3575 default_hstate.max_huge_pages, buf);
3576 pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
3577 default_hstate_max_huge_pages);
3578 }
3579 default_hstate.max_huge_pages =
3580 default_hstate_max_huge_pages;
3581 }
3582 }
3583
3584 hugetlb_cma_check();
3585 hugetlb_init_hstates();
3586 gather_bootmem_prealloc();
3587 report_hugepages();
3588
3589 hugetlb_sysfs_init();
3590 hugetlb_register_all_nodes();
3591 hugetlb_cgroup_file_init();
3592
3593#ifdef CONFIG_SMP
3594 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
3595#else
3596 num_fault_mutexes = 1;
3597#endif
3598 hugetlb_fault_mutex_table =
3599 kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
3600 GFP_KERNEL);
3601 BUG_ON(!hugetlb_fault_mutex_table);
3602
3603 for (i = 0; i < num_fault_mutexes; i++)
3604 mutex_init(&hugetlb_fault_mutex_table[i]);
3605 return 0;
3606}
3607subsys_initcall(hugetlb_init);
3608
3609
3610bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
3611{
3612 return size == HPAGE_SIZE;
3613}
3614
3615void __init hugetlb_add_hstate(unsigned int order)
3616{
3617 struct hstate *h;
3618 unsigned long i;
3619
3620 if (size_to_hstate(PAGE_SIZE << order)) {
3621 return;
3622 }
3623 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
3624 BUG_ON(order == 0);
3625 h = &hstates[hugetlb_max_hstate++];
3626 mutex_init(&h->resize_lock);
3627 h->order = order;
3628 h->mask = ~(huge_page_size(h) - 1);
3629 for (i = 0; i < MAX_NUMNODES; ++i)
3630 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
3631 INIT_LIST_HEAD(&h->hugepage_activelist);
3632 h->next_nid_to_alloc = first_memory_node;
3633 h->next_nid_to_free = first_memory_node;
3634 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
3635 huge_page_size(h)/1024);
3636 hugetlb_vmemmap_init(h);
3637
3638 parsed_hstate = h;
3639}
3640
3641
3642
3643
3644
3645
3646
3647
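/*
 * hugepages command line processing.
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
 * specification and sets the number of boot time huge pages for that size,
 * e.g. "hugepagesz=1G hugepages=4".  If it appears first on the command
 * line, it implicitly sets the count for the default huge page size.
 */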
3648static int __init hugepages_setup(char *s)
3649{
3650 unsigned long *mhp;
3651 static unsigned long *last_mhp;
3652
3653 if (!parsed_valid_hugepagesz) {
3654 pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
3655 parsed_valid_hugepagesz = true;
3656 return 0;
3657 }
	/*
	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
	 * yet, so this hugepages= parameter goes to the "default hstate".
	 * Otherwise it applies to the most recently parsed hstate.
	 */
3665 else if (!hugetlb_max_hstate)
3666 mhp = &default_hstate_max_huge_pages;
3667 else
3668 mhp = &parsed_hstate->max_huge_pages;
3669
3670 if (mhp == last_mhp) {
3671 pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
3672 return 0;
3673 }
3674
3675 if (sscanf(s, "%lu", mhp) <= 0)
3676 *mhp = 0;
3677
3678
3679
3680
3681
3682
3683 if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
3684 hugetlb_hstate_alloc_pages(parsed_hstate);
3685
3686 last_mhp = mhp;
3687
3688 return 1;
3689}
3690__setup("hugepages=", hugepages_setup);
3691
3692
3693
3694
3695
3696
3697
3698
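/*
 * hugepagesz command line processing.
 * A specific huge page size can only be specified once with hugepagesz;
 * it should be followed by hugepages to set the pool size, e.g.
 * "hugepagesz=2M hugepages=512".  parsed_valid_hugepagesz records whether
 * the last hugepagesz value was usable.
 */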
3699static int __init hugepagesz_setup(char *s)
3700{
3701 unsigned long size;
3702 struct hstate *h;
3703
3704 parsed_valid_hugepagesz = false;
3705 size = (unsigned long)memparse(s, NULL);
3706
3707 if (!arch_hugetlb_valid_size(size)) {
3708 pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
3709 return 0;
3710 }
3711
3712 h = size_to_hstate(size);
3713 if (h) {
		/*
		 * An hstate for this size already exists.  This is normally
		 * an error, but is allowed if the existing hstate is the
		 * default hstate and its page count was not previously
		 * specified.
		 */
3721 if (!parsed_default_hugepagesz || h != &default_hstate ||
3722 default_hstate.max_huge_pages) {
3723 pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
3724 return 0;
3725 }
3726
3727
3728
3729
3730
3731
3732 parsed_hstate = h;
3733 parsed_valid_hugepagesz = true;
3734 return 1;
3735 }
3736
3737 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
3738 parsed_valid_hugepagesz = true;
3739 return 1;
3740}
3741__setup("hugepagesz=", hugepagesz_setup);
3742
3743
3744
3745
3746
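/*
 * default_hugepagesz command line processing.
 * Only one instance of default_hugepagesz is allowed on the command line;
 * it selects the huge page size used when a mount or mmap does not specify
 * one, e.g. "default_hugepagesz=1G".
 */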
3747static int __init default_hugepagesz_setup(char *s)
3748{
3749 unsigned long size;
3750
3751 parsed_valid_hugepagesz = false;
3752 if (parsed_default_hugepagesz) {
3753 pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
3754 return 0;
3755 }
3756
3757 size = (unsigned long)memparse(s, NULL);
3758
3759 if (!arch_hugetlb_valid_size(size)) {
3760 pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
3761 return 0;
3762 }
3763
3764 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
3765 parsed_valid_hugepagesz = true;
3766 parsed_default_hugepagesz = true;
3767 default_hstate_idx = hstate_index(size_to_hstate(size));
3768
	/*
	 * A bare hugepages=X seen before any size parameter applies to the
	 * default huge page size, so transfer it here.  If that size is
	 * gigantic, the pages must be allocated from bootmem now.
	 */
3776 if (default_hstate_max_huge_pages) {
3777 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
3778 if (hstate_is_gigantic(&default_hstate))
3779 hugetlb_hstate_alloc_pages(&default_hstate);
3780 default_hstate_max_huge_pages = 0;
3781 }
3782
3783 return 1;
3784}
3785__setup("default_hugepagesz=", default_hugepagesz_setup);
3786
3787static unsigned int allowed_mems_nr(struct hstate *h)
3788{
3789 int node;
3790 unsigned int nr = 0;
3791 nodemask_t *mpol_allowed;
3792 unsigned int *array = h->free_huge_pages_node;
3793 gfp_t gfp_mask = htlb_alloc_mask(h);
3794
3795 mpol_allowed = policy_nodemask_current(gfp_mask);
3796
3797 for_each_node_mask(node, cpuset_current_mems_allowed) {
3798 if (!mpol_allowed || node_isset(node, *mpol_allowed))
3799 nr += array[node];
3800 }
3801
3802 return nr;
3803}
3804
3805#ifdef CONFIG_SYSCTL
3806static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
3807 void *buffer, size_t *length,
3808 loff_t *ppos, unsigned long *out)
3809{
3810 struct ctl_table dup_table;
3811
3812
3813
3814
3815
3816 dup_table = *table;
3817 dup_table.data = out;
3818
3819 return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
3820}
3821
3822static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
3823 struct ctl_table *table, int write,
3824 void *buffer, size_t *length, loff_t *ppos)
3825{
3826 struct hstate *h = &default_hstate;
3827 unsigned long tmp = h->max_huge_pages;
3828 int ret;
3829
3830 if (!hugepages_supported())
3831 return -EOPNOTSUPP;
3832
3833 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
3834 &tmp);
3835 if (ret)
3836 goto out;
3837
3838 if (write)
3839 ret = __nr_hugepages_store_common(obey_mempolicy, h,
3840 NUMA_NO_NODE, tmp, *length);
3841out:
3842 return ret;
3843}
3844
3845int hugetlb_sysctl_handler(struct ctl_table *table, int write,
3846 void *buffer, size_t *length, loff_t *ppos)
3847{
3848
3849 return hugetlb_sysctl_handler_common(false, table, write,
3850 buffer, length, ppos);
3851}
3852
3853#ifdef CONFIG_NUMA
3854int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
3855 void *buffer, size_t *length, loff_t *ppos)
3856{
3857 return hugetlb_sysctl_handler_common(true, table, write,
3858 buffer, length, ppos);
3859}
3860#endif
3861
3862int hugetlb_overcommit_handler(struct ctl_table *table, int write,
3863 void *buffer, size_t *length, loff_t *ppos)
3864{
3865 struct hstate *h = &default_hstate;
3866 unsigned long tmp;
3867 int ret;
3868
3869 if (!hugepages_supported())
3870 return -EOPNOTSUPP;
3871
3872 tmp = h->nr_overcommit_huge_pages;
3873
3874 if (write && hstate_is_gigantic(h))
3875 return -EINVAL;
3876
3877 ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
3878 &tmp);
3879 if (ret)
3880 goto out;
3881
3882 if (write) {
3883 spin_lock_irq(&hugetlb_lock);
3884 h->nr_overcommit_huge_pages = tmp;
3885 spin_unlock_irq(&hugetlb_lock);
3886 }
3887out:
3888 return ret;
3889}
3890
3891#endif
3892
3893void hugetlb_report_meminfo(struct seq_file *m)
3894{
3895 struct hstate *h;
3896 unsigned long total = 0;
3897
3898 if (!hugepages_supported())
3899 return;
3900
3901 for_each_hstate(h) {
3902 unsigned long count = h->nr_huge_pages;
3903
3904 total += huge_page_size(h) * count;
3905
3906 if (h == &default_hstate)
3907 seq_printf(m,
3908 "HugePages_Total: %5lu\n"
3909 "HugePages_Free: %5lu\n"
3910 "HugePages_Rsvd: %5lu\n"
3911 "HugePages_Surp: %5lu\n"
3912 "Hugepagesize: %8lu kB\n",
3913 count,
3914 h->free_huge_pages,
3915 h->resv_huge_pages,
3916 h->surplus_huge_pages,
3917 huge_page_size(h) / SZ_1K);
3918 }
3919
3920 seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
3921}
3922
3923int hugetlb_report_node_meminfo(char *buf, int len, int nid)
3924{
3925 struct hstate *h = &default_hstate;
3926
3927 if (!hugepages_supported())
3928 return 0;
3929
3930 return sysfs_emit_at(buf, len,
3931 "Node %d HugePages_Total: %5u\n"
3932 "Node %d HugePages_Free: %5u\n"
3933 "Node %d HugePages_Surp: %5u\n",
3934 nid, h->nr_huge_pages_node[nid],
3935 nid, h->free_huge_pages_node[nid],
3936 nid, h->surplus_huge_pages_node[nid]);
3937}
3938
3939void hugetlb_show_meminfo(void)
3940{
3941 struct hstate *h;
3942 int nid;
3943
3944 if (!hugepages_supported())
3945 return;
3946
3947 for_each_node_state(nid, N_MEMORY)
3948 for_each_hstate(h)
3949 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
3950 nid,
3951 h->nr_huge_pages_node[nid],
3952 h->free_huge_pages_node[nid],
3953 h->surplus_huge_pages_node[nid],
3954 huge_page_size(h) / SZ_1K);
3955}
3956
3957void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
3958{
3959 seq_printf(m, "HugetlbPages:\t%8lu kB\n",
3960 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
3961}
3962
3963
3964unsigned long hugetlb_total_pages(void)
3965{
3966 struct hstate *h;
3967 unsigned long nr_total_pages = 0;
3968
3969 for_each_hstate(h)
3970 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
3971 return nr_total_pages;
3972}
3973
3974static int hugetlb_acct_memory(struct hstate *h, long delta)
3975{
3976 int ret = -ENOMEM;
3977
3978 if (!delta)
3979 return 0;
3980
3981 spin_lock_irq(&hugetlb_lock);
	/*
	 * When cpusets are configured, strict hugetlb reservation is broken:
	 * accounting is done against a global counter that is never checked
	 * against page availability in the current cpuset, so a task can
	 * still be OOM-killed for lack of free huge pages in its cpuset.
	 * Enforcing strict accounting per cpuset is impractical because
	 * tasks and memory nodes move between cpusets dynamically.  As a
	 * best effort we fall back to checking against the free pages
	 * currently allowed by the task's cpuset and memory policy.
	 */
4005 if (delta > 0) {
4006 if (gather_surplus_pages(h, delta) < 0)
4007 goto out;
4008
4009 if (delta > allowed_mems_nr(h)) {
4010 return_unused_surplus_pages(h, delta);
4011 goto out;
4012 }
4013 }
4014
4015 ret = 0;
4016 if (delta < 0)
4017 return_unused_surplus_pages(h, (unsigned long) -delta);
4018
4019out:
4020 spin_unlock_irq(&hugetlb_lock);
4021 return ret;
4022}
4023
4024static void hugetlb_vm_op_open(struct vm_area_struct *vma)
4025{
4026 struct resv_map *resv = vma_resv_map(vma);
4027
	/*
	 * A new VMA shares its sibling's reservation map if present.  A VMA
	 * only ever carries a valid reservation map pointer while being
	 * copied from a still-existing VMA, which holds a reference on the
	 * map, so taking another reference here needs no extra locking.
	 */
4036 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4037 kref_get(&resv->refs);
4038}
4039
4040static void hugetlb_vm_op_close(struct vm_area_struct *vma)
4041{
4042 struct hstate *h = hstate_vma(vma);
4043 struct resv_map *resv = vma_resv_map(vma);
4044 struct hugepage_subpool *spool = subpool_vma(vma);
4045 unsigned long reserve, start, end;
4046 long gbl_reserve;
4047
4048 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4049 return;
4050
4051 start = vma_hugecache_offset(h, vma, vma->vm_start);
4052 end = vma_hugecache_offset(h, vma, vma->vm_end);
4053
4054 reserve = (end - start) - region_count(resv, start, end);
4055 hugetlb_cgroup_uncharge_counter(resv, start, end);
4056 if (reserve) {
4057
4058
4059
4060
4061 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
4062 hugetlb_acct_memory(h, -gbl_reserve);
4063 }
4064
4065 kref_put(&resv->refs, resv_map_release);
4066}
4067
4068static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
4069{
4070 if (addr & ~(huge_page_mask(hstate_vma(vma))))
4071 return -EINVAL;
4072 return 0;
4073}
4074
4075static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
4076{
4077 return huge_page_size(hstate_vma(vma));
4078}
4079
4080
4081
4082
4083
4084
4085
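/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get here.
 */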
4086static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
4087{
4088 BUG();
4089 return 0;
4090}
4091
4092
4093
4094
4095
4096
4097
4098
4099const struct vm_operations_struct hugetlb_vm_ops = {
4100 .fault = hugetlb_vm_op_fault,
4101 .open = hugetlb_vm_op_open,
4102 .close = hugetlb_vm_op_close,
4103 .may_split = hugetlb_vm_op_split,
4104 .pagesize = hugetlb_vm_op_pagesize,
4105};
4106
4107static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
4108 int writable)
4109{
4110 pte_t entry;
4111 unsigned int shift = huge_page_shift(hstate_vma(vma));
4112
4113 if (writable) {
4114 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
4115 vma->vm_page_prot)));
4116 } else {
4117 entry = huge_pte_wrprotect(mk_huge_pte(page,
4118 vma->vm_page_prot));
4119 }
4120 entry = pte_mkyoung(entry);
4121 entry = pte_mkhuge(entry);
4122 entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
4123
4124 return entry;
4125}
4126
4127static void set_huge_ptep_writable(struct vm_area_struct *vma,
4128 unsigned long address, pte_t *ptep)
4129{
4130 pte_t entry;
4131
4132 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
4133 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
4134 update_mmu_cache(vma, address, ptep);
4135}
4136
4137bool is_hugetlb_entry_migration(pte_t pte)
4138{
4139 swp_entry_t swp;
4140
4141 if (huge_pte_none(pte) || pte_present(pte))
4142 return false;
4143 swp = pte_to_swp_entry(pte);
4144 if (is_migration_entry(swp))
4145 return true;
4146 else
4147 return false;
4148}
4149
4150static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
4151{
4152 swp_entry_t swp;
4153
4154 if (huge_pte_none(pte) || pte_present(pte))
4155 return false;
4156 swp = pte_to_swp_entry(pte);
4157 if (is_hwpoison_entry(swp))
4158 return true;
4159 else
4160 return false;
4161}
4162
4163static void
4164hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
4165 struct page *new_page)
4166{
4167 __SetPageUptodate(new_page);
4168 set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
4169 hugepage_add_new_anon_rmap(new_page, vma, addr);
4170 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
4171 ClearHPageRestoreReserve(new_page);
4172 SetHPageMigratable(new_page);
4173}
4174
4175int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
4176 struct vm_area_struct *vma)
4177{
4178 pte_t *src_pte, *dst_pte, entry, dst_entry;
4179 struct page *ptepage;
4180 unsigned long addr;
4181 bool cow = is_cow_mapping(vma->vm_flags);
4182 struct hstate *h = hstate_vma(vma);
4183 unsigned long sz = huge_page_size(h);
4184 unsigned long npages = pages_per_huge_page(h);
4185 struct address_space *mapping = vma->vm_file->f_mapping;
4186 struct mmu_notifier_range range;
4187 int ret = 0;
4188
4189 if (cow) {
4190 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
4191 vma->vm_start,
4192 vma->vm_end);
4193 mmu_notifier_invalidate_range_start(&range);
4194 } else {
4195
4196
4197
4198
4199
4200
4201 i_mmap_lock_read(mapping);
4202 }
4203
4204 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
4205 spinlock_t *src_ptl, *dst_ptl;
4206 src_pte = huge_pte_offset(src, addr, sz);
4207 if (!src_pte)
4208 continue;
4209 dst_pte = huge_pte_alloc(dst, vma, addr, sz);
4210 if (!dst_pte) {
4211 ret = -ENOMEM;
4212 break;
4213 }
4214
		/*
		 * If the pagetables are shared, don't copy or take
		 * references.  dst_pte == src_pte is the common case of
		 * src/dest sharing; a non-none dst_entry also implies
		 * sharing.  Check before taking the page table lock, and
		 * again after taking it below.
		 */
4224 dst_entry = huge_ptep_get(dst_pte);
4225 if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
4226 continue;
4227
4228 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4229 src_ptl = huge_pte_lockptr(h, src, src_pte);
4230 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4231 entry = huge_ptep_get(src_pte);
4232 dst_entry = huge_ptep_get(dst_pte);
4233again:
4234 if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
4235
4236
4237
4238
4239
4240 ;
4241 } else if (unlikely(is_hugetlb_entry_migration(entry) ||
4242 is_hugetlb_entry_hwpoisoned(entry))) {
4243 swp_entry_t swp_entry = pte_to_swp_entry(entry);
4244
4245 if (is_writable_migration_entry(swp_entry) && cow) {
4246
4247
4248
4249
4250 swp_entry = make_readable_migration_entry(
4251 swp_offset(swp_entry));
4252 entry = swp_entry_to_pte(swp_entry);
4253 set_huge_swap_pte_at(src, addr, src_pte,
4254 entry, sz);
4255 }
4256 set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
4257 } else {
4258 entry = huge_ptep_get(src_pte);
4259 ptepage = pte_page(entry);
4260 get_page(ptepage);
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271 if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
4272 pte_t src_pte_old = entry;
4273 struct page *new;
4274
4275 spin_unlock(src_ptl);
4276 spin_unlock(dst_ptl);
4277
4278 new = alloc_huge_page(vma, addr, 1);
4279 if (IS_ERR(new)) {
4280 put_page(ptepage);
4281 ret = PTR_ERR(new);
4282 break;
4283 }
4284 copy_user_huge_page(new, ptepage, addr, vma,
4285 npages);
4286 put_page(ptepage);
4287
4288
4289 dst_ptl = huge_pte_lock(h, dst, dst_pte);
4290 src_ptl = huge_pte_lockptr(h, src, src_pte);
4291 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
4292 entry = huge_ptep_get(src_pte);
4293 if (!pte_same(src_pte_old, entry)) {
4294 restore_reserve_on_error(h, vma, addr,
4295 new);
4296 put_page(new);
4297
4298 goto again;
4299 }
4300 hugetlb_install_page(vma, dst_pte, addr, new);
4301 spin_unlock(src_ptl);
4302 spin_unlock(dst_ptl);
4303 continue;
4304 }
4305
4306 if (cow) {
4307
4308
4309
4310
4311
4312
4313
4314 huge_ptep_set_wrprotect(src, addr, src_pte);
4315 entry = huge_pte_wrprotect(entry);
4316 }
4317
4318 page_dup_rmap(ptepage, true);
4319 set_huge_pte_at(dst, addr, dst_pte, entry);
4320 hugetlb_count_add(npages, dst);
4321 }
4322 spin_unlock(src_ptl);
4323 spin_unlock(dst_ptl);
4324 }
4325
4326 if (cow)
4327 mmu_notifier_invalidate_range_end(&range);
4328 else
4329 i_mmap_unlock_read(mapping);
4330
4331 return ret;
4332}
4333
4334void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
4335 unsigned long start, unsigned long end,
4336 struct page *ref_page)
4337{
4338 struct mm_struct *mm = vma->vm_mm;
4339 unsigned long address;
4340 pte_t *ptep;
4341 pte_t pte;
4342 spinlock_t *ptl;
4343 struct page *page;
4344 struct hstate *h = hstate_vma(vma);
4345 unsigned long sz = huge_page_size(h);
4346 struct mmu_notifier_range range;
4347
4348 WARN_ON(!is_vm_hugetlb_page(vma));
4349 BUG_ON(start & ~huge_page_mask(h));
4350 BUG_ON(end & ~huge_page_mask(h));
4351
4352
4353
4354
4355
4356 tlb_change_page_size(tlb, sz);
4357 tlb_start_vma(tlb, vma);
4358
4359
4360
4361
4362 mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
4363 end);
4364 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
4365 mmu_notifier_invalidate_range_start(&range);
4366 address = start;
4367 for (; address < end; address += sz) {
4368 ptep = huge_pte_offset(mm, address, sz);
4369 if (!ptep)
4370 continue;
4371
4372 ptl = huge_pte_lock(h, mm, ptep);
4373 if (huge_pmd_unshare(mm, vma, &address, ptep)) {
4374 spin_unlock(ptl);
4375
4376
4377
4378
4379 continue;
4380 }
4381
4382 pte = huge_ptep_get(ptep);
4383 if (huge_pte_none(pte)) {
4384 spin_unlock(ptl);
4385 continue;
4386 }
4387
		/*
		 * A migrating or HWPoisoned hugepage is already unmapped and
		 * its refcount dropped, so just clear the pte here.
		 */
4392 if (unlikely(!pte_present(pte))) {
4393 huge_pte_clear(mm, address, ptep, sz);
4394 spin_unlock(ptl);
4395 continue;
4396 }
4397
4398 page = pte_page(pte);
4399
4400
4401
4402
4403
4404 if (ref_page) {
4405 if (page != ref_page) {
4406 spin_unlock(ptl);
4407 continue;
4408 }
4409
4410
4411
4412
4413
4414 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
4415 }
4416
4417 pte = huge_ptep_get_and_clear(mm, address, ptep);
4418 tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
4419 if (huge_pte_dirty(pte))
4420 set_page_dirty(page);
4421
4422 hugetlb_count_sub(pages_per_huge_page(h), mm);
4423 page_remove_rmap(page, true);
4424
4425 spin_unlock(ptl);
4426 tlb_remove_page_size(tlb, page, huge_page_size(h));
4427
4428
4429
4430 if (ref_page)
4431 break;
4432 }
4433 mmu_notifier_invalidate_range_end(&range);
4434 tlb_end_vma(tlb, vma);
4435}
4436
4437void __unmap_hugepage_range_final(struct mmu_gather *tlb,
4438 struct vm_area_struct *vma, unsigned long start,
4439 unsigned long end, struct page *ref_page)
4440{
4441 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
4442
	/*
	 * Clear VM_MAYSHARE so that x86's huge_pmd_share() page table
	 * sharing test fails on a vma being torn down and does not grab a
	 * page table on its way out.  The vma is about to disappear, so the
	 * temporary flag change is safe here.
	 */
4453 vma->vm_flags &= ~VM_MAYSHARE;
4454}
4455
4456void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
4457 unsigned long end, struct page *ref_page)
4458{
4459 struct mmu_gather tlb;
4460
4461 tlb_gather_mmu(&tlb, vma->vm_mm);
4462 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
4463 tlb_finish_mmu(&tlb);
4464}
4465
4466
4467
4468
4469
4470
4471
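/*
 * Called when the original mapper fails to COW a MAP_PRIVATE mapping it
 * owns the reserve page for.  Unmap the page from the other VMAs and let
 * the children be SIGKILLed if they fault on the same region.
 */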
4472static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
4473 struct page *page, unsigned long address)
4474{
4475 struct hstate *h = hstate_vma(vma);
4476 struct vm_area_struct *iter_vma;
4477 struct address_space *mapping;
4478 pgoff_t pgoff;
4479
4480
4481
4482
4483
4484 address = address & huge_page_mask(h);
4485 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
4486 vma->vm_pgoff;
4487 mapping = vma->vm_file->f_mapping;
4488
4489
4490
4491
4492
4493
4494 i_mmap_lock_write(mapping);
4495 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
4496
4497 if (iter_vma == vma)
4498 continue;
4499
4500
4501
4502
4503
4504
4505 if (iter_vma->vm_flags & VM_MAYSHARE)
4506 continue;
4507
4508
4509
4510
4511
4512
4513
4514
4515 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
4516 unmap_hugepage_range(iter_vma, address,
4517 address + huge_page_size(h), page);
4518 }
4519 i_mmap_unlock_write(mapping);
4520}
4521
4522
4523
4524
4525
4526
4527
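/*
 * hugetlb_cow() must be called with the page lock of the original hugepage
 * held, and with hugetlb_fault_mutex_table held so it cannot race with
 * other fault handlers or page migration.
 */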
4528static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
4529 unsigned long address, pte_t *ptep,
4530 struct page *pagecache_page, spinlock_t *ptl)
4531{
4532 pte_t pte;
4533 struct hstate *h = hstate_vma(vma);
4534 struct page *old_page, *new_page;
4535 int outside_reserve = 0;
4536 vm_fault_t ret = 0;
4537 unsigned long haddr = address & huge_page_mask(h);
4538 struct mmu_notifier_range range;
4539
4540 pte = huge_ptep_get(ptep);
4541 old_page = pte_page(pte);
4542
4543retry_avoidcopy:
4544
4545
4546 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
4547 page_move_anon_rmap(old_page, vma);
4548 set_huge_ptep_writable(vma, haddr, ptep);
4549 return 0;
4550 }
4551
	/*
	 * If the process that created a MAP_PRIVATE mapping is about to COW
	 * due to a shared page count, try to satisfy the allocation without
	 * using the existing reserves.  The pagecache page is used to decide
	 * whether the reserve at this address was already consumed.
	 */
4561 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
4562 old_page != pagecache_page)
4563 outside_reserve = 1;
4564
4565 get_page(old_page);
4566
4567
4568
4569
4570
4571 spin_unlock(ptl);
4572 new_page = alloc_huge_page(vma, haddr, outside_reserve);
4573
4574 if (IS_ERR(new_page)) {
4575
4576
4577
4578
4579
4580
4581
4582 if (outside_reserve) {
4583 struct address_space *mapping = vma->vm_file->f_mapping;
4584 pgoff_t idx;
4585 u32 hash;
4586
4587 put_page(old_page);
4588 BUG_ON(huge_pte_none(pte));
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598 idx = vma_hugecache_offset(h, vma, haddr);
4599 hash = hugetlb_fault_mutex_hash(mapping, idx);
4600 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4601 i_mmap_unlock_read(mapping);
4602
4603 unmap_ref_private(mm, vma, old_page, haddr);
4604
4605 i_mmap_lock_read(mapping);
4606 mutex_lock(&hugetlb_fault_mutex_table[hash]);
4607 spin_lock(ptl);
4608 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
4609 if (likely(ptep &&
4610 pte_same(huge_ptep_get(ptep), pte)))
4611 goto retry_avoidcopy;
4612
4613
4614
4615
4616 return 0;
4617 }
4618
4619 ret = vmf_error(PTR_ERR(new_page));
4620 goto out_release_old;
4621 }
4622
4623
4624
4625
4626
4627 if (unlikely(anon_vma_prepare(vma))) {
4628 ret = VM_FAULT_OOM;
4629 goto out_release_all;
4630 }
4631
4632 copy_user_huge_page(new_page, old_page, address, vma,
4633 pages_per_huge_page(h));
4634 __SetPageUptodate(new_page);
4635
4636 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
4637 haddr + huge_page_size(h));
4638 mmu_notifier_invalidate_range_start(&range);
4639
4640
4641
4642
4643
4644 spin_lock(ptl);
4645 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
4646 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
4647 ClearHPageRestoreReserve(new_page);
4648
4649
4650 huge_ptep_clear_flush(vma, haddr, ptep);
4651 mmu_notifier_invalidate_range(mm, range.start, range.end);
4652 set_huge_pte_at(mm, haddr, ptep,
4653 make_huge_pte(vma, new_page, 1));
4654 page_remove_rmap(old_page, true);
4655 hugepage_add_new_anon_rmap(new_page, vma, haddr);
4656 SetHPageMigratable(new_page);
4657
4658 new_page = old_page;
4659 }
4660 spin_unlock(ptl);
4661 mmu_notifier_invalidate_range_end(&range);
4662out_release_all:
4663
4664 if (new_page != old_page)
4665 restore_reserve_on_error(h, vma, haddr, new_page);
4666 put_page(new_page);
4667out_release_old:
4668 put_page(old_page);
4669
4670 spin_lock(ptl);
4671 return ret;
4672}
4673
4674
4675static struct page *hugetlbfs_pagecache_page(struct hstate *h,
4676 struct vm_area_struct *vma, unsigned long address)
4677{
4678 struct address_space *mapping;
4679 pgoff_t idx;
4680
4681 mapping = vma->vm_file->f_mapping;
4682 idx = vma_hugecache_offset(h, vma, address);
4683
4684 return find_lock_page(mapping, idx);
4685}
4686
4687
4688
4689
4690
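/*
 * Return whether there is a page cache page backing this address within the
 * VMA.  Unlike hugetlbfs_pagecache_page(), the page is not locked.
 */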
4691static bool hugetlbfs_pagecache_present(struct hstate *h,
4692 struct vm_area_struct *vma, unsigned long address)
4693{
4694 struct address_space *mapping;
4695 pgoff_t idx;
4696 struct page *page;
4697
4698 mapping = vma->vm_file->f_mapping;
4699 idx = vma_hugecache_offset(h, vma, address);
4700
4701 page = find_get_page(mapping, idx);
4702 if (page)
4703 put_page(page);
4704 return page != NULL;
4705}
4706
4707int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
4708 pgoff_t idx)
4709{
4710 struct inode *inode = mapping->host;
4711 struct hstate *h = hstate_inode(inode);
4712 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
4713
4714 if (err)
4715 return err;
4716 ClearHPageRestoreReserve(page);
4717
4718
4719
4720
4721
4722 set_page_dirty(page);
4723
4724 spin_lock(&inode->i_lock);
4725 inode->i_blocks += blocks_per_huge_page(h);
4726 spin_unlock(&inode->i_lock);
4727 return 0;
4728}
4729
4730static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
4731 struct address_space *mapping,
4732 pgoff_t idx,
4733 unsigned int flags,
4734 unsigned long haddr,
4735 unsigned long reason)
4736{
4737 vm_fault_t ret;
4738 u32 hash;
4739 struct vm_fault vmf = {
4740 .vma = vma,
4741 .address = haddr,
4742 .flags = flags,
4743
4744
4745
4746
4747
4748
4749
4750
4751 };
4752
4753
4754
4755
4756
4757
4758 hash = hugetlb_fault_mutex_hash(mapping, idx);
4759 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
4760 i_mmap_unlock_read(mapping);
4761 ret = handle_userfault(&vmf, reason);
4762 i_mmap_lock_read(mapping);
4763 mutex_lock(&hugetlb_fault_mutex_table[hash]);
4764
4765 return ret;
4766}
4767
4768static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
4769 struct vm_area_struct *vma,
4770 struct address_space *mapping, pgoff_t idx,
4771 unsigned long address, pte_t *ptep, unsigned int flags)
4772{
4773 struct hstate *h = hstate_vma(vma);
4774 vm_fault_t ret = VM_FAULT_SIGBUS;
4775 int anon_rmap = 0;
4776 unsigned long size;
4777 struct page *page;
4778 pte_t new_pte;
4779 spinlock_t *ptl;
4780 unsigned long haddr = address & huge_page_mask(h);
4781 bool new_page, new_pagecache_page = false;
4782
	/*
	 * We are forced to kill the process if the original mapper has
	 * unmapped pages from the child due to a failed COW; warn so the
	 * situation is not silent.
	 */
4788 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
4789 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
4790 current->pid);
4791 return ret;
4792 }
4793
4794
4795
4796
4797
4798
4799 size = i_size_read(mapping->host) >> huge_page_shift(h);
4800 if (idx >= size)
4801 goto out;
4802
4803retry:
4804 new_page = false;
4805 page = find_lock_page(mapping, idx);
4806 if (!page) {
4807
4808 if (userfaultfd_missing(vma)) {
4809 ret = hugetlb_handle_userfault(vma, mapping, idx,
4810 flags, haddr,
4811 VM_UFFD_MISSING);
4812 goto out;
4813 }
4814
4815 page = alloc_huge_page(vma, haddr, 0);
4816 if (IS_ERR(page)) {
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829 ptl = huge_pte_lock(h, mm, ptep);
4830 ret = 0;
4831 if (huge_pte_none(huge_ptep_get(ptep)))
4832 ret = vmf_error(PTR_ERR(page));
4833 spin_unlock(ptl);
4834 goto out;
4835 }
4836 clear_huge_page(page, address, pages_per_huge_page(h));
4837 __SetPageUptodate(page);
4838 new_page = true;
4839
4840 if (vma->vm_flags & VM_MAYSHARE) {
4841 int err = huge_add_to_page_cache(page, mapping, idx);
4842 if (err) {
4843 put_page(page);
4844 if (err == -EEXIST)
4845 goto retry;
4846 goto out;
4847 }
4848 new_pagecache_page = true;
4849 } else {
4850 lock_page(page);
4851 if (unlikely(anon_vma_prepare(vma))) {
4852 ret = VM_FAULT_OOM;
4853 goto backout_unlocked;
4854 }
4855 anon_rmap = 1;
4856 }
4857 } else {
4858
4859
4860
4861
4862
4863 if (unlikely(PageHWPoison(page))) {
4864 ret = VM_FAULT_HWPOISON_LARGE |
4865 VM_FAULT_SET_HINDEX(hstate_index(h));
4866 goto backout_unlocked;
4867 }
4868
4869
4870 if (userfaultfd_minor(vma)) {
4871 unlock_page(page);
4872 put_page(page);
4873 ret = hugetlb_handle_userfault(vma, mapping, idx,
4874 flags, haddr,
4875 VM_UFFD_MINOR);
4876 goto out;
4877 }
4878 }
4879
4880
4881
4882
4883
4884
4885
4886 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
4887 if (vma_needs_reservation(h, vma, haddr) < 0) {
4888 ret = VM_FAULT_OOM;
4889 goto backout_unlocked;
4890 }
4891
4892 vma_end_reservation(h, vma, haddr);
4893 }
4894
4895 ptl = huge_pte_lock(h, mm, ptep);
4896 ret = 0;
4897 if (!huge_pte_none(huge_ptep_get(ptep)))
4898 goto backout;
4899
4900 if (anon_rmap) {
4901 ClearHPageRestoreReserve(page);
4902 hugepage_add_new_anon_rmap(page, vma, haddr);
4903 } else
4904 page_dup_rmap(page, true);
4905 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
4906 && (vma->vm_flags & VM_SHARED)));
4907 set_huge_pte_at(mm, haddr, ptep, new_pte);
4908
4909 hugetlb_count_add(pages_per_huge_page(h), mm);
4910 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
4911
4912 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
4913 }
4914
4915 spin_unlock(ptl);
4916
4917
4918
4919
4920
4921
4922 if (new_page)
4923 SetHPageMigratable(page);
4924
4925 unlock_page(page);
4926out:
4927 return ret;
4928
4929backout:
4930 spin_unlock(ptl);
4931backout_unlocked:
4932 unlock_page(page);
4933
4934 if (new_page && !new_pagecache_page)
4935 restore_reserve_on_error(h, vma, haddr, page);
4936 put_page(page);
4937 goto out;
4938}
4939
4940#ifdef CONFIG_SMP
4941u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
4942{
4943 unsigned long key[2];
4944 u32 hash;
4945
4946 key[0] = (unsigned long) mapping;
4947 key[1] = idx;
4948
4949 hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
4950
4951 return hash & (num_fault_mutexes - 1);
4952}
4953#else
4954
4955
4956
4957
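/*
 * For uniprocessor (!CONFIG_SMP) kernels a single mutex is enough, so just
 * return index 0 and avoid the hashing overhead.
 */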
4958u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
4959{
4960 return 0;
4961}
4962#endif
4963
4964vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
4965 unsigned long address, unsigned int flags)
4966{
4967 pte_t *ptep, entry;
4968 spinlock_t *ptl;
4969 vm_fault_t ret;
4970 u32 hash;
4971 pgoff_t idx;
4972 struct page *page = NULL;
4973 struct page *pagecache_page = NULL;
4974 struct hstate *h = hstate_vma(vma);
4975 struct address_space *mapping;
4976 int need_wait_lock = 0;
4977 unsigned long haddr = address & huge_page_mask(h);
4978
4979 ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
4980 if (ptep) {
4981
4982
4983
4984
4985
4986 entry = huge_ptep_get(ptep);
4987 if (unlikely(is_hugetlb_entry_migration(entry))) {
4988 migration_entry_wait_huge(vma, mm, ptep);
4989 return 0;
4990 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
4991 return VM_FAULT_HWPOISON_LARGE |
4992 VM_FAULT_SET_HINDEX(hstate_index(h));
4993 }
4994
	/*
	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold it
	 * until finished with ptep.  This prevents huge_pmd_unshare from
	 * invalidating ptep, and synchronizes us with i_size changes during
	 * truncation.  huge_pte_alloc returns the same value as the earlier
	 * huge_pte_offset unless something has changed.
	 */
5006 mapping = vma->vm_file->f_mapping;
5007 i_mmap_lock_read(mapping);
5008 ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
5009 if (!ptep) {
5010 i_mmap_unlock_read(mapping);
5011 return VM_FAULT_OOM;
5012 }
5013
	/*
	 * Serialize hugepage allocation and instantiation so two CPUs
	 * racing to instantiate the same page in the page cache do not see
	 * spurious allocation failures.
	 */
5019 idx = vma_hugecache_offset(h, vma, haddr);
5020 hash = hugetlb_fault_mutex_hash(mapping, idx);
5021 mutex_lock(&hugetlb_fault_mutex_table[hash]);
5022
5023 entry = huge_ptep_get(ptep);
5024 if (huge_pte_none(entry)) {
5025 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
5026 goto out_mutex;
5027 }
5028
5029 ret = 0;
5030
5031
5032
5033
5034
5035
5036
5037
5038 if (!pte_present(entry))
5039 goto out_mutex;
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
5050 if (vma_needs_reservation(h, vma, haddr) < 0) {
5051 ret = VM_FAULT_OOM;
5052 goto out_mutex;
5053 }
5054
5055 vma_end_reservation(h, vma, haddr);
5056
5057 if (!(vma->vm_flags & VM_MAYSHARE))
5058 pagecache_page = hugetlbfs_pagecache_page(h,
5059 vma, haddr);
5060 }
5061
5062 ptl = huge_pte_lock(h, mm, ptep);
5063
5064
5065 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
5066 goto out_ptl;
5067
5068
5069
5070
5071
5072
5073 page = pte_page(entry);
5074 if (page != pagecache_page)
5075 if (!trylock_page(page)) {
5076 need_wait_lock = 1;
5077 goto out_ptl;
5078 }
5079
5080 get_page(page);
5081
5082 if (flags & FAULT_FLAG_WRITE) {
5083 if (!huge_pte_write(entry)) {
5084 ret = hugetlb_cow(mm, vma, address, ptep,
5085 pagecache_page, ptl);
5086 goto out_put_page;
5087 }
5088 entry = huge_pte_mkdirty(entry);
5089 }
5090 entry = pte_mkyoung(entry);
5091 if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
5092 flags & FAULT_FLAG_WRITE))
5093 update_mmu_cache(vma, haddr, ptep);
5094out_put_page:
5095 if (page != pagecache_page)
5096 unlock_page(page);
5097 put_page(page);
5098out_ptl:
5099 spin_unlock(ptl);
5100
5101 if (pagecache_page) {
5102 unlock_page(pagecache_page);
5103 put_page(pagecache_page);
5104 }
5105out_mutex:
5106 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
5107 i_mmap_unlock_read(mapping);
5108
5109
5110
5111
5112
5113
5114
5115 if (need_wait_lock)
5116 wait_on_page_locked(page);
5117 return ret;
5118}
5119
5120#ifdef CONFIG_USERFAULTFD
5121
5122
5123
5124
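/*
 * Used by userfaultfd UFFDIO_COPY/UFFDIO_CONTINUE.  Based on
 * mcopy_atomic_pte with modifications for huge pages.
 */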
5125int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
5126 pte_t *dst_pte,
5127 struct vm_area_struct *dst_vma,
5128 unsigned long dst_addr,
5129 unsigned long src_addr,
5130 enum mcopy_atomic_mode mode,
5131 struct page **pagep)
5132{
5133 bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
5134 struct hstate *h = hstate_vma(dst_vma);
5135 struct address_space *mapping = dst_vma->vm_file->f_mapping;
5136 pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
5137 unsigned long size;
5138 int vm_shared = dst_vma->vm_flags & VM_SHARED;
5139 pte_t _dst_pte;
5140 spinlock_t *ptl;
5141 int ret = -ENOMEM;
5142 struct page *page;
5143 int writable;
5144 bool new_pagecache_page = false;
5145
5146 if (is_continue) {
5147 ret = -EFAULT;
5148 page = find_lock_page(mapping, idx);
5149 if (!page)
5150 goto out;
5151 } else if (!*pagep) {
5152
5153
5154
5155 if (vm_shared &&
5156 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5157 ret = -EEXIST;
5158 goto out;
5159 }
5160
5161 page = alloc_huge_page(dst_vma, dst_addr, 0);
5162 if (IS_ERR(page)) {
5163 ret = -ENOMEM;
5164 goto out;
5165 }
5166
5167 ret = copy_huge_page_from_user(page,
5168 (const void __user *) src_addr,
5169 pages_per_huge_page(h), false);
5170
5171
5172 if (unlikely(ret)) {
5173 ret = -ENOENT;
5174
5175
5176
5177 restore_reserve_on_error(h, dst_vma, dst_addr, page);
5178 put_page(page);
5179
5180
5181
5182
5183 page = alloc_huge_page_vma(h, dst_vma, dst_addr);
5184 if (!page) {
5185 ret = -ENOMEM;
5186 goto out;
5187 }
5188 *pagep = page;
5189
5190
5191
5192
5193 goto out;
5194 }
5195 } else {
5196 if (vm_shared &&
5197 hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
5198 put_page(*pagep);
5199 ret = -EEXIST;
5200 *pagep = NULL;
5201 goto out;
5202 }
5203
5204 page = alloc_huge_page(dst_vma, dst_addr, 0);
5205 if (IS_ERR(page)) {
5206 ret = -ENOMEM;
5207 *pagep = NULL;
5208 goto out;
5209 }
5210 copy_huge_page(page, *pagep);
5211 put_page(*pagep);
5212 *pagep = NULL;
5213 }
5214
5215
5216
5217
5218
5219
5220 __SetPageUptodate(page);
5221
5222
5223 if (vm_shared && !is_continue) {
5224 size = i_size_read(mapping->host) >> huge_page_shift(h);
5225 ret = -EFAULT;
5226 if (idx >= size)
5227 goto out_release_nounlock;
5228
5229
5230
5231
5232
5233
5234
5235 ret = huge_add_to_page_cache(page, mapping, idx);
5236 if (ret)
5237 goto out_release_nounlock;
5238 new_pagecache_page = true;
5239 }
5240
5241 ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
5242 spin_lock(ptl);
5243
	/*
	 * Recheck i_size while holding the page table lock so we never leave
	 * a page mapped beyond end-of-file; remove_inode_hugepages() is
	 * strict about that.  A page left in the page cache past i_size in
	 * the vm_shared case is cleaned up once the caller drops the
	 * hugetlb_fault_mutex_table entry.
	 */
5253 size = i_size_read(mapping->host) >> huge_page_shift(h);
5254 ret = -EFAULT;
5255 if (idx >= size)
5256 goto out_release_unlock;
5257
5258 ret = -EEXIST;
5259 if (!huge_pte_none(huge_ptep_get(dst_pte)))
5260 goto out_release_unlock;
5261
5262 if (vm_shared) {
5263 page_dup_rmap(page, true);
5264 } else {
5265 ClearHPageRestoreReserve(page);
5266 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
5267 }
5268
5269
5270 if (is_continue && !vm_shared)
5271 writable = 0;
5272 else
5273 writable = dst_vma->vm_flags & VM_WRITE;
5274
5275 _dst_pte = make_huge_pte(dst_vma, page, writable);
5276 if (writable)
5277 _dst_pte = huge_pte_mkdirty(_dst_pte);
5278 _dst_pte = pte_mkyoung(_dst_pte);
5279
5280 set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
5281
5282 (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
5283 dst_vma->vm_flags & VM_WRITE);
5284 hugetlb_count_add(pages_per_huge_page(h), dst_mm);
5285
5286
5287 update_mmu_cache(dst_vma, dst_addr, dst_pte);
5288
5289 spin_unlock(ptl);
5290 if (!is_continue)
5291 SetHPageMigratable(page);
5292 if (vm_shared || is_continue)
5293 unlock_page(page);
5294 ret = 0;
5295out:
5296 return ret;
5297out_release_unlock:
5298 spin_unlock(ptl);
5299 if (vm_shared || is_continue)
5300 unlock_page(page);
5301out_release_nounlock:
5302 if (!new_pagecache_page)
5303 restore_reserve_on_error(h, dst_vma, dst_addr, page);
5304 put_page(page);
5305 goto out;
5306}
5307#endif
5308
5309static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
5310 int refs, struct page **pages,
5311 struct vm_area_struct **vmas)
5312{
5313 int nr;
5314
5315 for (nr = 0; nr < refs; nr++) {
5316 if (likely(pages))
5317 pages[nr] = mem_map_offset(page, nr);
5318 if (vmas)
5319 vmas[nr] = vma;
5320 }
5321}
5322
5323long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
5324 struct page **pages, struct vm_area_struct **vmas,
5325 unsigned long *position, unsigned long *nr_pages,
5326 long i, unsigned int flags, int *locked)
5327{
5328 unsigned long pfn_offset;
5329 unsigned long vaddr = *position;
5330 unsigned long remainder = *nr_pages;
5331 struct hstate *h = hstate_vma(vma);
5332 int err = -EFAULT, refs;
5333
5334 while (vaddr < vma->vm_end && remainder) {
5335 pte_t *pte;
5336 spinlock_t *ptl = NULL;
5337 int absent;
5338 struct page *page;
5339
5340
5341
5342
5343
5344 if (fatal_signal_pending(current)) {
5345 remainder = 0;
5346 break;
5347 }
5348
5349
5350
5351
5352
5353
5354
5355
5356 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
5357 huge_page_size(h));
5358 if (pte)
5359 ptl = huge_pte_lock(h, mm, pte);
5360 absent = !pte || huge_pte_none(huge_ptep_get(pte));
5361
5362
5363
5364
5365
5366
5367
5368
5369 if (absent && (flags & FOLL_DUMP) &&
5370 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
5371 if (pte)
5372 spin_unlock(ptl);
5373 remainder = 0;
5374 break;
5375 }
5376
		/*
		 * hugetlb_fault() must be called both for hugepages under
		 * migration (it waits for the migration) and for hwpoisoned
		 * hugepages (to keep the caller from touching them).
		 * is_swap_pte covers both cases, and "page" cannot be
		 * followed directly here because it may already be gone.
		 */
5387 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
5388 ((flags & FOLL_WRITE) &&
5389 !huge_pte_write(huge_ptep_get(pte)))) {
5390 vm_fault_t ret;
5391 unsigned int fault_flags = 0;
5392
5393 if (pte)
5394 spin_unlock(ptl);
5395 if (flags & FOLL_WRITE)
5396 fault_flags |= FAULT_FLAG_WRITE;
5397 if (locked)
5398 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
5399 FAULT_FLAG_KILLABLE;
5400 if (flags & FOLL_NOWAIT)
5401 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
5402 FAULT_FLAG_RETRY_NOWAIT;
5403 if (flags & FOLL_TRIED) {
5404
5405
5406
5407
5408 fault_flags |= FAULT_FLAG_TRIED;
5409 }
5410 ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
5411 if (ret & VM_FAULT_ERROR) {
5412 err = vm_fault_to_errno(ret, flags);
5413 remainder = 0;
5414 break;
5415 }
5416 if (ret & VM_FAULT_RETRY) {
5417 if (locked &&
5418 !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
5419 *locked = 0;
5420 *nr_pages = 0;
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430 return i;
5431 }
5432 continue;
5433 }
5434
5435 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
5436 page = pte_page(huge_ptep_get(pte));
5437
5438
5439
5440
5441
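 /*
  * If subpage information is not requested, update counters and skip
  * straight to the next huge page.
  */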
5442 if (!pages && !vmas && !pfn_offset &&
5443 (vaddr + huge_page_size(h) < vma->vm_end) &&
5444 (remainder >= pages_per_huge_page(h))) {
5445 vaddr += huge_page_size(h);
5446 remainder -= pages_per_huge_page(h);
5447 i += pages_per_huge_page(h);
5448 spin_unlock(ptl);
5449 continue;
5450 }
5451
5452
5453 refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
5454 (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
5455
5456 if (pages || vmas)
5457 record_subpages_vmas(mem_map_offset(page, pfn_offset),
5458 vma, refs,
5459 likely(pages) ? pages + i : NULL,
5460 vmas ? vmas + i : NULL);
5461
5462 if (pages) {
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
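 /*
  * try_grab_compound_head() should normally succeed here: we hold the
  * pte lock and just verified the huge page is present.  Treat a
  * failure (e.g. refcount saturation) as an error.
  */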
5473 if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
5474 refs,
5475 flags))) {
5476 spin_unlock(ptl);
5477 remainder = 0;
5478 err = -ENOMEM;
5479 break;
5480 }
5481 }
5482
5483 vaddr += (refs << PAGE_SHIFT);
5484 remainder -= refs;
5485 i += refs;
5486
5487 spin_unlock(ptl);
5488 }
5489 *nr_pages = remainder;
5490
5491
5492
5493
5494
5495 *position = vaddr;
5496
5497 return i ? i : err;
5498}
5499
5500unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
5501 unsigned long address, unsigned long end, pgprot_t newprot)
5502{
5503 struct mm_struct *mm = vma->vm_mm;
5504 unsigned long start = address;
5505 pte_t *ptep;
5506 pte_t pte;
5507 struct hstate *h = hstate_vma(vma);
5508 unsigned long pages = 0;
5509 bool shared_pmd = false;
5510 struct mmu_notifier_range range;
5511
5512
5513
5514
5515
5516
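 /*
  * In the case of shared PMDs, the range to flush could be beyond
  * start/end.  Set range.start/range.end to cover the maximum possible
  * range if PMD sharing is possible.
  */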
5517 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
5518 0, vma, mm, start, end);
5519 adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
5520
5521 BUG_ON(address >= end);
5522 flush_cache_range(vma, range.start, range.end);
5523
5524 mmu_notifier_invalidate_range_start(&range);
5525 i_mmap_lock_write(vma->vm_file->f_mapping);
5526 for (; address < end; address += huge_page_size(h)) {
5527 spinlock_t *ptl;
5528 ptep = huge_pte_offset(mm, address, huge_page_size(h));
5529 if (!ptep)
5530 continue;
5531 ptl = huge_pte_lock(h, mm, ptep);
5532 if (huge_pmd_unshare(mm, vma, &address, ptep)) {
5533 pages++;
5534 spin_unlock(ptl);
5535 shared_pmd = true;
5536 continue;
5537 }
5538 pte = huge_ptep_get(ptep);
5539 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
5540 spin_unlock(ptl);
5541 continue;
5542 }
5543 if (unlikely(is_hugetlb_entry_migration(pte))) {
5544 swp_entry_t entry = pte_to_swp_entry(pte);
5545
5546 if (is_writable_migration_entry(entry)) {
5547 pte_t newpte;
5548
5549 entry = make_readable_migration_entry(
5550 swp_offset(entry));
5551 newpte = swp_entry_to_pte(entry);
5552 set_huge_swap_pte_at(mm, address, ptep,
5553 newpte, huge_page_size(h));
5554 pages++;
5555 }
5556 spin_unlock(ptl);
5557 continue;
5558 }
5559 if (!huge_pte_none(pte)) {
5560 pte_t old_pte;
5561 unsigned int shift = huge_page_shift(hstate_vma(vma));
5562
5563 old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
5564 pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
5565 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
5566 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
5567 pages++;
5568 }
5569 spin_unlock(ptl);
5570 }
5571
5572
5573
5574
5575
5576
5577
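 /*
  * Must flush the TLB before releasing i_mmap_rwsem: huge_pmd_unshare()
  * may have cleared our pud entry and dropped the page table's
  * reference, so if a pmd was actually unshared, flush the whole range
  * covered by the pud.
  */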
5578 if (shared_pmd)
5579 flush_hugetlb_tlb_range(vma, range.start, range.end);
5580 else
5581 flush_hugetlb_tlb_range(vma, start, end);
5582
5583
5584
5585
5586
5587
5588 i_mmap_unlock_write(vma->vm_file->f_mapping);
5589 mmu_notifier_invalidate_range_end(&range);
5590
5591 return pages << h->order;
5592}
5593
5594
5595bool hugetlb_reserve_pages(struct inode *inode,
5596 long from, long to,
5597 struct vm_area_struct *vma,
5598 vm_flags_t vm_flags)
5599{
5600 long chg, add = -1;
5601 struct hstate *h = hstate_inode(inode);
5602 struct hugepage_subpool *spool = subpool_inode(inode);
5603 struct resv_map *resv_map;
5604 struct hugetlb_cgroup *h_cg = NULL;
5605 long gbl_reserve, regions_needed = 0;
5606
5607
5608 if (from > to) {
5609 VM_WARN(1, "%s called with a negative range\n", __func__);
5610 return false;
5611 }
5612
5613
5614
5615
5616
5617
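 /*
  * Only apply hugepage reservation if asked.  With VM_NORESERVE an
  * attempt will be made at fault time to allocate a page without using
  * reserves.
  */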
5618 if (vm_flags & VM_NORESERVE)
5619 return true;
5620
5621
5622
5623
5624
5625
5626
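 /*
  * Shared mappings base their reservation on the number of pages that
  * are already allocated on behalf of the file.  Private mappings need
  * to reserve the full area, even if read-only, as mprotect() may later
  * make the mapping writable.  Assume !vma is a shm mapping.
  */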
5627 if (!vma || vma->vm_flags & VM_MAYSHARE) {
5628
5629
5630
5631
5632
5633 resv_map = inode_resv_map(inode);
5634
5635 chg = region_chg(resv_map, from, to, &regions_needed);
5636
5637 } else {
5638
5639 resv_map = resv_map_alloc();
5640 if (!resv_map)
5641 return false;
5642
5643 chg = to - from;
5644
5645 set_vma_resv_map(vma, resv_map);
5646 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
5647 }
5648
5649 if (chg < 0)
5650 goto out_err;
5651
5652 if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
5653 chg * pages_per_huge_page(h), &h_cg) < 0)
5654 goto out_err;
5655
5656 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
5657
5658
5659
5660 resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
5661 }
5662
5663
5664
5665
5666
5667
5668 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
5669 if (gbl_reserve < 0)
5670 goto out_uncharge_cgroup;
5671
5672
5673
5674
5675
5676 if (hugetlb_acct_memory(h, gbl_reserve) < 0)
5677 goto out_put_pages;
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690 if (!vma || vma->vm_flags & VM_MAYSHARE) {
5691 add = region_add(resv_map, from, to, regions_needed, h, h_cg);
5692
5693 if (unlikely(add < 0)) {
5694 hugetlb_acct_memory(h, -gbl_reserve);
5695 goto out_put_pages;
5696 } else if (unlikely(chg > add)) {
5697
5698
5699
5700
5701
5702
5703
5704 long rsv_adjust;
5705
5706
5707
5708
5709
5710 hugetlb_cgroup_uncharge_cgroup_rsvd(
5711 hstate_index(h),
5712 (chg - add) * pages_per_huge_page(h), h_cg);
5713
5714 rsv_adjust = hugepage_subpool_put_pages(spool,
5715 chg - add);
5716 hugetlb_acct_memory(h, -rsv_adjust);
5717 } else if (h_cg) {
5718
5719
5720
5721
5722
5723
5724 hugetlb_cgroup_put_rsvd_cgroup(h_cg);
5725 }
5726 }
5727 return true;
5728
5729out_put_pages:
5730
5731 (void)hugepage_subpool_put_pages(spool, chg);
5732out_uncharge_cgroup:
5733 hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
5734 chg * pages_per_huge_page(h), h_cg);
5735out_err:
5736 if (!vma || vma->vm_flags & VM_MAYSHARE)
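 /*
  * Only call region_abort() if the region_chg() above succeeded but
  * region_add() failed or was never reached.
  */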
5737
5738
5739
5740 if (chg >= 0 && add < 0)
5741 region_abort(resv_map, from, to, regions_needed);
5742 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
5743 kref_put(&resv_map->refs, resv_map_release);
5744 return false;
5745}
5746
5747long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
5748 long freed)
5749{
5750 struct hstate *h = hstate_inode(inode);
5751 struct resv_map *resv_map = inode_resv_map(inode);
5752 long chg = 0;
5753 struct hugepage_subpool *spool = subpool_inode(inode);
5754 long gbl_reserve;
5755
5756
5757
5758
5759
5760 if (resv_map) {
5761 chg = region_del(resv_map, start, end);
5762
5763
5764
5765
5766
5767 if (chg < 0)
5768 return chg;
5769 }
5770
5771 spin_lock(&inode->i_lock);
5772 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
5773 spin_unlock(&inode->i_lock);
5774
5775
5776
5777
5778
5779
5780
5781
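 /*
  * If the subpool has a minimum size, the number of global reservations
  * to be released may be adjusted.  Note that !resv_map implies
  * freed == 0, so (chg - freed) cannot go negative.
  */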
5782 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
5783 hugetlb_acct_memory(h, -gbl_reserve);
5784
5785 return 0;
5786}
5787
5788#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
5789static unsigned long page_table_shareable(struct vm_area_struct *svma,
5790 struct vm_area_struct *vma,
5791 unsigned long addr, pgoff_t idx)
5792{
5793 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
5794 svma->vm_start;
5795 unsigned long sbase = saddr & PUD_MASK;
5796 unsigned long s_end = sbase + PUD_SIZE;
5797
5798
5799 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
5800 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
5801
5802
5803
5804
5805
5806 if (pmd_index(addr) != pmd_index(saddr) ||
5807 vm_flags != svm_flags ||
5808 !range_in_vma(svma, sbase, s_end))
5809 return 0;
5810
5811 return saddr;
5812}
5813
5814static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
5815{
5816 unsigned long base = addr & PUD_MASK;
5817 unsigned long end = base + PUD_SIZE;
5818
5819
5820
5821
5822 if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
5823 return true;
5824 return false;
5825}
5826
5827bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
5828{
5829#ifdef CONFIG_USERFAULTFD
5830 if (uffd_disable_huge_pmd_share(vma))
5831 return false;
5832#endif
5833 return vma_shareable(vma, addr);
5834}
5835
5836
5837
5838
5839
5840
5841void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
5842 unsigned long *start, unsigned long *end)
5843{
5844 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
5845 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
5846
5847
5848
5849
5850
5851 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
5852 (*end <= v_start) || (*start >= v_end))
5853 return;
5854
5855
5856 if (*start > v_start)
5857 *start = ALIGN_DOWN(*start, PUD_SIZE);
5858
5859 if (*end < v_end)
5860 *end = ALIGN(*end, PUD_SIZE);
5861}
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
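/*
 * Search for a shareable pmd page for hugetlb.  In any case it calls
 * pmd_alloc() and returns the corresponding pte.  The caller must hold
 * i_mmap_rwsem in at least read mode (asserted below), because sharing is set
 * up based on existing page table entries of other mappings of the same file.
 */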
5880pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
5881 unsigned long addr, pud_t *pud)
5882{
5883 struct address_space *mapping = vma->vm_file->f_mapping;
5884 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
5885 vma->vm_pgoff;
5886 struct vm_area_struct *svma;
5887 unsigned long saddr;
5888 pte_t *spte = NULL;
5889 pte_t *pte;
5890 spinlock_t *ptl;
5891
5892 i_mmap_assert_locked(mapping);
5893 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
5894 if (svma == vma)
5895 continue;
5896
5897 saddr = page_table_shareable(svma, vma, addr, idx);
5898 if (saddr) {
5899 spte = huge_pte_offset(svma->vm_mm, saddr,
5900 vma_mmu_pagesize(svma));
5901 if (spte) {
5902 get_page(virt_to_page(spte));
5903 break;
5904 }
5905 }
5906 }
5907
5908 if (!spte)
5909 goto out;
5910
5911 ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
5912 if (pud_none(*pud)) {
5913 pud_populate(mm, pud,
5914 (pmd_t *)((unsigned long)spte & PAGE_MASK));
5915 mm_inc_nr_pmds(mm);
5916 } else {
5917 put_page(virt_to_page(spte));
5918 }
5919 spin_unlock(ptl);
5920out:
5921 pte = (pte_t *)pmd_alloc(mm, pud, addr);
5922 return pte;
5923}
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
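/*
 * Unmap a huge page backed by a shared pte.  The pte page is refcounted at
 * map time: if page_count > 1 the pmd is shared, so clear the pud and drop
 * the reference; if page_count == 1 it is not shared and nothing is done.
 *
 * Called with the page table lock held and i_mmap_rwsem held in write mode.
 *
 * Returns 1 if a shared pte page was unmapped, 0 otherwise.
 */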
5937int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
5938 unsigned long *addr, pte_t *ptep)
5939{
5940 pgd_t *pgd = pgd_offset(mm, *addr);
5941 p4d_t *p4d = p4d_offset(pgd, *addr);
5942 pud_t *pud = pud_offset(p4d, *addr);
5943
5944 i_mmap_assert_write_locked(vma->vm_file->f_mapping);
5945 BUG_ON(page_count(virt_to_page(ptep)) == 0);
5946 if (page_count(virt_to_page(ptep)) == 1)
5947 return 0;
5948
5949 pud_clear(pud);
5950 put_page(virt_to_page(ptep));
5951 mm_dec_nr_pmds(mm);
5952 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
5953 return 1;
5954}
5955
5956#else
5957pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
5958 unsigned long addr, pud_t *pud)
5959{
5960 return NULL;
5961}
5962
5963int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
5964 unsigned long *addr, pte_t *ptep)
5965{
5966 return 0;
5967}
5968
5969void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
5970 unsigned long *start, unsigned long *end)
5971{
5972}
5973
5974bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
5975{
5976 return false;
5977}
5978#endif
5979
5980#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
5981pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
5982 unsigned long addr, unsigned long sz)
5983{
5984 pgd_t *pgd;
5985 p4d_t *p4d;
5986 pud_t *pud;
5987 pte_t *pte = NULL;
5988
5989 pgd = pgd_offset(mm, addr);
5990 p4d = p4d_alloc(mm, pgd, addr);
5991 if (!p4d)
5992 return NULL;
5993 pud = pud_alloc(mm, p4d, addr);
5994 if (pud) {
5995 if (sz == PUD_SIZE) {
5996 pte = (pte_t *)pud;
5997 } else {
5998 BUG_ON(sz != PMD_SIZE);
5999 if (want_pmd_share(vma, addr) && pud_none(*pud))
6000 pte = huge_pmd_share(mm, vma, addr, pud);
6001 else
6002 pte = (pte_t *)pmd_alloc(mm, pud, addr);
6003 }
6004 }
6005 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
6006
6007 return pte;
6008}
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
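/*
 * huge_pte_offset() - walk the page table to resolve the hugepage entry at
 * @addr.  Returns a pointer to the PUD or PMD entry for @addr, or NULL if a
 * !p*d_present() entry is encountered before reaching the level implied by
 * @sz.
 */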
6019pte_t *huge_pte_offset(struct mm_struct *mm,
6020 unsigned long addr, unsigned long sz)
6021{
6022 pgd_t *pgd;
6023 p4d_t *p4d;
6024 pud_t *pud;
6025 pmd_t *pmd;
6026
6027 pgd = pgd_offset(mm, addr);
6028 if (!pgd_present(*pgd))
6029 return NULL;
6030 p4d = p4d_offset(pgd, addr);
6031 if (!p4d_present(*p4d))
6032 return NULL;
6033
6034 pud = pud_offset(p4d, addr);
6035 if (sz == PUD_SIZE)
6036
6037 return (pte_t *)pud;
6038 if (!pud_present(*pud))
6039 return NULL;
6040
6041
6042 pmd = pmd_offset(pud, addr);
6043
6044 return (pte_t *)pmd;
6045}
6046
6047#endif
6048
6049
6050
6051
6052
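/*
 * These follow_huge_*() functions are weak and may be overridden by an
 * architecture that needs its own behaviour.
 */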
6053struct page * __weak
6054follow_huge_addr(struct mm_struct *mm, unsigned long address,
6055 int write)
6056{
6057 return ERR_PTR(-EINVAL);
6058}
6059
6060struct page * __weak
6061follow_huge_pd(struct vm_area_struct *vma,
6062 unsigned long address, hugepd_t hpd, int flags, int pdshift)
6063{
6064 WARN(1, "hugepd follow called with no support for hugepage directory format\n");
6065 return NULL;
6066}
6067
6068struct page * __weak
6069follow_huge_pmd(struct mm_struct *mm, unsigned long address,
6070 pmd_t *pmd, int flags)
6071{
6072 struct page *page = NULL;
6073 spinlock_t *ptl;
6074 pte_t pte;
6075
6076
6077 if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
6078 (FOLL_PIN | FOLL_GET)))
6079 return NULL;
6080
6081retry:
6082 ptl = pmd_lockptr(mm, pmd);
6083 spin_lock(ptl);
6084
6085
6086
6087
6088 if (!pmd_huge(*pmd))
6089 goto out;
6090 pte = huge_ptep_get((pte_t *)pmd);
6091 if (pte_present(pte)) {
6092 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
6093
6094
6095
6096
6097
6098
6099
6100
6101 if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
6102 page = NULL;
6103 goto out;
6104 }
6105 } else {
6106 if (is_hugetlb_entry_migration(pte)) {
6107 spin_unlock(ptl);
6108 __migration_entry_wait(mm, (pte_t *)pmd, ptl);
6109 goto retry;
6110 }
6111
6112
6113
6114
6115 }
6116out:
6117 spin_unlock(ptl);
6118 return page;
6119}
6120
6121struct page * __weak
6122follow_huge_pud(struct mm_struct *mm, unsigned long address,
6123 pud_t *pud, int flags)
6124{
6125 if (flags & (FOLL_GET | FOLL_PIN))
6126 return NULL;
6127
6128 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
6129}
6130
6131struct page * __weak
6132follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
6133{
6134 if (flags & (FOLL_GET | FOLL_PIN))
6135 return NULL;
6136
6137 return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
6138}
6139
6140bool isolate_huge_page(struct page *page, struct list_head *list)
6141{
6142 bool ret = true;
6143
6144 spin_lock_irq(&hugetlb_lock);
6145 if (!PageHeadHuge(page) ||
6146 !HPageMigratable(page) ||
6147 !get_page_unless_zero(page)) {
6148 ret = false;
6149 goto unlock;
6150 }
6151 ClearHPageMigratable(page);
6152 list_move_tail(&page->lru, list);
6153unlock:
6154 spin_unlock_irq(&hugetlb_lock);
6155 return ret;
6156}
6157
6158int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
6159{
6160 int ret = 0;
6161
6162 *hugetlb = false;
6163 spin_lock_irq(&hugetlb_lock);
6164 if (PageHeadHuge(page)) {
6165 *hugetlb = true;
6166 if (HPageFreed(page) || HPageMigratable(page))
6167 ret = get_page_unless_zero(page);
6168 else
6169 ret = -EBUSY;
6170 }
6171 spin_unlock_irq(&hugetlb_lock);
6172 return ret;
6173}
6174
6175void putback_active_hugepage(struct page *page)
6176{
6177 spin_lock_irq(&hugetlb_lock);
6178 SetHPageMigratable(page);
6179 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
6180 spin_unlock_irq(&hugetlb_lock);
6181 put_page(page);
6182}
6183
6184void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
6185{
6186 struct hstate *h = page_hstate(oldpage);
6187
6188 hugetlb_cgroup_migrate(oldpage, newpage);
6189 set_page_owner_migrate_reason(newpage, reason);
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
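 /*
  * Transfer the temporary flag from the new page to the old one: the new
  * page is becoming the final page while the old one will be freed, so
  * it takes over the temporary status.  The per-node surplus counts are
  * moved as well so they keep matching the global surplus count.
  */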
6201 if (HPageTemporary(newpage)) {
6202 int old_nid = page_to_nid(oldpage);
6203 int new_nid = page_to_nid(newpage);
6204
6205 SetHPageTemporary(oldpage);
6206 ClearHPageTemporary(newpage);
6207
6208
6209
6210
6211
6212 if (new_nid == old_nid)
6213 return;
6214 spin_lock_irq(&hugetlb_lock);
6215 if (h->surplus_huge_pages_node[old_nid]) {
6216 h->surplus_huge_pages_node[old_nid]--;
6217 h->surplus_huge_pages_node[new_nid]++;
6218 }
6219 spin_unlock_irq(&hugetlb_lock);
6220 }
6221}
6222
6223
6224
6225
6226
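/*
 * Unconditionally unshare all shared pmd page table entries within this
 * hugetlbfs VMA's PUD-aligned range.
 */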
6227void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
6228{
6229 struct hstate *h = hstate_vma(vma);
6230 unsigned long sz = huge_page_size(h);
6231 struct mm_struct *mm = vma->vm_mm;
6232 struct mmu_notifier_range range;
6233 unsigned long address, start, end;
6234 spinlock_t *ptl;
6235 pte_t *ptep;
6236
6237 if (!(vma->vm_flags & VM_MAYSHARE))
6238 return;
6239
6240 start = ALIGN(vma->vm_start, PUD_SIZE);
6241 end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6242
6243 if (start >= end)
6244 return;
6245
6246
6247
6248
6249
6250 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
6251 start, end);
6252 mmu_notifier_invalidate_range_start(&range);
6253 i_mmap_lock_write(vma->vm_file->f_mapping);
6254 for (address = start; address < end; address += PUD_SIZE) {
6255 unsigned long tmp = address;
6256
6257 ptep = huge_pte_offset(mm, address, sz);
6258 if (!ptep)
6259 continue;
6260 ptl = huge_pte_lock(h, mm, ptep);
6261
6262 huge_pmd_unshare(mm, vma, &tmp, ptep);
6263 spin_unlock(ptl);
6264 }
6265 flush_hugetlb_tlb_range(vma, start, end);
6266 i_mmap_unlock_write(vma->vm_file->f_mapping);
6267
6268
6269
6270
6271 mmu_notifier_invalidate_range_end(&range);
6272}
6273
6274#ifdef CONFIG_CMA
6275static bool cma_reserve_called __initdata;
6276
6277static int __init cmdline_parse_hugetlb_cma(char *p)
6278{
6279 hugetlb_cma_size = memparse(p, &p);
6280 return 0;
6281}
6282
6283early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
6284
6285void __init hugetlb_cma_reserve(int order)
6286{
6287 unsigned long size, reserved, per_node;
6288 int nid;
6289
6290 cma_reserve_called = true;
6291
6292 if (!hugetlb_cma_size)
6293 return;
6294
6295 if (hugetlb_cma_size < (PAGE_SIZE << order)) {
6296 pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
6297 (PAGE_SIZE << order) / SZ_1M);
6298 return;
6299 }
6300
6301
6302
6303
6304
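 /*
  * Split the request roughly evenly across the online nodes, rounding
  * each node's share up to the gigantic page size; e.g. a 3 GB request
  * on a 4-node machine ends up as 1 GB on each of the first three nodes.
  */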
6305 per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
6306 pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
6307 hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
6308
6309 reserved = 0;
6310 for_each_node_state(nid, N_ONLINE) {
6311 int res;
6312 char name[CMA_MAX_NAME];
6313
6314 size = min(per_node, hugetlb_cma_size - reserved);
6315 size = round_up(size, PAGE_SIZE << order);
6316
6317 snprintf(name, sizeof(name), "hugetlb%d", nid);
6318 res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
6319 0, false, name,
6320 &hugetlb_cma[nid], nid);
6321 if (res) {
6322 pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
6323 res, nid);
6324 continue;
6325 }
6326
6327 reserved += size;
6328 pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
6329 size / SZ_1M, nid);
6330
6331 if (reserved >= hugetlb_cma_size)
6332 break;
6333 }
6334}
6335
6336void __init hugetlb_cma_check(void)
6337{
6338 if (!hugetlb_cma_size || cma_reserve_called)
6339 return;
6340
6341 pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
6342}
6343
6344#endif
6345