/*
 * Generic hugetlb support.
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/page-isolation.h>
#include <linux/jhash.h>

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include "internal.h"

int hugepages_treat_as_movable;

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
/*
 * Minimum page order among possible hugepage sizes, set to a proper value
 * at boot time.
 */
static unsigned int minimum_order __read_mostly = UINT_MAX;

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used and no other handles to the subpool remain,
	 * free the subpool.  Any reservation taken for the subpool minimum
	 * size is returned to the global pool first. */
	if (free) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;

	return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool);
}

/*
 * Subpool accounting for allocating and reserving pages.
 * Returns the number of pages that must be charged to the global pool (or
 * error) to satisfy an allocation of 'delta' pages from the subpool.  The
 * return value may only be different than the passed value (delta) in the
 * case where a subpool minimum size must be maintained, in which case some
 * or all of the pages are already covered by the subpool's reserve.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				      long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->rsv_hpages) {
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock(&spool->lock);
	return ret;
}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Returns the number of global pool pages that the caller should release
 * when freeing 'delta' pages from the subpool.  The return value may only
 * be different than the passed value (delta) in the case where a subpool
 * minimum size must be maintained and some of the freed pages are retained
 * to replenish the subpool's reserve.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return delta;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	/* minimum size accounting */
	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * Drops the subpool lock.  If the subpool has no remaining users
	 * and no pages in use, it is freed here as well.
	 */
	unlock_or_release_subpool(spool);

	return ret;
}

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region.  This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
struct file_region {
	struct list_head link;
	long from;
	long to;
};
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
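/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  In the normal case, existing regions will be expanded
 * to accommodate the specified range.  Sufficient regions should
 * exist in the cache due to the previous call to region_chg with
 * the same range.  However, it is possible that region_del could
 * have been called after region_chg and modified the map in such
 * a way that no region exists to be expanded.  In this case, pull
 * a region descriptor from the cache associated with the map and
 * use that for the new range.
 *
 * Return the number of new huge pages added to the map.  This
 * number is greater than or equal to zero.
 */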
257static long region_add(struct resv_map *resv, long f, long t)
258{
259 struct list_head *head = &resv->regions;
260 struct file_region *rg, *nrg, *trg;
261 long add = 0;
262
263 spin_lock(&resv->lock);
264
265 list_for_each_entry(rg, head, link)
266 if (f <= rg->to)
267 break;
268
269
270
271
272
273
274
275 if (&rg->link == head || t < rg->from) {
276 VM_BUG_ON(resv->region_cache_count <= 0);
277
278 resv->region_cache_count--;
279 nrg = list_first_entry(&resv->region_cache, struct file_region,
280 link);
281 list_del(&nrg->link);
282
283 nrg->from = f;
284 nrg->to = t;
285 list_add(&nrg->link, rg->link.prev);
286
287 add += t - f;
288 goto out_locked;
289 }
290
291
292 if (f > rg->from)
293 f = rg->from;
294
295
296 nrg = rg;
297 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
298 if (&rg->link == head)
299 break;
300 if (rg->from > t)
301 break;
302
303
304
305
306 if (rg->to > t)
307 t = rg->to;
308 if (rg != nrg) {
309
310
311
312
313 add -= (rg->to - rg->from);
314 list_del(&rg->link);
315 kfree(rg);
316 }
317 }
318
319 add += (nrg->from - f);
320 nrg->from = f;
321 add += t - nrg->to;
322 nrg->to = t;
323
324out_locked:
325 resv->adds_in_progress--;
326 spin_unlock(&resv->lock);
327 VM_BUG_ON(add < 0);
328 return add;
329}
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
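/*
 * Examine the existing reserve map and determine how many
 * huge pages in the specified range [f, t) are NOT currently
 * represented.  This routine is called before a subsequent
 * call to region_add that will actually modify the reserve
 * map to add the specified range [f, t).  region_chg does
 * not change the number of huge pages represented by the
 * map.  However, it does increment the adds_in_progress
 * counter and, if necessary, grow the region descriptor cache
 * so that the follow-on region_add or region_abort call is
 * guaranteed to succeed without allocating.
 *
 * Returns the number of huge pages that need to be added to
 * the existing reservation map for the range [f, t), or -ENOMEM
 * if a required region descriptor could not be allocated.
 */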
353static long region_chg(struct resv_map *resv, long f, long t)
354{
355 struct list_head *head = &resv->regions;
356 struct file_region *rg, *nrg = NULL;
357 long chg = 0;
358
359retry:
360 spin_lock(&resv->lock);
361retry_locked:
362 resv->adds_in_progress++;
363
364
365
366
367
368 if (resv->adds_in_progress > resv->region_cache_count) {
369 struct file_region *trg;
370
371 VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
372
373 resv->adds_in_progress--;
374 spin_unlock(&resv->lock);
375
376 trg = kmalloc(sizeof(*trg), GFP_KERNEL);
377 if (!trg) {
378 kfree(nrg);
379 return -ENOMEM;
380 }
381
382 spin_lock(&resv->lock);
383 list_add(&trg->link, &resv->region_cache);
384 resv->region_cache_count++;
385 goto retry_locked;
386 }
387
388
389 list_for_each_entry(rg, head, link)
390 if (f <= rg->to)
391 break;
392
393
394
395
396 if (&rg->link == head || t < rg->from) {
397 if (!nrg) {
398 resv->adds_in_progress--;
399 spin_unlock(&resv->lock);
400 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
401 if (!nrg)
402 return -ENOMEM;
403
404 nrg->from = f;
405 nrg->to = f;
406 INIT_LIST_HEAD(&nrg->link);
407 goto retry;
408 }
409
410 list_add(&nrg->link, rg->link.prev);
411 chg = t - f;
412 goto out_nrg;
413 }
414
415
416 if (f > rg->from)
417 f = rg->from;
418 chg = t - f;
419
420
421 list_for_each_entry(rg, rg->link.prev, link) {
422 if (&rg->link == head)
423 break;
424 if (rg->from > t)
425 goto out;
426
427
428
429
430 if (rg->to > t) {
431 chg += rg->to - t;
432 t = rg->to;
433 }
434 chg -= rg->to - rg->from;
435 }
436
437out:
438 spin_unlock(&resv->lock);
439
440 kfree(nrg);
441 return chg;
442out_nrg:
443 spin_unlock(&resv->lock);
444 return chg;
445}
446
447
448
449
450
451
452
453
454
455
456
457
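/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier.
 */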
458static void region_abort(struct resv_map *resv, long f, long t)
459{
460 spin_lock(&resv->lock);
461 VM_BUG_ON(!resv->region_cache_count);
462 resv->adds_in_progress--;
463 spin_unlock(&resv->lock);
464}
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
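/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split and a new region descriptor
 * can not be allocated, -ENOMEM is returned.
 */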
480static long region_del(struct resv_map *resv, long f, long t)
481{
482 struct list_head *head = &resv->regions;
483 struct file_region *rg, *trg;
484 struct file_region *nrg = NULL;
485 long del = 0;
486
487retry:
488 spin_lock(&resv->lock);
489 list_for_each_entry_safe(rg, trg, head, link) {
490
491
492
493
494
495
496
497 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
498 continue;
499
500 if (rg->from >= t)
501 break;
502
503 if (f > rg->from && t < rg->to) {
504
505
506
507
508 if (!nrg &&
509 resv->region_cache_count > resv->adds_in_progress) {
510 nrg = list_first_entry(&resv->region_cache,
511 struct file_region,
512 link);
513 list_del(&nrg->link);
514 resv->region_cache_count--;
515 }
516
517 if (!nrg) {
518 spin_unlock(&resv->lock);
519 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
520 if (!nrg)
521 return -ENOMEM;
522 goto retry;
523 }
524
525 del += t - f;
526
527
528 nrg->from = t;
529 nrg->to = rg->to;
530 INIT_LIST_HEAD(&nrg->link);
531
532
533 rg->to = f;
534
535 list_add(&nrg->link, &rg->link);
536 nrg = NULL;
537 break;
538 }
539
540 if (f <= rg->from && t >= rg->to) {
541 del += rg->to - rg->from;
542 list_del(&rg->link);
543 kfree(rg);
544 continue;
545 }
546
547 if (f <= rg->from) {
548 del += t - rg->from;
549 rg->from = t;
550 } else {
551 del += rg->to - f;
552 rg->to = f;
553 }
554 }
555
556 spin_unlock(&resv->lock);
557 kfree(nrg);
558 return del;
559}

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reservation map region for a page.  The huge page itself was freed
 * and removed from the page cache.  This routine will adjust the subpool
 * usage count, and the global reserve count if needed.  By incrementing
 * these counts, the subpool and global reserve usage counts will be
 * decremented by the same amount that they would have been decremented
 * if the page had been freed with its reservation intact.
 */
void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
{
	struct hugepage_subpool *spool = subpool_inode(inode);
	long rsv_adjust;

	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
	if (restore_reserve && rsv_adjust) {
		struct hstate *h = hstate_inode(inode);

		hugetlb_acct_memory(h, 1);
	}
}

/*
 * Count and return the number of huge pages in the reserve map
 * that intersect with the range [f, t).
 */
static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}
	spin_unlock(&resv->lock);

	return chg;
}

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}

pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	return vma_hugecache_offset(hstate_vma(vma), vma, address);
}
EXPORT_SYMBOL_GPL(linear_hugepage_index);

/*
 * Return the size of the pages allocated when backing a VMA.  In the
 * majority of cases this will be the same size as used by the page table
 * entries.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	struct hstate *hstate;

	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	hstate = hstate_vma(vma);

	return 1UL << huge_page_shift(hstate);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA.  In the
 * majority of cases, the page size used by the kernel matches the MMU
 * size.  Architectures where it differs provide their own definition of
 * vma_mmu_pagesize, and this generic version is then unused.
 */
#ifndef vma_mmu_pagesize
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}
#endif

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping.  Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held.  It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file, and this region map represents the backing file
 * pages which have ever had a reservation assigned; this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}
700
701struct resv_map *resv_map_alloc(void)
702{
703 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
704 struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
705
706 if (!resv_map || !rg) {
707 kfree(resv_map);
708 kfree(rg);
709 return NULL;
710 }
711
712 kref_init(&resv_map->refs);
713 spin_lock_init(&resv_map->lock);
714 INIT_LIST_HEAD(&resv_map->regions);
715
716 resv_map->adds_in_progress = 0;
717
718 INIT_LIST_HEAD(&resv_map->region_cache);
719 list_add(&rg->link, &resv_map->region_cache);
720 resv_map->region_cache_count = 1;
721
722 return resv_map;
723}
724
725void resv_map_release(struct kref *ref)
726{
727 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
728 struct list_head *head = &resv_map->region_cache;
729 struct file_region *rg, *trg;
730
731
732 region_del(resv_map, 0, LONG_MAX);
733
734
735 list_for_each_entry_safe(rg, trg, head, link) {
736 list_del(&rg->link);
737 kfree(rg);
738 }
739
740 VM_BUG_ON(resv_map->adds_in_progress);
741
742 kfree(resv_map);
743}
744
745static inline struct resv_map *inode_resv_map(struct inode *inode)
746{
747 return inode->i_mapping->private_data;
748}
749
750static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
751{
752 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
753 if (vma->vm_flags & VM_MAYSHARE) {
754 struct address_space *mapping = vma->vm_file->f_mapping;
755 struct inode *inode = mapping->host;
756
757 return inode_resv_map(inode);
758
759 } else {
760 return (struct resv_map *)(get_vma_private_data(vma) &
761 ~HPAGE_RESV_MASK);
762 }
763}
764
765static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
766{
767 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
768 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
769
770 set_vma_private_data(vma, (get_vma_private_data(vma) &
771 HPAGE_RESV_MASK) | (unsigned long)map);
772}
773
774static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
775{
776 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
777 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
778
779 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
780}
781
782static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
783{
784 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
785
786 return (get_vma_private_data(vma) & flag) != 0;
787}
788
789
790void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
791{
792 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
793 if (!(vma->vm_flags & VM_MAYSHARE))
794 vma->vm_private_data = (void *)0;
795}
796
797
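/* Returns true if the VMA has associated reserve pages */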
798static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
799{
800 if (vma->vm_flags & VM_NORESERVE) {
801
802
803
804
805
806
807
808
809
810 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
811 return true;
812 else
813 return false;
814 }
815
816
817 if (vma->vm_flags & VM_MAYSHARE) {
818
819
820
821
822
823
824
825 if (chg)
826 return false;
827 else
828 return true;
829 }
830
831
832
833
834
835 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851 if (chg)
852 return false;
853 else
854 return true;
855 }
856
857 return false;
858}
859
860static void enqueue_huge_page(struct hstate *h, struct page *page)
861{
862 int nid = page_to_nid(page);
863 list_move(&page->lru, &h->hugepage_freelists[nid]);
864 h->free_huge_pages++;
865 h->free_huge_pages_node[nid]++;
866}
867
868static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
869{
870 struct page *page;
871
872 list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
873 if (!is_migrate_isolate_page(page))
874 break;
875
876
877
878
879 if (&h->hugepage_freelists[nid] == &page->lru)
880 return NULL;
881 list_move(&page->lru, &h->hugepage_activelist);
882 set_page_refcounted(page);
883 h->free_huge_pages--;
884 h->free_huge_pages_node[nid]--;
885 return page;
886}
887
888
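/* Movability of hugepages depends on migration support. */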
889static inline gfp_t htlb_alloc_mask(struct hstate *h)
890{
891 if (hugepages_treat_as_movable || hugepage_migration_supported(h))
892 return GFP_HIGHUSER_MOVABLE;
893 else
894 return GFP_HIGHUSER;
895}
896
897static struct page *dequeue_huge_page_vma(struct hstate *h,
898 struct vm_area_struct *vma,
899 unsigned long address, int avoid_reserve,
900 long chg)
901{
902 struct page *page = NULL;
903 struct mempolicy *mpol;
904 nodemask_t *nodemask;
905 struct zonelist *zonelist;
906 struct zone *zone;
907 struct zoneref *z;
908 unsigned int cpuset_mems_cookie;
909
910
911
912
913
914
915 if (!vma_has_reserves(vma, chg) &&
916 h->free_huge_pages - h->resv_huge_pages == 0)
917 goto err;
918
919
920 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
921 goto err;
922
923retry_cpuset:
924 cpuset_mems_cookie = read_mems_allowed_begin();
925 zonelist = huge_zonelist(vma, address,
926 htlb_alloc_mask(h), &mpol, &nodemask);
927
928 for_each_zone_zonelist_nodemask(zone, z, zonelist,
929 MAX_NR_ZONES - 1, nodemask) {
930 if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
931 page = dequeue_huge_page_node(h, zone_to_nid(zone));
932 if (page) {
933 if (avoid_reserve)
934 break;
935 if (!vma_has_reserves(vma, chg))
936 break;
937
938 SetPagePrivate(page);
939 h->resv_huge_pages--;
940 break;
941 }
942 }
943 }
944
945 mpol_cond_put(mpol);
946 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
947 goto retry_cpuset;
948 return page;
949
950err:
951 return NULL;
952}
953
954
955
956
957
958
959
960
961static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
962{
963 nid = next_node_in(nid, *nodes_allowed);
964 VM_BUG_ON(nid >= MAX_NUMNODES);
965
966 return nid;
967}
968
969static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
970{
971 if (!node_isset(nid, *nodes_allowed))
972 nid = next_node_allowed(nid, nodes_allowed);
973 return nid;
974}
975
976
977
978
979
980
981
982static int hstate_next_node_to_alloc(struct hstate *h,
983 nodemask_t *nodes_allowed)
984{
985 int nid;
986
987 VM_BUG_ON(!nodes_allowed);
988
989 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
990 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
991
992 return nid;
993}
994
995
996
997
998
999
1000
1001static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
1002{
1003 int nid;
1004
1005 VM_BUG_ON(!nodes_allowed);
1006
1007 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1008 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1009
1010 return nid;
1011}
1012
1013#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
1014 for (nr_nodes = nodes_weight(*mask); \
1015 nr_nodes > 0 && \
1016 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
1017 nr_nodes--)
1018
1019#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
1020 for (nr_nodes = nodes_weight(*mask); \
1021 nr_nodes > 0 && \
1022 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
1023 nr_nodes--)
1024
1025#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
1026 ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
1027 defined(CONFIG_CMA))
1028static void destroy_compound_gigantic_page(struct page *page,
1029 unsigned int order)
1030{
1031 int i;
1032 int nr_pages = 1 << order;
1033 struct page *p = page + 1;
1034
1035 atomic_set(compound_mapcount_ptr(page), 0);
1036 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1037 clear_compound_head(p);
1038 set_page_refcounted(p);
1039 }
1040
1041 set_compound_order(page, 0);
1042 __ClearPageHead(page);
1043}
1044
1045static void free_gigantic_page(struct page *page, unsigned int order)
1046{
1047 free_contig_range(page_to_pfn(page), 1 << order);
1048}
1049
1050static int __alloc_gigantic_page(unsigned long start_pfn,
1051 unsigned long nr_pages)
1052{
1053 unsigned long end_pfn = start_pfn + nr_pages;
1054 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1055}
1056
1057static bool pfn_range_valid_gigantic(struct zone *z,
1058 unsigned long start_pfn, unsigned long nr_pages)
1059{
1060 unsigned long i, end_pfn = start_pfn + nr_pages;
1061 struct page *page;
1062
1063 for (i = start_pfn; i < end_pfn; i++) {
1064 if (!pfn_valid(i))
1065 return false;
1066
1067 page = pfn_to_page(i);
1068
1069 if (page_zone(page) != z)
1070 return false;
1071
1072 if (PageReserved(page))
1073 return false;
1074
1075 if (page_count(page) > 0)
1076 return false;
1077
1078 if (PageHuge(page))
1079 return false;
1080 }
1081
1082 return true;
1083}
1084
1085static bool zone_spans_last_pfn(const struct zone *zone,
1086 unsigned long start_pfn, unsigned long nr_pages)
1087{
1088 unsigned long last_pfn = start_pfn + nr_pages - 1;
1089 return zone_spans_pfn(zone, last_pfn);
1090}
1091
1092static struct page *alloc_gigantic_page(int nid, unsigned int order)
1093{
1094 unsigned long nr_pages = 1 << order;
1095 unsigned long ret, pfn, flags;
1096 struct zone *z;
1097
1098 z = NODE_DATA(nid)->node_zones;
1099 for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
1100 spin_lock_irqsave(&z->lock, flags);
1101
1102 pfn = ALIGN(z->zone_start_pfn, nr_pages);
1103 while (zone_spans_last_pfn(z, pfn, nr_pages)) {
1104 if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
1105
1106
1107
1108
1109
1110
1111
1112 spin_unlock_irqrestore(&z->lock, flags);
1113 ret = __alloc_gigantic_page(pfn, nr_pages);
1114 if (!ret)
1115 return pfn_to_page(pfn);
1116 spin_lock_irqsave(&z->lock, flags);
1117 }
1118 pfn += nr_pages;
1119 }
1120
1121 spin_unlock_irqrestore(&z->lock, flags);
1122 }
1123
1124 return NULL;
1125}
1126
1127static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
1128static void prep_compound_gigantic_page(struct page *page, unsigned int order);
1129
1130static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
1131{
1132 struct page *page;
1133
1134 page = alloc_gigantic_page(nid, huge_page_order(h));
1135 if (page) {
1136 prep_compound_gigantic_page(page, huge_page_order(h));
1137 prep_new_huge_page(h, page, nid);
1138 }
1139
1140 return page;
1141}
1142
1143static int alloc_fresh_gigantic_page(struct hstate *h,
1144 nodemask_t *nodes_allowed)
1145{
1146 struct page *page = NULL;
1147 int nr_nodes, node;
1148
1149 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1150 page = alloc_fresh_gigantic_page_node(h, node);
1151 if (page)
1152 return 1;
1153 }
1154
1155 return 0;
1156}
1157
1158static inline bool gigantic_page_supported(void) { return true; }
1159#else
1160static inline bool gigantic_page_supported(void) { return false; }
1161static inline void free_gigantic_page(struct page *page, unsigned int order) { }
1162static inline void destroy_compound_gigantic_page(struct page *page,
1163 unsigned int order) { }
1164static inline int alloc_fresh_gigantic_page(struct hstate *h,
1165 nodemask_t *nodes_allowed) { return 0; }
1166#endif
1167
1168static void update_and_free_page(struct hstate *h, struct page *page)
1169{
1170 int i;
1171
1172 if (hstate_is_gigantic(h) && !gigantic_page_supported())
1173 return;
1174
1175 h->nr_huge_pages--;
1176 h->nr_huge_pages_node[page_to_nid(page)]--;
1177 for (i = 0; i < pages_per_huge_page(h); i++) {
1178 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
1179 1 << PG_referenced | 1 << PG_dirty |
1180 1 << PG_active | 1 << PG_private |
1181 1 << PG_writeback);
1182 }
1183 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
1184 set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
1185 set_page_refcounted(page);
1186 if (hstate_is_gigantic(h)) {
1187 destroy_compound_gigantic_page(page, huge_page_order(h));
1188 free_gigantic_page(page, huge_page_order(h));
1189 } else {
1190 __free_pages(page, huge_page_order(h));
1191 }
1192}
1193
1194struct hstate *size_to_hstate(unsigned long size)
1195{
1196 struct hstate *h;
1197
1198 for_each_hstate(h) {
1199 if (huge_page_size(h) == size)
1200 return h;
1201 }
1202 return NULL;
1203}
1204
1205
1206
1207
1208
1209
1210
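/*
 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
 * to hstate->hugepage_activelist).
 *
 * This function can be called for tail pages, but never returns true for them.
 */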
1211bool page_huge_active(struct page *page)
1212{
1213 VM_BUG_ON_PAGE(!PageHuge(page), page);
1214 return PageHead(page) && PagePrivate(&page[1]);
1215}
1216
1217
1218static void set_page_huge_active(struct page *page)
1219{
1220 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1221 SetPagePrivate(&page[1]);
1222}
1223
1224static void clear_page_huge_active(struct page *page)
1225{
1226 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
1227 ClearPagePrivate(&page[1]);
1228}
1229
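/*
 * Compound page destructor for hugetlb pages: return the page to the
 * appropriate free list (or release it back to the buddy allocator if it
 * is an unneeded surplus page), dropping the subpool and hugetlb cgroup
 * charges taken at allocation time.
 */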
1230void free_huge_page(struct page *page)
1231{
1232
1233
1234
1235
1236 struct hstate *h = page_hstate(page);
1237 int nid = page_to_nid(page);
1238 struct hugepage_subpool *spool =
1239 (struct hugepage_subpool *)page_private(page);
1240 bool restore_reserve;
1241
1242 set_page_private(page, 0);
1243 page->mapping = NULL;
1244 VM_BUG_ON_PAGE(page_count(page), page);
1245 VM_BUG_ON_PAGE(page_mapcount(page), page);
1246 restore_reserve = PagePrivate(page);
1247 ClearPagePrivate(page);
1248
1249
1250
1251
1252
1253
1254 if (hugepage_subpool_put_pages(spool, 1) == 0)
1255 restore_reserve = true;
1256
1257 spin_lock(&hugetlb_lock);
1258 clear_page_huge_active(page);
1259 hugetlb_cgroup_uncharge_page(hstate_index(h),
1260 pages_per_huge_page(h), page);
1261 if (restore_reserve)
1262 h->resv_huge_pages++;
1263
1264 if (h->surplus_huge_pages_node[nid]) {
1265
1266 list_del(&page->lru);
1267 update_and_free_page(h, page);
1268 h->surplus_huge_pages--;
1269 h->surplus_huge_pages_node[nid]--;
1270 } else {
1271 arch_clear_hugepage_flags(page);
1272 enqueue_huge_page(h, page);
1273 }
1274 spin_unlock(&hugetlb_lock);
1275}
1276
1277static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
1278{
1279 INIT_LIST_HEAD(&page->lru);
1280 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1281 spin_lock(&hugetlb_lock);
1282 set_hugetlb_cgroup(page, NULL);
1283 h->nr_huge_pages++;
1284 h->nr_huge_pages_node[nid]++;
1285 spin_unlock(&hugetlb_lock);
1286 put_page(page);
1287}
1288
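/*
 * Prepare a freshly allocated contiguous range to be used as a gigantic
 * compound page: mark the first page as the compound head of the given
 * order and turn every other page into a reference-count-zero tail page
 * pointing back at the head.
 */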
1289static void prep_compound_gigantic_page(struct page *page, unsigned int order)
1290{
1291 int i;
1292 int nr_pages = 1 << order;
1293 struct page *p = page + 1;
1294
1295
1296 set_compound_order(page, order);
1297 __ClearPageReserved(page);
1298 __SetPageHead(page);
1299 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312 __ClearPageReserved(p);
1313 set_page_count(p, 0);
1314 set_compound_head(p, page);
1315 }
1316 atomic_set(compound_mapcount_ptr(page), -1);
1317}
1318
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
int PageHuge(struct page *page)
{
	if (!PageCompound(page))
		return 0;

	page = compound_head(page);
	return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);

/*
 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
 * normal or transparent huge pages.
 */
int PageHeadHuge(struct page *page_head)
{
	if (!PageHead(page_head))
		return 0;

	return get_compound_page_dtor(page_head) == free_huge_page;
}
1345
1346pgoff_t __basepage_index(struct page *page)
1347{
1348 struct page *page_head = compound_head(page);
1349 pgoff_t index = page_index(page_head);
1350 unsigned long compound_idx;
1351
1352 if (!PageHuge(page_head))
1353 return page_index(page);
1354
1355 if (compound_order(page_head) >= MAX_ORDER)
1356 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
1357 else
1358 compound_idx = page - page_head;
1359
1360 return (index << compound_order(page_head)) + compound_idx;
1361}
1362
1363static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
1364{
1365 struct page *page;
1366
1367 page = __alloc_pages_node(nid,
1368 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
1369 __GFP_REPEAT|__GFP_NOWARN,
1370 huge_page_order(h));
1371 if (page) {
1372 prep_new_huge_page(h, page, nid);
1373 }
1374
1375 return page;
1376}
1377
1378static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
1379{
1380 struct page *page;
1381 int nr_nodes, node;
1382 int ret = 0;
1383
1384 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1385 page = alloc_fresh_huge_page_node(h, node);
1386 if (page) {
1387 ret = 1;
1388 break;
1389 }
1390 }
1391
1392 if (ret)
1393 count_vm_event(HTLB_BUDDY_PGALLOC);
1394 else
1395 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1396
1397 return ret;
1398}
1399
1400
1401
1402
1403
1404
1405
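/*
 * Free huge page from pool from next node to free.
 * Attempt to keep persistent huge pages more or less
 * balanced over allowed nodes.
 * Returns 1 on success, 0 otherwise.
 */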
1406static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
1407 bool acct_surplus)
1408{
1409 int nr_nodes, node;
1410 int ret = 0;
1411
1412 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1413
1414
1415
1416
1417 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
1418 !list_empty(&h->hugepage_freelists[node])) {
1419 struct page *page =
1420 list_entry(h->hugepage_freelists[node].next,
1421 struct page, lru);
1422 list_del(&page->lru);
1423 h->free_huge_pages--;
1424 h->free_huge_pages_node[node]--;
1425 if (acct_surplus) {
1426 h->surplus_huge_pages--;
1427 h->surplus_huge_pages_node[node]--;
1428 }
1429 update_and_free_page(h, page);
1430 ret = 1;
1431 break;
1432 }
1433 }
1434
1435 return ret;
1436}
1437
1438
1439
1440
1441
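/*
 * Dissolve a given free hugepage into free buddy pages.  This function
 * does nothing for in-use (including surplus) hugepages.
 */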
1442static void dissolve_free_huge_page(struct page *page)
1443{
1444 spin_lock(&hugetlb_lock);
1445 if (PageHuge(page) && !page_count(page)) {
1446 struct hstate *h = page_hstate(page);
1447 int nid = page_to_nid(page);
1448 list_del(&page->lru);
1449 h->free_huge_pages--;
1450 h->free_huge_pages_node[nid]--;
1451 h->max_huge_pages--;
1452 update_and_free_page(h, page);
1453 }
1454 spin_unlock(&hugetlb_lock);
1455}
1456
1457
1458
1459
1460
1461
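/*
 * Dissolve free hugepages in a given pfn range.  Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that start_pfn should be aligned with (minimum) hugepage size.
 */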
1462void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1463{
1464 unsigned long pfn;
1465
1466 if (!hugepages_supported())
1467 return;
1468
1469 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
1470 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
1471 dissolve_free_huge_page(pfn_to_page(pfn));
1472}
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
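/*
 * There are 3 ways this can get called:
 * 1. With vma+addr: we use the VMA's memory policy
 * 2. With !vma, but nid=NUMA_NO_NODE:  We try to allocate a huge
 *    page from any node, and let the buddy allocator itself figure
 *    it out.
 * 3. With !vma, but nid!=NUMA_NO_NODE.  We allocate a huge page
 *    strictly from 'nid'
 */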
1483static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
1484 struct vm_area_struct *vma, unsigned long addr, int nid)
1485{
1486 int order = huge_page_order(h);
1487 gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
1488 unsigned int cpuset_mems_cookie;
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499 if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
1500
1501
1502
1503
1504
1505 if (nid != NUMA_NO_NODE)
1506 gfp |= __GFP_THISNODE;
1507
1508
1509
1510
1511 return alloc_pages_node(nid, gfp, order);
1512 }
1513
1514
1515
1516
1517
1518
1519 do {
1520 struct page *page;
1521 struct mempolicy *mpol;
1522 struct zonelist *zl;
1523 nodemask_t *nodemask;
1524
1525 cpuset_mems_cookie = read_mems_allowed_begin();
1526 zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
1527 mpol_cond_put(mpol);
1528 page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
1529 if (page)
1530 return page;
1531 } while (read_mems_allowed_retry(cpuset_mems_cookie));
1532
1533 return NULL;
1534}
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
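/*
 * There are two ways to allocate a huge page:
 * 1. When you have a VMA and an address (like a fault)
 * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
 *
 * 'vma' and 'addr' are only for (1).  'nid' is always NUMA_NO_NODE in
 * this case which signifies that the allocation should be done with
 * respect for the VMA's memory policy.
 *
 * For (2), we ignore 'vma' and 'addr' and use 'nid' instead.  This
 * implies that memory policies will not be taken in to account.
 */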
1548static struct page *__alloc_buddy_huge_page(struct hstate *h,
1549 struct vm_area_struct *vma, unsigned long addr, int nid)
1550{
1551 struct page *page;
1552 unsigned int r_nid;
1553
1554 if (hstate_is_gigantic(h))
1555 return NULL;
1556
1557
1558
1559
1560
1561
1562 if (vma || (addr != -1)) {
1563 VM_WARN_ON_ONCE(addr == -1);
1564 VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
1565 }
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589 spin_lock(&hugetlb_lock);
1590 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
1591 spin_unlock(&hugetlb_lock);
1592 return NULL;
1593 } else {
1594 h->nr_huge_pages++;
1595 h->surplus_huge_pages++;
1596 }
1597 spin_unlock(&hugetlb_lock);
1598
1599 page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
1600
1601 spin_lock(&hugetlb_lock);
1602 if (page) {
1603 INIT_LIST_HEAD(&page->lru);
1604 r_nid = page_to_nid(page);
1605 set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
1606 set_hugetlb_cgroup(page, NULL);
1607
1608
1609
1610 h->nr_huge_pages_node[r_nid]++;
1611 h->surplus_huge_pages_node[r_nid]++;
1612 __count_vm_event(HTLB_BUDDY_PGALLOC);
1613 } else {
1614 h->nr_huge_pages--;
1615 h->surplus_huge_pages--;
1616 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1617 }
1618 spin_unlock(&hugetlb_lock);
1619
1620 return page;
1621}
1622
1623
1624
1625
1626
1627
1628static
1629struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
1630{
1631 unsigned long addr = -1;
1632
1633 return __alloc_buddy_huge_page(h, NULL, addr, nid);
1634}
1635
1636
1637
1638
1639static
1640struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
1641 struct vm_area_struct *vma, unsigned long addr)
1642{
1643 return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
1644}
1645
1646
1647
1648
1649
1650
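/*
 * Allocate a huge page from 'nid' when no VMA or mempolicy context is
 * available.  Try the free pool first and fall back to a surplus buddy
 * allocation if the pool cannot satisfy the request.
 */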
1651struct page *alloc_huge_page_node(struct hstate *h, int nid)
1652{
1653 struct page *page = NULL;
1654
1655 spin_lock(&hugetlb_lock);
1656 if (h->free_huge_pages - h->resv_huge_pages > 0)
1657 page = dequeue_huge_page_node(h, nid);
1658 spin_unlock(&hugetlb_lock);
1659
1660 if (!page)
1661 page = __alloc_buddy_huge_page_no_mpol(h, nid);
1662
1663 return page;
1664}
1665
1666
1667
1668
1669
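/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */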
1670static int gather_surplus_pages(struct hstate *h, int delta)
1671{
1672 struct list_head surplus_list;
1673 struct page *page, *tmp;
1674 int ret, i;
1675 int needed, allocated;
1676 bool alloc_ok = true;
1677
1678 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
1679 if (needed <= 0) {
1680 h->resv_huge_pages += delta;
1681 return 0;
1682 }
1683
1684 allocated = 0;
1685 INIT_LIST_HEAD(&surplus_list);
1686
1687 ret = -ENOMEM;
1688retry:
1689 spin_unlock(&hugetlb_lock);
1690 for (i = 0; i < needed; i++) {
1691 page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
1692 if (!page) {
1693 alloc_ok = false;
1694 break;
1695 }
1696 list_add(&page->lru, &surplus_list);
1697 }
1698 allocated += i;
1699
1700
1701
1702
1703
1704 spin_lock(&hugetlb_lock);
1705 needed = (h->resv_huge_pages + delta) -
1706 (h->free_huge_pages + allocated);
1707 if (needed > 0) {
1708 if (alloc_ok)
1709 goto retry;
1710
1711
1712
1713
1714
1715 goto free;
1716 }
1717
1718
1719
1720
1721
1722
1723
1724
1725 needed += allocated;
1726 h->resv_huge_pages += delta;
1727 ret = 0;
1728
1729
1730 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1731 if ((--needed) < 0)
1732 break;
1733
1734
1735
1736
1737 put_page_testzero(page);
1738 VM_BUG_ON_PAGE(page_count(page), page);
1739 enqueue_huge_page(h, page);
1740 }
1741free:
1742 spin_unlock(&hugetlb_lock);
1743
1744
1745 list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1746 put_page(page);
1747 spin_lock(&hugetlb_lock);
1748
1749 return ret;
1750}
1751
1752
1753
1754
1755
1756
1757
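/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 * Called with hugetlb_lock held.
 */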
1758static void return_unused_surplus_pages(struct hstate *h,
1759 unsigned long unused_resv_pages)
1760{
1761 unsigned long nr_pages;
1762
1763
1764 h->resv_huge_pages -= unused_resv_pages;
1765
1766
1767 if (hstate_is_gigantic(h))
1768 return;
1769
1770 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780 while (nr_pages--) {
1781 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
1782 break;
1783 cond_resched_lock(&hugetlb_lock);
1784 }
1785}
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
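/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
 *
 * vma_needs_reservation is called to determine if the huge page at addr
 * within the vma has an associated reservation.  If a reservation is
 * needed, the value 1 is returned.  The caller is then responsible for
 * managing the global reservation and subpool usage counts.  After
 * the huge page has been allocated, vma_commit_reservation is called
 * to add the page to the reservation map.  If the page allocation fails,
 * the reservation must be ended instead of committed.  vma_end_reservation
 * is called in such cases.
 *
 * In the normal case, vma_commit_reservation returns the same value
 * as the preceding vma_needs_reservation call.  The only time this
 * is not the case is if a reserve map was changed between calls.  It
 * is the responsibility of the caller to notice the difference and
 * take appropriate action.
 */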
1807enum vma_resv_mode {
1808 VMA_NEEDS_RESV,
1809 VMA_COMMIT_RESV,
1810 VMA_END_RESV,
1811};
1812static long __vma_reservation_common(struct hstate *h,
1813 struct vm_area_struct *vma, unsigned long addr,
1814 enum vma_resv_mode mode)
1815{
1816 struct resv_map *resv;
1817 pgoff_t idx;
1818 long ret;
1819
1820 resv = vma_resv_map(vma);
1821 if (!resv)
1822 return 1;
1823
1824 idx = vma_hugecache_offset(h, vma, addr);
1825 switch (mode) {
1826 case VMA_NEEDS_RESV:
1827 ret = region_chg(resv, idx, idx + 1);
1828 break;
1829 case VMA_COMMIT_RESV:
1830 ret = region_add(resv, idx, idx + 1);
1831 break;
1832 case VMA_END_RESV:
1833 region_abort(resv, idx, idx + 1);
1834 ret = 0;
1835 break;
1836 default:
1837 BUG();
1838 }
1839
1840 if (vma->vm_flags & VM_MAYSHARE)
1841 return ret;
1842 else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856 if (ret)
1857 return 0;
1858 else
1859 return 1;
1860 }
1861 else
1862 return ret < 0 ? ret : 0;
1863}
1864
1865static long vma_needs_reservation(struct hstate *h,
1866 struct vm_area_struct *vma, unsigned long addr)
1867{
1868 return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
1869}
1870
1871static long vma_commit_reservation(struct hstate *h,
1872 struct vm_area_struct *vma, unsigned long addr)
1873{
1874 return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
1875}
1876
1877static void vma_end_reservation(struct hstate *h,
1878 struct vm_area_struct *vma, unsigned long addr)
1879{
1880 (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
1881}
1882
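/*
 * Allocate a huge page for the given VMA and address, consuming a
 * reservation where one exists and charging the subpool and hugetlb
 * cgroup.  Returns ERR_PTR(-ENOMEM) if the reserve map cannot be
 * updated and ERR_PTR(-ENOSPC) if no page can be provided.
 */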
1883struct page *alloc_huge_page(struct vm_area_struct *vma,
1884 unsigned long addr, int avoid_reserve)
1885{
1886 struct hugepage_subpool *spool = subpool_vma(vma);
1887 struct hstate *h = hstate_vma(vma);
1888 struct page *page;
1889 long map_chg, map_commit;
1890 long gbl_chg;
1891 int ret, idx;
1892 struct hugetlb_cgroup *h_cg;
1893
1894 idx = hstate_index(h);
1895
1896
1897
1898
1899
1900 map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
1901 if (map_chg < 0)
1902 return ERR_PTR(-ENOMEM);
1903
1904
1905
1906
1907
1908
1909
1910
1911 if (map_chg || avoid_reserve) {
1912 gbl_chg = hugepage_subpool_get_pages(spool, 1);
1913 if (gbl_chg < 0) {
1914 vma_end_reservation(h, vma, addr);
1915 return ERR_PTR(-ENOSPC);
1916 }
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926 if (avoid_reserve)
1927 gbl_chg = 1;
1928 }
1929
1930 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1931 if (ret)
1932 goto out_subpool_put;
1933
1934 spin_lock(&hugetlb_lock);
1935
1936
1937
1938
1939
1940 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
1941 if (!page) {
1942 spin_unlock(&hugetlb_lock);
1943 page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
1944 if (!page)
1945 goto out_uncharge_cgroup;
1946 if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
1947 SetPagePrivate(page);
1948 h->resv_huge_pages--;
1949 }
1950 spin_lock(&hugetlb_lock);
1951 list_move(&page->lru, &h->hugepage_activelist);
1952
1953 }
1954 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
1955 spin_unlock(&hugetlb_lock);
1956
1957 set_page_private(page, (unsigned long)spool);
1958
1959 map_commit = vma_commit_reservation(h, vma, addr);
1960 if (unlikely(map_chg > map_commit)) {
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970 long rsv_adjust;
1971
1972 rsv_adjust = hugepage_subpool_put_pages(spool, 1);
1973 hugetlb_acct_memory(h, -rsv_adjust);
1974 }
1975 return page;
1976
1977out_uncharge_cgroup:
1978 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
1979out_subpool_put:
1980 if (map_chg || avoid_reserve)
1981 hugepage_subpool_put_pages(spool, 1);
1982 vma_end_reservation(h, vma, addr);
1983 return ERR_PTR(-ENOSPC);
1984}
1985
1986
1987
1988
1989
1990
1991struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
1992 unsigned long addr, int avoid_reserve)
1993{
1994 struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
1995 if (IS_ERR(page))
1996 page = NULL;
1997 return page;
1998}
1999
2000int __weak alloc_bootmem_huge_page(struct hstate *h)
2001{
2002 struct huge_bootmem_page *m;
2003 int nr_nodes, node;
2004
2005 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
2006 void *addr;
2007
2008 addr = memblock_virt_alloc_try_nid_nopanic(
2009 huge_page_size(h), huge_page_size(h),
2010 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
2011 if (addr) {
2012
2013
2014
2015
2016
2017 m = addr;
2018 goto found;
2019 }
2020 }
2021 return 0;
2022
2023found:
2024 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
2025
2026 list_add(&m->list, &huge_boot_pages);
2027 m->hstate = h;
2028 return 1;
2029}
2030
2031static void __init prep_compound_huge_page(struct page *page,
2032 unsigned int order)
2033{
2034 if (unlikely(order > (MAX_ORDER - 1)))
2035 prep_compound_gigantic_page(page, order);
2036 else
2037 prep_compound_page(page, order);
2038}
2039
2040
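/* Put bootmem huge pages into the standard lists after mem_map is up */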
2041static void __init gather_bootmem_prealloc(void)
2042{
2043 struct huge_bootmem_page *m;
2044
2045 list_for_each_entry(m, &huge_boot_pages, list) {
2046 struct hstate *h = m->hstate;
2047 struct page *page;
2048
2049#ifdef CONFIG_HIGHMEM
2050 page = pfn_to_page(m->phys >> PAGE_SHIFT);
2051 memblock_free_late(__pa(m),
2052 sizeof(struct huge_bootmem_page));
2053#else
2054 page = virt_to_page(m);
2055#endif
2056 WARN_ON(page_count(page) != 1);
2057 prep_compound_huge_page(page, h->order);
2058 WARN_ON(PageReserved(page));
2059 prep_new_huge_page(h, page, page_to_nid(page));
2060
2061
2062
2063
2064
2065
2066 if (hstate_is_gigantic(h))
2067 adjust_managed_page_count(page, 1 << h->order);
2068 }
2069}
2070
2071static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
2072{
2073 unsigned long i;
2074
2075 for (i = 0; i < h->max_huge_pages; ++i) {
2076 if (hstate_is_gigantic(h)) {
2077 if (!alloc_bootmem_huge_page(h))
2078 break;
2079 } else if (!alloc_fresh_huge_page(h,
2080 &node_states[N_MEMORY]))
2081 break;
2082 }
2083 h->max_huge_pages = i;
2084}
2085
2086static void __init hugetlb_init_hstates(void)
2087{
2088 struct hstate *h;
2089
2090 for_each_hstate(h) {
2091 if (minimum_order > huge_page_order(h))
2092 minimum_order = huge_page_order(h);
2093
2094
2095 if (!hstate_is_gigantic(h))
2096 hugetlb_hstate_alloc_pages(h);
2097 }
2098 VM_BUG_ON(minimum_order == UINT_MAX);
2099}
2100
2101static char * __init memfmt(char *buf, unsigned long n)
2102{
2103 if (n >= (1UL << 30))
2104 sprintf(buf, "%lu GB", n >> 30);
2105 else if (n >= (1UL << 20))
2106 sprintf(buf, "%lu MB", n >> 20);
2107 else
2108 sprintf(buf, "%lu KB", n >> 10);
2109 return buf;
2110}
2111
2112static void __init report_hugepages(void)
2113{
2114 struct hstate *h;
2115
2116 for_each_hstate(h) {
2117 char buf[32];
2118 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
2119 memfmt(buf, huge_page_size(h)),
2120 h->free_huge_pages);
2121 }
2122}
2123
2124#ifdef CONFIG_HIGHMEM
2125static void try_to_free_low(struct hstate *h, unsigned long count,
2126 nodemask_t *nodes_allowed)
2127{
2128 int i;
2129
2130 if (hstate_is_gigantic(h))
2131 return;
2132
2133 for_each_node_mask(i, *nodes_allowed) {
2134 struct page *page, *next;
2135 struct list_head *freel = &h->hugepage_freelists[i];
2136 list_for_each_entry_safe(page, next, freel, lru) {
2137 if (count >= h->nr_huge_pages)
2138 return;
2139 if (PageHighMem(page))
2140 continue;
2141 list_del(&page->lru);
2142 update_and_free_page(h, page);
2143 h->free_huge_pages--;
2144 h->free_huge_pages_node[page_to_nid(page)]--;
2145 }
2146 }
2147}
2148#else
2149static inline void try_to_free_low(struct hstate *h, unsigned long count,
2150 nodemask_t *nodes_allowed)
2151{
2152}
2153#endif
2154
2155
2156
2157
2158
2159
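/*
 * Increment or decrement surplus_huge_pages.  Keep node surpluses
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */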
2160static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
2161 int delta)
2162{
2163 int nr_nodes, node;
2164
2165 VM_BUG_ON(delta != -1 && delta != 1);
2166
2167 if (delta < 0) {
2168 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
2169 if (h->surplus_huge_pages_node[node])
2170 goto found;
2171 }
2172 } else {
2173 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
2174 if (h->surplus_huge_pages_node[node] <
2175 h->nr_huge_pages_node[node])
2176 goto found;
2177 }
2178 }
2179 return 0;
2180
2181found:
2182 h->surplus_huge_pages += delta;
2183 h->surplus_huge_pages_node[node] += delta;
2184 return 1;
2185}
2186
2187#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
2188static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
2189 nodemask_t *nodes_allowed)
2190{
2191 unsigned long min_count, ret;
2192
2193 if (hstate_is_gigantic(h) && !gigantic_page_supported())
2194 return h->max_huge_pages;
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207 spin_lock(&hugetlb_lock);
2208 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
2209 if (!adjust_pool_surplus(h, nodes_allowed, -1))
2210 break;
2211 }
2212
2213 while (count > persistent_huge_pages(h)) {
2214
2215
2216
2217
2218
2219 spin_unlock(&hugetlb_lock);
2220
2221
2222 cond_resched();
2223
2224 if (hstate_is_gigantic(h))
2225 ret = alloc_fresh_gigantic_page(h, nodes_allowed);
2226 else
2227 ret = alloc_fresh_huge_page(h, nodes_allowed);
2228 spin_lock(&hugetlb_lock);
2229 if (!ret)
2230 goto out;
2231
2232
2233 if (signal_pending(current))
2234 goto out;
2235 }
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
2253 min_count = max(count, min_count);
2254 try_to_free_low(h, min_count, nodes_allowed);
2255 while (min_count < persistent_huge_pages(h)) {
2256 if (!free_pool_huge_page(h, nodes_allowed, 0))
2257 break;
2258 cond_resched_lock(&hugetlb_lock);
2259 }
2260 while (count < persistent_huge_pages(h)) {
2261 if (!adjust_pool_surplus(h, nodes_allowed, 1))
2262 break;
2263 }
2264out:
2265 ret = persistent_huge_pages(h);
2266 spin_unlock(&hugetlb_lock);
2267 return ret;
2268}
2269
2270#define HSTATE_ATTR_RO(_name) \
2271 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2272
2273#define HSTATE_ATTR(_name) \
2274 static struct kobj_attribute _name##_attr = \
2275 __ATTR(_name, 0644, _name##_show, _name##_store)
2276
2277static struct kobject *hugepages_kobj;
2278static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
2279
2280static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
2281
2282static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
2283{
2284 int i;
2285
2286 for (i = 0; i < HUGE_MAX_HSTATE; i++)
2287 if (hstate_kobjs[i] == kobj) {
2288 if (nidp)
2289 *nidp = NUMA_NO_NODE;
2290 return &hstates[i];
2291 }
2292
2293 return kobj_to_node_hstate(kobj, nidp);
2294}
2295
2296static ssize_t nr_hugepages_show_common(struct kobject *kobj,
2297 struct kobj_attribute *attr, char *buf)
2298{
2299 struct hstate *h;
2300 unsigned long nr_huge_pages;
2301 int nid;
2302
2303 h = kobj_to_hstate(kobj, &nid);
2304 if (nid == NUMA_NO_NODE)
2305 nr_huge_pages = h->nr_huge_pages;
2306 else
2307 nr_huge_pages = h->nr_huge_pages_node[nid];
2308
2309 return sprintf(buf, "%lu\n", nr_huge_pages);
2310}
2311
2312static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
2313 struct hstate *h, int nid,
2314 unsigned long count, size_t len)
2315{
2316 int err;
2317 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
2318
2319 if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
2320 err = -EINVAL;
2321 goto out;
2322 }
2323
2324 if (nid == NUMA_NO_NODE) {
2325
2326
2327
2328 if (!(obey_mempolicy &&
2329 init_nodemask_of_mempolicy(nodes_allowed))) {
2330 NODEMASK_FREE(nodes_allowed);
2331 nodes_allowed = &node_states[N_MEMORY];
2332 }
2333 } else if (nodes_allowed) {
2334
2335
2336
2337
2338 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
2339 init_nodemask_of_node(nodes_allowed, nid);
2340 } else
2341 nodes_allowed = &node_states[N_MEMORY];
2342
2343 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
2344
2345 if (nodes_allowed != &node_states[N_MEMORY])
2346 NODEMASK_FREE(nodes_allowed);
2347
2348 return len;
2349out:
2350 NODEMASK_FREE(nodes_allowed);
2351 return err;
2352}
2353
2354static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
2355 struct kobject *kobj, const char *buf,
2356 size_t len)
2357{
2358 struct hstate *h;
2359 unsigned long count;
2360 int nid;
2361 int err;
2362
2363 err = kstrtoul(buf, 10, &count);
2364 if (err)
2365 return err;
2366
2367 h = kobj_to_hstate(kobj, &nid);
2368 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
2369}
2370
2371static ssize_t nr_hugepages_show(struct kobject *kobj,
2372 struct kobj_attribute *attr, char *buf)
2373{
2374 return nr_hugepages_show_common(kobj, attr, buf);
2375}
2376
2377static ssize_t nr_hugepages_store(struct kobject *kobj,
2378 struct kobj_attribute *attr, const char *buf, size_t len)
2379{
2380 return nr_hugepages_store_common(false, kobj, buf, len);
2381}
2382HSTATE_ATTR(nr_hugepages);
2383
2384#ifdef CONFIG_NUMA
2385
2386
2387
2388
2389
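/*
 * hstate attribute for optionally mempolicy-based constraint on persistent
 * huge page alloc/free.
 */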
2390static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
2391 struct kobj_attribute *attr, char *buf)
2392{
2393 return nr_hugepages_show_common(kobj, attr, buf);
2394}
2395
2396static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
2397 struct kobj_attribute *attr, const char *buf, size_t len)
2398{
2399 return nr_hugepages_store_common(true, kobj, buf, len);
2400}
2401HSTATE_ATTR(nr_hugepages_mempolicy);
2402#endif
2403
2404
2405static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
2406 struct kobj_attribute *attr, char *buf)
2407{
2408 struct hstate *h = kobj_to_hstate(kobj, NULL);
2409 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
2410}
2411
2412static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
2413 struct kobj_attribute *attr, const char *buf, size_t count)
2414{
2415 int err;
2416 unsigned long input;
2417 struct hstate *h = kobj_to_hstate(kobj, NULL);
2418
2419 if (hstate_is_gigantic(h))
2420 return -EINVAL;
2421
2422 err = kstrtoul(buf, 10, &input);
2423 if (err)
2424 return err;
2425
2426 spin_lock(&hugetlb_lock);
2427 h->nr_overcommit_huge_pages = input;
2428 spin_unlock(&hugetlb_lock);
2429
2430 return count;
2431}
2432HSTATE_ATTR(nr_overcommit_hugepages);
2433
2434static ssize_t free_hugepages_show(struct kobject *kobj,
2435 struct kobj_attribute *attr, char *buf)
2436{
2437 struct hstate *h;
2438 unsigned long free_huge_pages;
2439 int nid;
2440
2441 h = kobj_to_hstate(kobj, &nid);
2442 if (nid == NUMA_NO_NODE)
2443 free_huge_pages = h->free_huge_pages;
2444 else
2445 free_huge_pages = h->free_huge_pages_node[nid];
2446
2447 return sprintf(buf, "%lu\n", free_huge_pages);
2448}
2449HSTATE_ATTR_RO(free_hugepages);
2450
2451static ssize_t resv_hugepages_show(struct kobject *kobj,
2452 struct kobj_attribute *attr, char *buf)
2453{
2454 struct hstate *h = kobj_to_hstate(kobj, NULL);
2455 return sprintf(buf, "%lu\n", h->resv_huge_pages);
2456}
2457HSTATE_ATTR_RO(resv_hugepages);
2458
2459static ssize_t surplus_hugepages_show(struct kobject *kobj,
2460 struct kobj_attribute *attr, char *buf)
2461{
2462 struct hstate *h;
2463 unsigned long surplus_huge_pages;
2464 int nid;
2465
2466 h = kobj_to_hstate(kobj, &nid);
2467 if (nid == NUMA_NO_NODE)
2468 surplus_huge_pages = h->surplus_huge_pages;
2469 else
2470 surplus_huge_pages = h->surplus_huge_pages_node[nid];
2471
2472 return sprintf(buf, "%lu\n", surplus_huge_pages);
2473}
2474HSTATE_ATTR_RO(surplus_hugepages);
2475
2476static struct attribute *hstate_attrs[] = {
2477 &nr_hugepages_attr.attr,
2478 &nr_overcommit_hugepages_attr.attr,
2479 &free_hugepages_attr.attr,
2480 &resv_hugepages_attr.attr,
2481 &surplus_hugepages_attr.attr,
2482#ifdef CONFIG_NUMA
2483 &nr_hugepages_mempolicy_attr.attr,
2484#endif
2485 NULL,
2486};
2487
2488static struct attribute_group hstate_attr_group = {
2489 .attrs = hstate_attrs,
2490};
2491
2492static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
2493 struct kobject **hstate_kobjs,
2494 struct attribute_group *hstate_attr_group)
2495{
2496 int retval;
2497 int hi = hstate_index(h);
2498
2499 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
2500 if (!hstate_kobjs[hi])
2501 return -ENOMEM;
2502
2503 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
2504 if (retval)
2505 kobject_put(hstate_kobjs[hi]);
2506
2507 return retval;
2508}
2509
2510static void __init hugetlb_sysfs_init(void)
2511{
2512 struct hstate *h;
2513 int err;
2514
2515 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
2516 if (!hugepages_kobj)
2517 return;
2518
2519 for_each_hstate(h) {
2520 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
2521 hstate_kobjs, &hstate_attr_group);
2522 if (err)
2523 pr_err("Hugetlb: Unable to add hstate %s", h->name);
2524 }
2525}
2526
2527#ifdef CONFIG_NUMA
2528
2529
2530
2531
2532
2533
2534
2535
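/*
 * node_hstate/s - associate per node hstate attributes, via their kobjects,
 * with node devices in node_devices[] using a parallel array.  The array
 * index of a node device or _hstate == node id.
 */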
2536struct node_hstate {
2537 struct kobject *hugepages_kobj;
2538 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
2539};
2540static struct node_hstate node_hstates[MAX_NUMNODES];
2541
2542
2543
2544
2545static struct attribute *per_node_hstate_attrs[] = {
2546 &nr_hugepages_attr.attr,
2547 &free_hugepages_attr.attr,
2548 &surplus_hugepages_attr.attr,
2549 NULL,
2550};
2551
2552static struct attribute_group per_node_hstate_attr_group = {
2553 .attrs = per_node_hstate_attrs,
2554};
2555
2556
2557
2558
2559
2560static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
2561{
2562 int nid;
2563
2564 for (nid = 0; nid < nr_node_ids; nid++) {
2565 struct node_hstate *nhs = &node_hstates[nid];
2566 int i;
2567 for (i = 0; i < HUGE_MAX_HSTATE; i++)
2568 if (nhs->hstate_kobjs[i] == kobj) {
2569 if (nidp)
2570 *nidp = nid;
2571 return &hstates[i];
2572 }
2573 }
2574
2575 BUG();
2576 return NULL;
2577}
2578
2579
2580
2581
2582
2583static void hugetlb_unregister_node(struct node *node)
2584{
2585 struct hstate *h;
2586 struct node_hstate *nhs = &node_hstates[node->dev.id];
2587
2588 if (!nhs->hugepages_kobj)
2589 return;
2590
2591 for_each_hstate(h) {
2592 int idx = hstate_index(h);
2593 if (nhs->hstate_kobjs[idx]) {
2594 kobject_put(nhs->hstate_kobjs[idx]);
2595 nhs->hstate_kobjs[idx] = NULL;
2596 }
2597 }
2598
2599 kobject_put(nhs->hugepages_kobj);
2600 nhs->hugepages_kobj = NULL;
2601}
2602
2603
2604
2605
2606
2607
2608static void hugetlb_register_node(struct node *node)
2609{
2610 struct hstate *h;
2611 struct node_hstate *nhs = &node_hstates[node->dev.id];
2612 int err;
2613
2614 if (nhs->hugepages_kobj)
2615 return;
2616
2617 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
2618 &node->dev.kobj);
2619 if (!nhs->hugepages_kobj)
2620 return;
2621
2622 for_each_hstate(h) {
2623 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
2624 nhs->hstate_kobjs,
2625 &per_node_hstate_attr_group);
2626 if (err) {
2627 pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
2628 h->name, node->dev.id);
2629 hugetlb_unregister_node(node);
2630 break;
2631 }
2632 }
2633}
2634
2635
2636
2637
2638
2639
2640static void __init hugetlb_register_all_nodes(void)
2641{
2642 int nid;
2643
2644 for_each_node_state(nid, N_MEMORY) {
2645 struct node *node = node_devices[nid];
2646 if (node->dev.id == nid)
2647 hugetlb_register_node(node);
2648 }
2649
2650
2651
2652
2653
2654 register_hugetlbfs_with_node(hugetlb_register_node,
2655 hugetlb_unregister_node);
2656}
2657#else
2658
2659static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
2660{
2661 BUG();
2662 if (nidp)
2663 *nidp = -1;
2664 return NULL;
2665}
2666
2667static void hugetlb_register_all_nodes(void) { }
2668
2669#endif
2670
2671static int __init hugetlb_init(void)
2672{
2673 int i;
2674
2675 if (!hugepages_supported())
2676 return 0;
2677
2678 if (!size_to_hstate(default_hstate_size)) {
2679 default_hstate_size = HPAGE_SIZE;
2680 if (!size_to_hstate(default_hstate_size))
2681 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
2682 }
2683 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
2684 if (default_hstate_max_huge_pages) {
2685 if (!default_hstate.max_huge_pages)
2686 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
2687 }
2688
2689 hugetlb_init_hstates();
2690 gather_bootmem_prealloc();
2691 report_hugepages();
2692
2693 hugetlb_sysfs_init();
2694 hugetlb_register_all_nodes();
2695 hugetlb_cgroup_file_init();
2696
2697#ifdef CONFIG_SMP
2698 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
2699#else
2700 num_fault_mutexes = 1;
2701#endif
2702 hugetlb_fault_mutex_table =
2703 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
2704 BUG_ON(!hugetlb_fault_mutex_table);
2705
2706 for (i = 0; i < num_fault_mutexes; i++)
2707 mutex_init(&hugetlb_fault_mutex_table[i]);
2708 return 0;
2709}
2710subsys_initcall(hugetlb_init);
2711
2712
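/* Should be called on processing a hugepagesz=... option */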
2713void __init hugetlb_bad_size(void)
2714{
2715 parsed_valid_hugepagesz = false;
2716}
2717
2718void __init hugetlb_add_hstate(unsigned int order)
2719{
2720 struct hstate *h;
2721 unsigned long i;
2722
2723 if (size_to_hstate(PAGE_SIZE << order)) {
2724 pr_warn("hugepagesz= specified twice, ignoring\n");
2725 return;
2726 }
2727 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
2728 BUG_ON(order == 0);
2729 h = &hstates[hugetlb_max_hstate++];
2730 h->order = order;
2731 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
2732 h->nr_huge_pages = 0;
2733 h->free_huge_pages = 0;
2734 for (i = 0; i < MAX_NUMNODES; ++i)
2735 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
2736 INIT_LIST_HEAD(&h->hugepage_activelist);
2737 h->next_nid_to_alloc = first_memory_node;
2738 h->next_nid_to_free = first_memory_node;
2739 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
2740 huge_page_size(h)/1024);
2741
2742 parsed_hstate = h;
2743}
2744
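/*
 * hugepages= command line processing: the count applies to the most recently
 * parsed hugepagesz=, or to the default huge page size if no hugepagesz= has
 * been seen yet.  Gigantic pages are allocated immediately from bootmem.
 */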
2745static int __init hugetlb_nrpages_setup(char *s)
2746{
2747 unsigned long *mhp;
2748 static unsigned long *last_mhp;
2749
2750 if (!parsed_valid_hugepagesz) {
		pr_warn("hugepages = %s preceded by an unsupported hugepagesz, ignoring\n", s);
2753 parsed_valid_hugepagesz = true;
2754 return 1;
2755 }
	/*
	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
	 * yet, so this hugepages= parameter goes to the "default hstate".
	 */
2760 else if (!hugetlb_max_hstate)
2761 mhp = &default_hstate_max_huge_pages;
2762 else
2763 mhp = &parsed_hstate->max_huge_pages;
2764
2765 if (mhp == last_mhp) {
2766 pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
2767 return 1;
2768 }
2769
2770 if (sscanf(s, "%lu", mhp) <= 0)
2771 *mhp = 0;
2772
	/*
	 * Global state is always initialized later in hugetlb_init, but
	 * gigantic (>= MAX_ORDER) pages must be allocated here, early enough
	 * to still use the bootmem allocator.
	 */
2778 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
2779 hugetlb_hstate_alloc_pages(parsed_hstate);
2780
2781 last_mhp = mhp;
2782
2783 return 1;
2784}
2785__setup("hugepages=", hugetlb_nrpages_setup);
2786
2787static int __init hugetlb_default_setup(char *s)
2788{
2789 default_hstate_size = memparse(s, &s);
2790 return 1;
2791}
2792__setup("default_hugepagesz=", hugetlb_default_setup);
2793
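/* Sum a per-node hstate counter over the nodes allowed by the current cpuset */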
2794static unsigned int cpuset_mems_nr(unsigned int *array)
2795{
2796 int node;
2797 unsigned int nr = 0;
2798
2799 for_each_node_mask(node, cpuset_current_mems_allowed)
2800 nr += array[node];
2801
2802 return nr;
2803}
2804
2805#ifdef CONFIG_SYSCTL
2806static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2807 struct ctl_table *table, int write,
2808 void __user *buffer, size_t *length, loff_t *ppos)
2809{
2810 struct hstate *h = &default_hstate;
2811 unsigned long tmp = h->max_huge_pages;
2812 int ret;
2813
2814 if (!hugepages_supported())
2815 return -EOPNOTSUPP;
2816
2817 table->data = &tmp;
2818 table->maxlen = sizeof(unsigned long);
2819 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2820 if (ret)
2821 goto out;
2822
2823 if (write)
2824 ret = __nr_hugepages_store_common(obey_mempolicy, h,
2825 NUMA_NO_NODE, tmp, *length);
2826out:
2827 return ret;
2828}
2829
2830int hugetlb_sysctl_handler(struct ctl_table *table, int write,
2831 void __user *buffer, size_t *length, loff_t *ppos)
2832{
2833
2834 return hugetlb_sysctl_handler_common(false, table, write,
2835 buffer, length, ppos);
2836}
2837
2838#ifdef CONFIG_NUMA
2839int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
2840 void __user *buffer, size_t *length, loff_t *ppos)
2841{
2842 return hugetlb_sysctl_handler_common(true, table, write,
2843 buffer, length, ppos);
2844}
2845#endif
2846
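/*
 * Handler for the nr_overcommit_hugepages sysctl: read or update the number
 * of surplus huge pages the default hstate may allocate on demand.  Gigantic
 * page sizes cannot be overcommitted.
 */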
2847int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2848 void __user *buffer,
2849 size_t *length, loff_t *ppos)
2850{
2851 struct hstate *h = &default_hstate;
2852 unsigned long tmp;
2853 int ret;
2854
2855 if (!hugepages_supported())
2856 return -EOPNOTSUPP;
2857
2858 tmp = h->nr_overcommit_huge_pages;
2859
2860 if (write && hstate_is_gigantic(h))
2861 return -EINVAL;
2862
2863 table->data = &tmp;
2864 table->maxlen = sizeof(unsigned long);
2865 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
2866 if (ret)
2867 goto out;
2868
2869 if (write) {
2870 spin_lock(&hugetlb_lock);
2871 h->nr_overcommit_huge_pages = tmp;
2872 spin_unlock(&hugetlb_lock);
2873 }
2874out:
2875 return ret;
2876}
2877
2878#endif
2879
2880void hugetlb_report_meminfo(struct seq_file *m)
2881{
2882 struct hstate *h = &default_hstate;
2883 if (!hugepages_supported())
2884 return;
2885 seq_printf(m,
2886 "HugePages_Total: %5lu\n"
2887 "HugePages_Free: %5lu\n"
2888 "HugePages_Rsvd: %5lu\n"
2889 "HugePages_Surp: %5lu\n"
2890 "Hugepagesize: %8lu kB\n",
2891 h->nr_huge_pages,
2892 h->free_huge_pages,
2893 h->resv_huge_pages,
2894 h->surplus_huge_pages,
2895 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2896}
2897
2898int hugetlb_report_node_meminfo(int nid, char *buf)
2899{
2900 struct hstate *h = &default_hstate;
2901 if (!hugepages_supported())
2902 return 0;
2903 return sprintf(buf,
2904 "Node %d HugePages_Total: %5u\n"
2905 "Node %d HugePages_Free: %5u\n"
2906 "Node %d HugePages_Surp: %5u\n",
2907 nid, h->nr_huge_pages_node[nid],
2908 nid, h->free_huge_pages_node[nid],
2909 nid, h->surplus_huge_pages_node[nid]);
2910}
2911
2912void hugetlb_show_meminfo(void)
2913{
2914 struct hstate *h;
2915 int nid;
2916
2917 if (!hugepages_supported())
2918 return;
2919
2920 for_each_node_state(nid, N_MEMORY)
2921 for_each_hstate(h)
2922 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
2923 nid,
2924 h->nr_huge_pages_node[nid],
2925 h->free_huge_pages_node[nid],
2926 h->surplus_huge_pages_node[nid],
2927 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
2928}
2929
2930void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
2931{
2932 seq_printf(m, "HugetlbPages:\t%8lu kB\n",
2933 atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
2934}
2935
2936
2937unsigned long hugetlb_total_pages(void)
2938{
2939 struct hstate *h;
2940 unsigned long nr_total_pages = 0;
2941
2942 for_each_hstate(h)
2943 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
2944 return nr_total_pages;
2945}
2946
2947static int hugetlb_acct_memory(struct hstate *h, long delta)
2948{
2949 int ret = -ENOMEM;
2950
2951 spin_lock(&hugetlb_lock);
	/*
	 * When cpusets are in use they break strict hugetlb reservation:
	 * accounting is done against a global counter and is never checked
	 * against page availability in the faulting task's cpuset, so a task
	 * can still be killed for lack of a free huge page in its cpuset.
	 * Enforcing strict accounting per cpuset is impractical because tasks
	 * and memory nodes can move between cpusets at any time.
	 *
	 * To preserve some of the semantics, fall back below to a best-effort
	 * check against the pages currently free in the allowed nodes.
	 */
2969 if (delta > 0) {
2970 if (gather_surplus_pages(h, delta) < 0)
2971 goto out;
2972
2973 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2974 return_unused_surplus_pages(h, delta);
2975 goto out;
2976 }
2977 }
2978
2979 ret = 0;
2980 if (delta < 0)
2981 return_unused_surplus_pages(h, (unsigned long) -delta);
2982
2983out:
2984 spin_unlock(&hugetlb_lock);
2985 return ret;
2986}
2987
2988static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2989{
2990 struct resv_map *resv = vma_resv_map(vma);
2991
	/*
	 * This new VMA should share its siblings reservation map if present.
	 * The VMA will only ever have a valid reservation map pointer where
	 * it is being copied for another still existing VMA.  As that VMA
	 * has a reference to the reservation map it cannot disappear until
	 * after this open call completes.  It is therefore safe to take a
	 * new reference here without additional locking.
	 */
3000 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3001 kref_get(&resv->refs);
3002}
3003
3004static void hugetlb_vm_op_close(struct vm_area_struct *vma)
3005{
3006 struct hstate *h = hstate_vma(vma);
3007 struct resv_map *resv = vma_resv_map(vma);
3008 struct hugepage_subpool *spool = subpool_vma(vma);
3009 unsigned long reserve, start, end;
3010 long gbl_reserve;
3011
3012 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
3013 return;
3014
3015 start = vma_hugecache_offset(h, vma, vma->vm_start);
3016 end = vma_hugecache_offset(h, vma, vma->vm_end);
3017
3018 reserve = (end - start) - region_count(resv, start, end);
3019
3020 kref_put(&resv->refs, resv_map_release);
3021
3022 if (reserve) {
		/*
		 * Decrement reserve counts.  The global reserve count may be
		 * adjusted if the subpool has a minimum size.
		 */
3027 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
3028 hugetlb_acct_memory(h, -gbl_reserve);
3029 }
3030}
3031
/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
3038static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3039{
3040 BUG();
3041 return 0;
3042}
3043
3044const struct vm_operations_struct hugetlb_vm_ops = {
3045 .fault = hugetlb_vm_op_fault,
3046 .open = hugetlb_vm_op_open,
3047 .close = hugetlb_vm_op_close,
3048};
3049
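/*
 * Build a huge page table entry for @page with the protection of @vma,
 * marked writable (and dirty) only when @writable is set.
 */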
3050static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
3051 int writable)
3052{
3053 pte_t entry;
3054
3055 if (writable) {
3056 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
3057 vma->vm_page_prot)));
3058 } else {
3059 entry = huge_pte_wrprotect(mk_huge_pte(page,
3060 vma->vm_page_prot));
3061 }
3062 entry = pte_mkyoung(entry);
3063 entry = pte_mkhuge(entry);
3064 entry = arch_make_huge_pte(entry, vma, page, writable);
3065
3066 return entry;
3067}
3068
3069static void set_huge_ptep_writable(struct vm_area_struct *vma,
3070 unsigned long address, pte_t *ptep)
3071{
3072 pte_t entry;
3073
3074 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
3075 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
3076 update_mmu_cache(vma, address, ptep);
3077}
3078
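/*
 * Helpers to recognise non-present huge PTEs that encode a migration or
 * hardware-poison swap entry rather than a normal mapping.
 */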
3079static int is_hugetlb_entry_migration(pte_t pte)
3080{
3081 swp_entry_t swp;
3082
3083 if (huge_pte_none(pte) || pte_present(pte))
3084 return 0;
3085 swp = pte_to_swp_entry(pte);
3086 if (non_swap_entry(swp) && is_migration_entry(swp))
3087 return 1;
3088 else
3089 return 0;
3090}
3091
3092static int is_hugetlb_entry_hwpoisoned(pte_t pte)
3093{
3094 swp_entry_t swp;
3095
3096 if (huge_pte_none(pte) || pte_present(pte))
3097 return 0;
3098 swp = pte_to_swp_entry(pte);
3099 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
3100 return 1;
3101 else
3102 return 0;
3103}
3104
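/*
 * Copy the huge page table entries of @vma from @src to @dst at fork time.
 * For private (COW) mappings both copies are write-protected; shared page
 * tables and migration/hwpoison entries are handled specially.
 */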
3105int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
3106 struct vm_area_struct *vma)
3107{
3108 pte_t *src_pte, *dst_pte, entry;
3109 struct page *ptepage;
3110 unsigned long addr;
3111 int cow;
3112 struct hstate *h = hstate_vma(vma);
3113 unsigned long sz = huge_page_size(h);
3114 unsigned long mmun_start;
3115 unsigned long mmun_end;
3116 int ret = 0;
3117
3118 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
3119
3120 mmun_start = vma->vm_start;
3121 mmun_end = vma->vm_end;
3122 if (cow)
3123 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
3124
3125 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
3126 spinlock_t *src_ptl, *dst_ptl;
3127 src_pte = huge_pte_offset(src, addr);
3128 if (!src_pte)
3129 continue;
3130 dst_pte = huge_pte_alloc(dst, addr, sz);
3131 if (!dst_pte) {
3132 ret = -ENOMEM;
3133 break;
3134 }
3135
3136
3137 if (dst_pte == src_pte)
3138 continue;
3139
3140 dst_ptl = huge_pte_lock(h, dst, dst_pte);
3141 src_ptl = huge_pte_lockptr(h, src, src_pte);
3142 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
3143 entry = huge_ptep_get(src_pte);
3144 if (huge_pte_none(entry)) {
3145 ;
3146 } else if (unlikely(is_hugetlb_entry_migration(entry) ||
3147 is_hugetlb_entry_hwpoisoned(entry))) {
3148 swp_entry_t swp_entry = pte_to_swp_entry(entry);
3149
3150 if (is_write_migration_entry(swp_entry) && cow) {
3151
3152
3153
3154
3155 make_migration_entry_read(&swp_entry);
3156 entry = swp_entry_to_pte(swp_entry);
3157 set_huge_pte_at(src, addr, src_pte, entry);
3158 }
3159 set_huge_pte_at(dst, addr, dst_pte, entry);
3160 } else {
3161 if (cow) {
3162 huge_ptep_set_wrprotect(src, addr, src_pte);
3163 mmu_notifier_invalidate_range(src, mmun_start,
3164 mmun_end);
3165 }
3166 entry = huge_ptep_get(src_pte);
3167 ptepage = pte_page(entry);
3168 get_page(ptepage);
3169 page_dup_rmap(ptepage, true);
3170 set_huge_pte_at(dst, addr, dst_pte, entry);
3171 hugetlb_count_add(pages_per_huge_page(h), dst);
3172 }
3173 spin_unlock(src_ptl);
3174 spin_unlock(dst_ptl);
3175 }
3176
3177 if (cow)
3178 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
3179
3180 return ret;
3181}
3182
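/*
 * Unmap the huge pages in [start, end) of @vma, collecting them in the
 * mmu_gather for later freeing.  If @ref_page is given, only that page is
 * unmapped and the VMA is flagged so that future faults on it fail instead
 * of silently reading zeroed pages.
 */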
3183void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3184 unsigned long start, unsigned long end,
3185 struct page *ref_page)
3186{
3187 struct mm_struct *mm = vma->vm_mm;
3188 unsigned long address;
3189 pte_t *ptep;
3190 pte_t pte;
3191 spinlock_t *ptl;
3192 struct page *page;
3193 struct hstate *h = hstate_vma(vma);
3194 unsigned long sz = huge_page_size(h);
3195 const unsigned long mmun_start = start;
3196 const unsigned long mmun_end = end;
3197
3198 WARN_ON(!is_vm_hugetlb_page(vma));
3199 BUG_ON(start & ~huge_page_mask(h));
3200 BUG_ON(end & ~huge_page_mask(h));
3201
3202 tlb_start_vma(tlb, vma);
3203 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3204 address = start;
3205 for (; address < end; address += sz) {
3206 ptep = huge_pte_offset(mm, address);
3207 if (!ptep)
3208 continue;
3209
3210 ptl = huge_pte_lock(h, mm, ptep);
3211 if (huge_pmd_unshare(mm, &address, ptep)) {
3212 spin_unlock(ptl);
3213 continue;
3214 }
3215
3216 pte = huge_ptep_get(ptep);
3217 if (huge_pte_none(pte)) {
3218 spin_unlock(ptl);
3219 continue;
3220 }
3221
3222
3223
3224
3225
3226 if (unlikely(!pte_present(pte))) {
3227 huge_pte_clear(mm, address, ptep);
3228 spin_unlock(ptl);
3229 continue;
3230 }
3231
3232 page = pte_page(pte);
3233
3234
3235
3236
3237
3238 if (ref_page) {
3239 if (page != ref_page) {
3240 spin_unlock(ptl);
3241 continue;
3242 }
3243
3244
3245
3246
3247
3248 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
3249 }
3250
3251 pte = huge_ptep_get_and_clear(mm, address, ptep);
3252 tlb_remove_tlb_entry(tlb, ptep, address);
3253 if (huge_pte_dirty(pte))
3254 set_page_dirty(page);
3255
3256 hugetlb_count_sub(pages_per_huge_page(h), mm);
3257 page_remove_rmap(page, true);
3258
3259 spin_unlock(ptl);
3260 tlb_remove_page_size(tlb, page, huge_page_size(h));
3261
3262
3263
3264 if (ref_page)
3265 break;
3266 }
3267 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
3268 tlb_end_vma(tlb, vma);
3269}
3270
3271void __unmap_hugepage_range_final(struct mmu_gather *tlb,
3272 struct vm_area_struct *vma, unsigned long start,
3273 unsigned long end, struct page *ref_page)
3274{
3275 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287 vma->vm_flags &= ~VM_MAYSHARE;
3288}
3289
3290void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
3291 unsigned long end, struct page *ref_page)
3292{
3293 struct mm_struct *mm;
3294 struct mmu_gather tlb;
3295
3296 mm = vma->vm_mm;
3297
3298 tlb_gather_mmu(&tlb, mm, start, end);
3299 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
3300 tlb_finish_mmu(&tlb, start, end);
3301}
3302
/*
 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 * mapping it owns the reserve page for. The intention is to unmap the page
 * from other VMAs and let the children be SIGKILLed if they are faulting the
 * same region.
 */
3309static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
3310 struct page *page, unsigned long address)
3311{
3312 struct hstate *h = hstate_vma(vma);
3313 struct vm_area_struct *iter_vma;
3314 struct address_space *mapping;
3315 pgoff_t pgoff;
3316
3317
3318
3319
3320
3321 address = address & huge_page_mask(h);
3322 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
3323 vma->vm_pgoff;
3324 mapping = vma->vm_file->f_mapping;
3325
3326
3327
3328
3329
3330
3331 i_mmap_lock_write(mapping);
3332 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
3333
3334 if (iter_vma == vma)
3335 continue;
3336
3337
3338
3339
3340
3341
3342 if (iter_vma->vm_flags & VM_MAYSHARE)
3343 continue;
3344
3345
3346
3347
3348
3349
3350
3351
3352 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
3353 unmap_hugepage_range(iter_vma, address,
3354 address + huge_page_size(h), page);
3355 }
3356 i_mmap_unlock_write(mapping);
3357}
3358
/*
 * hugetlb_cow() should be called with the page lock of the original hugepage
 * held.  The fault path holds the per-page fault mutex and the page table
 * lock, so we cannot race with other fault handlers or page migration; the
 * pte_same() checks after dropping the lock catch the remaining races.
 */
3365static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
3366 unsigned long address, pte_t *ptep, pte_t pte,
3367 struct page *pagecache_page, spinlock_t *ptl)
3368{
3369 struct hstate *h = hstate_vma(vma);
3370 struct page *old_page, *new_page;
3371 int ret = 0, outside_reserve = 0;
3372 unsigned long mmun_start;
3373 unsigned long mmun_end;
3374
3375 old_page = pte_page(pte);
3376
3377retry_avoidcopy:
3378
3379
3380 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
3381 page_move_anon_rmap(old_page, vma);
3382 set_huge_ptep_writable(vma, address, ptep);
3383 return 0;
3384 }
3385
	/*
	 * If the process that created a MAP_PRIVATE mapping is about to
	 * perform a COW due to a shared page count, attempt to satisfy
	 * the allocation without using the existing reserves. The pagecache
	 * page is used to determine if the reserve at this address was
	 * consumed or not. If reserves were used, a partial faulted mapping
	 * at the time of fork() could consume its reserves on COW instead
	 * of the full address range.
	 */
3395 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
3396 old_page != pagecache_page)
3397 outside_reserve = 1;
3398
3399 get_page(old_page);
3400
	/*
	 * Drop the page table lock: the buddy allocator may be called while
	 * allocating the new page.  It is re-taken before returning, as the
	 * caller expects.
	 */
3405 spin_unlock(ptl);
3406 new_page = alloc_huge_page(vma, address, outside_reserve);
3407
3408 if (IS_ERR(new_page)) {
		/*
		 * If a process owning a MAP_PRIVATE mapping fails to COW,
		 * it is due to references held by a child and an insufficient
		 * huge page pool. To guarantee the original mappers
		 * reliability, unmap the page from child processes. The child
		 * may get SIGKILLed if it later faults.
		 */
3416 if (outside_reserve) {
3417 put_page(old_page);
3418 BUG_ON(huge_pte_none(pte));
3419 unmap_ref_private(mm, vma, old_page, address);
3420 BUG_ON(huge_pte_none(pte));
3421 spin_lock(ptl);
3422 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
3423 if (likely(ptep &&
3424 pte_same(huge_ptep_get(ptep), pte)))
3425 goto retry_avoidcopy;
3426
3427
3428
3429
3430 return 0;
3431 }
3432
3433 ret = (PTR_ERR(new_page) == -ENOMEM) ?
3434 VM_FAULT_OOM : VM_FAULT_SIGBUS;
3435 goto out_release_old;
3436 }
3437
3438
3439
3440
3441
3442 if (unlikely(anon_vma_prepare(vma))) {
3443 ret = VM_FAULT_OOM;
3444 goto out_release_all;
3445 }
3446
3447 copy_user_huge_page(new_page, old_page, address, vma,
3448 pages_per_huge_page(h));
3449 __SetPageUptodate(new_page);
3450 set_page_huge_active(new_page);
3451
3452 mmun_start = address & huge_page_mask(h);
3453 mmun_end = mmun_start + huge_page_size(h);
3454 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3455
3456
3457
3458
3459
3460 spin_lock(ptl);
3461 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
3462 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
3463 ClearPagePrivate(new_page);
3464
3465
3466 huge_ptep_clear_flush(vma, address, ptep);
3467 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
3468 set_huge_pte_at(mm, address, ptep,
3469 make_huge_pte(vma, new_page, 1));
3470 page_remove_rmap(old_page, true);
3471 hugepage_add_new_anon_rmap(new_page, vma, address);
3472
3473 new_page = old_page;
3474 }
3475 spin_unlock(ptl);
3476 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
3477out_release_all:
3478 put_page(new_page);
3479out_release_old:
3480 put_page(old_page);
3481
3482 spin_lock(ptl);
3483 return ret;
3484}
3485
3486
3487static struct page *hugetlbfs_pagecache_page(struct hstate *h,
3488 struct vm_area_struct *vma, unsigned long address)
3489{
3490 struct address_space *mapping;
3491 pgoff_t idx;
3492
3493 mapping = vma->vm_file->f_mapping;
3494 idx = vma_hugecache_offset(h, vma, address);
3495
3496 return find_lock_page(mapping, idx);
3497}
3498
3499
3500
3501
3502
3503static bool hugetlbfs_pagecache_present(struct hstate *h,
3504 struct vm_area_struct *vma, unsigned long address)
3505{
3506 struct address_space *mapping;
3507 pgoff_t idx;
3508 struct page *page;
3509
3510 mapping = vma->vm_file->f_mapping;
3511 idx = vma_hugecache_offset(h, vma, address);
3512
3513 page = find_get_page(mapping, idx);
3514 if (page)
3515 put_page(page);
3516 return page != NULL;
3517}
3518
3519int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
3520 pgoff_t idx)
3521{
3522 struct inode *inode = mapping->host;
3523 struct hstate *h = hstate_inode(inode);
3524 int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
3525
3526 if (err)
3527 return err;
3528 ClearPagePrivate(page);
3529
3530 spin_lock(&inode->i_lock);
3531 inode->i_blocks += blocks_per_huge_page(h);
3532 spin_unlock(&inode->i_lock);
3533 return 0;
3534}
3535
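/*
 * Handle a fault at an address with no huge PTE present: look the page up
 * in (or add it to) the page cache for shared mappings, or allocate a fresh
 * anonymous page otherwise, then install the PTE and do an immediate COW
 * for private write faults.
 */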
3536static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
3537 struct address_space *mapping, pgoff_t idx,
3538 unsigned long address, pte_t *ptep, unsigned int flags)
3539{
3540 struct hstate *h = hstate_vma(vma);
3541 int ret = VM_FAULT_SIGBUS;
3542 int anon_rmap = 0;
3543 unsigned long size;
3544 struct page *page;
3545 pte_t new_pte;
3546 spinlock_t *ptl;
3547
	/*
	 * Currently, we are forced to kill the process in the event the
	 * original mapper has unmapped pages from the child due to a failed
	 * COW.  Warn that such a situation has occurred as it may not be
	 * obvious.
	 */
3553 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
3554 pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
3555 current->pid);
3556 return ret;
3557 }
3558
	/*
	 * Use the page lock to guard against racing truncation
	 * before we get the page table lock.
	 */
3563retry:
3564 page = find_lock_page(mapping, idx);
3565 if (!page) {
3566 size = i_size_read(mapping->host) >> huge_page_shift(h);
3567 if (idx >= size)
3568 goto out;
3569 page = alloc_huge_page(vma, address, 0);
3570 if (IS_ERR(page)) {
3571 ret = PTR_ERR(page);
3572 if (ret == -ENOMEM)
3573 ret = VM_FAULT_OOM;
3574 else
3575 ret = VM_FAULT_SIGBUS;
3576 goto out;
3577 }
3578 clear_huge_page(page, address, pages_per_huge_page(h));
3579 __SetPageUptodate(page);
3580 set_page_huge_active(page);
3581
3582 if (vma->vm_flags & VM_MAYSHARE) {
3583 int err = huge_add_to_page_cache(page, mapping, idx);
3584 if (err) {
3585 put_page(page);
3586 if (err == -EEXIST)
3587 goto retry;
3588 goto out;
3589 }
3590 } else {
3591 lock_page(page);
3592 if (unlikely(anon_vma_prepare(vma))) {
3593 ret = VM_FAULT_OOM;
3594 goto backout_unlocked;
3595 }
3596 anon_rmap = 1;
3597 }
3598 } else {
		/*
		 * If a memory error occurs between mmap() and fault, some
		 * process may not have a hwpoisoned swap entry for the
		 * errored virtual address, so block the fault here with a
		 * PG_hwpoison check.
		 */
3604 if (unlikely(PageHWPoison(page))) {
3605 ret = VM_FAULT_HWPOISON |
3606 VM_FAULT_SET_HINDEX(hstate_index(h));
3607 goto backout_unlocked;
3608 }
3609 }
3610
	/*
	 * If we are going to COW a private mapping later, we examine the
	 * pending reservations for this page now. This will ensure that
	 * any allocations necessary to record that reservation occur outside
	 * the spinlock.
	 */
3617 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3618 if (vma_needs_reservation(h, vma, address) < 0) {
3619 ret = VM_FAULT_OOM;
3620 goto backout_unlocked;
3621 }
3622
3623 vma_end_reservation(h, vma, address);
3624 }
3625
3626 ptl = huge_pte_lockptr(h, mm, ptep);
3627 spin_lock(ptl);
3628 size = i_size_read(mapping->host) >> huge_page_shift(h);
3629 if (idx >= size)
3630 goto backout;
3631
3632 ret = 0;
3633 if (!huge_pte_none(huge_ptep_get(ptep)))
3634 goto backout;
3635
3636 if (anon_rmap) {
3637 ClearPagePrivate(page);
3638 hugepage_add_new_anon_rmap(page, vma, address);
3639 } else
3640 page_dup_rmap(page, true);
3641 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
3642 && (vma->vm_flags & VM_SHARED)));
3643 set_huge_pte_at(mm, address, ptep, new_pte);
3644
3645 hugetlb_count_add(pages_per_huge_page(h), mm);
3646 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3647
3648 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
3649 }
3650
3651 spin_unlock(ptl);
3652 unlock_page(page);
3653out:
3654 return ret;
3655
3656backout:
3657 spin_unlock(ptl);
3658backout_unlocked:
3659 unlock_page(page);
3660 put_page(page);
3661 goto out;
3662}
3663
3664#ifdef CONFIG_SMP
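/*
 * Hash the faulting mapping/index (shared) or mm/address (private) onto one
 * of the fault mutexes, so concurrent faults on the same huge page are
 * serialised while faults on different pages proceed in parallel.
 */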
3665u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3666 struct vm_area_struct *vma,
3667 struct address_space *mapping,
3668 pgoff_t idx, unsigned long address)
3669{
3670 unsigned long key[2];
3671 u32 hash;
3672
3673 if (vma->vm_flags & VM_SHARED) {
3674 key[0] = (unsigned long) mapping;
3675 key[1] = idx;
3676 } else {
3677 key[0] = (unsigned long) mm;
3678 key[1] = address >> huge_page_shift(h);
3679 }
3680
3681 hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
3682
3683 return hash & (num_fault_mutexes - 1);
3684}
3685#else
3686
/*
 * For uniprocessor systems we always use a single mutex, so just
 * return 0 and avoid the hashing overhead.
 */
3690u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3691 struct vm_area_struct *vma,
3692 struct address_space *mapping,
3693 pgoff_t idx, unsigned long address)
3694{
3695 return 0;
3696}
3697#endif
3698
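/*
 * Top-level hugetlb fault handler, called from handle_mm_fault().  It waits
 * for in-flight migration, hands missing entries to hugetlb_no_page() and
 * write faults on read-only entries to hugetlb_cow(), all under the per-page
 * fault mutex.
 */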
3699int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3700 unsigned long address, unsigned int flags)
3701{
3702 pte_t *ptep, entry;
3703 spinlock_t *ptl;
3704 int ret;
3705 u32 hash;
3706 pgoff_t idx;
3707 struct page *page = NULL;
3708 struct page *pagecache_page = NULL;
3709 struct hstate *h = hstate_vma(vma);
3710 struct address_space *mapping;
3711 int need_wait_lock = 0;
3712
3713 address &= huge_page_mask(h);
3714
3715 ptep = huge_pte_offset(mm, address);
3716 if (ptep) {
3717 entry = huge_ptep_get(ptep);
3718 if (unlikely(is_hugetlb_entry_migration(entry))) {
3719 migration_entry_wait_huge(vma, mm, ptep);
3720 return 0;
3721 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
3722 return VM_FAULT_HWPOISON_LARGE |
3723 VM_FAULT_SET_HINDEX(hstate_index(h));
3724 } else {
3725 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
3726 if (!ptep)
3727 return VM_FAULT_OOM;
3728 }
3729
3730 mapping = vma->vm_file->f_mapping;
3731 idx = vma_hugecache_offset(h, vma, address);
3732
	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
3738 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
3739 mutex_lock(&hugetlb_fault_mutex_table[hash]);
3740
3741 entry = huge_ptep_get(ptep);
3742 if (huge_pte_none(entry)) {
3743 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
3744 goto out_mutex;
3745 }
3746
3747 ret = 0;
3748
	/*
	 * entry could be a migration/hwpoison entry at this point, so this
	 * check prevents the kernel from going below assuming that we have
	 * an active hugepage in pagecache.  This goto expects the second
	 * page fault, and is_hugetlb_entry_(migration|hwpoisoned) will then
	 * properly handle it.
	 */
3756 if (!pte_present(entry))
3757 goto out_mutex;
3758
	/*
	 * If we are going to COW the mapping later, we examine the pending
	 * reservations for this page now. This will ensure that any
	 * allocations necessary to record that reservation occur outside the
	 * spinlock. For private mappings, we also lookup the pagecache
	 * page now as it is used to determine if a reservation has been
	 * consumed.
	 */
3767 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
3768 if (vma_needs_reservation(h, vma, address) < 0) {
3769 ret = VM_FAULT_OOM;
3770 goto out_mutex;
3771 }
3772
3773 vma_end_reservation(h, vma, address);
3774
3775 if (!(vma->vm_flags & VM_MAYSHARE))
3776 pagecache_page = hugetlbfs_pagecache_page(h,
3777 vma, address);
3778 }
3779
3780 ptl = huge_pte_lock(h, mm, ptep);
3781
3782
3783 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3784 goto out_ptl;
3785
	/*
	 * hugetlb_cow() requires the page locks of pte_page(entry) and
	 * pagecache_page, so here we need to take the former one when
	 * page != pagecache_page or there is no pagecache_page.
	 */
3791 page = pte_page(entry);
3792 if (page != pagecache_page)
3793 if (!trylock_page(page)) {
3794 need_wait_lock = 1;
3795 goto out_ptl;
3796 }
3797
3798 get_page(page);
3799
3800 if (flags & FAULT_FLAG_WRITE) {
3801 if (!huge_pte_write(entry)) {
3802 ret = hugetlb_cow(mm, vma, address, ptep, entry,
3803 pagecache_page, ptl);
3804 goto out_put_page;
3805 }
3806 entry = huge_pte_mkdirty(entry);
3807 }
3808 entry = pte_mkyoung(entry);
3809 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
3810 flags & FAULT_FLAG_WRITE))
3811 update_mmu_cache(vma, address, ptep);
3812out_put_page:
3813 if (page != pagecache_page)
3814 unlock_page(page);
3815 put_page(page);
3816out_ptl:
3817 spin_unlock(ptl);
3818
3819 if (pagecache_page) {
3820 unlock_page(pagecache_page);
3821 put_page(pagecache_page);
3822 }
3823out_mutex:
3824 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3825
	/*
	 * Wait (without holding a reference) just to defer the next fault and
	 * avoid a busy loop; the page is not touched after the wait.  This
	 * mirrors hugetlb_no_page(), which does not hold a refcount on the
	 * page when it unlocks it.
	 */
3832 if (need_wait_lock)
3833 wait_on_page_locked(page);
3834 return ret;
3835}
3836
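/*
 * get_user_pages() support for hugetlb VMAs: walk the huge PTEs starting at
 * *position, faulting pages in as needed, and fill in up to *nr_pages
 * pages/vmas.  Returns the updated page count, or -EFAULT if no page could
 * be handled.
 */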
3837long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3838 struct page **pages, struct vm_area_struct **vmas,
3839 unsigned long *position, unsigned long *nr_pages,
3840 long i, unsigned int flags)
3841{
3842 unsigned long pfn_offset;
3843 unsigned long vaddr = *position;
3844 unsigned long remainder = *nr_pages;
3845 struct hstate *h = hstate_vma(vma);
3846
3847 while (vaddr < vma->vm_end && remainder) {
3848 pte_t *pte;
3849 spinlock_t *ptl = NULL;
3850 int absent;
3851 struct page *page;
3852
3853
3854
3855
3856
3857 if (unlikely(fatal_signal_pending(current))) {
3858 remainder = 0;
3859 break;
3860 }
3861
3862
3863
3864
3865
3866
3867
3868
3869 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
3870 if (pte)
3871 ptl = huge_pte_lock(h, mm, pte);
3872 absent = !pte || huge_pte_none(huge_ptep_get(pte));
3873
3874
3875
3876
3877
3878
3879
3880
3881 if (absent && (flags & FOLL_DUMP) &&
3882 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
3883 if (pte)
3884 spin_unlock(ptl);
3885 remainder = 0;
3886 break;
3887 }
		/*
		 * hugetlb_fault() must be called both for hugepages under
		 * migration (it waits for the migration to finish) and for
		 * hwpoisoned hugepages (so the caller is prevented from
		 * touching them).  is_swap_pte() covers both cases, and we
		 * cannot simply follow the page because "page" here may be a
		 * hugetlb tail page.
		 */
3899 if (absent || is_swap_pte(huge_ptep_get(pte)) ||
3900 ((flags & FOLL_WRITE) &&
3901 !huge_pte_write(huge_ptep_get(pte)))) {
3902 int ret;
3903
3904 if (pte)
3905 spin_unlock(ptl);
3906 ret = hugetlb_fault(mm, vma, vaddr,
3907 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
3908 if (!(ret & VM_FAULT_ERROR))
3909 continue;
3910
3911 remainder = 0;
3912 break;
3913 }
3914
3915 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
3916 page = pte_page(huge_ptep_get(pte));
3917same_page:
3918 if (pages) {
3919 pages[i] = mem_map_offset(page, pfn_offset);
3920 get_page(pages[i]);
3921 }
3922
3923 if (vmas)
3924 vmas[i] = vma;
3925
3926 vaddr += PAGE_SIZE;
3927 ++pfn_offset;
3928 --remainder;
3929 ++i;
3930 if (vaddr < vma->vm_end && remainder &&
3931 pfn_offset < pages_per_huge_page(h)) {
3932
3933
3934
3935
3936 goto same_page;
3937 }
3938 spin_unlock(ptl);
3939 }
3940 *nr_pages = remainder;
3941 *position = vaddr;
3942
3943 return i ? i : -EFAULT;
3944}
3945
3946#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
3947
3948
3949
3950
3951#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
3952#endif
3953
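/*
 * mprotect() support for hugetlb VMAs: rewrite every huge PTE in the range
 * with @newprot, downgrading writable migration entries as well, and return
 * the number of base pages whose protection changed.
 */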
3954unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3955 unsigned long address, unsigned long end, pgprot_t newprot)
3956{
3957 struct mm_struct *mm = vma->vm_mm;
3958 unsigned long start = address;
3959 pte_t *ptep;
3960 pte_t pte;
3961 struct hstate *h = hstate_vma(vma);
3962 unsigned long pages = 0;
3963
3964 BUG_ON(address >= end);
3965 flush_cache_range(vma, address, end);
3966
3967 mmu_notifier_invalidate_range_start(mm, start, end);
3968 i_mmap_lock_write(vma->vm_file->f_mapping);
3969 for (; address < end; address += huge_page_size(h)) {
3970 spinlock_t *ptl;
3971 ptep = huge_pte_offset(mm, address);
3972 if (!ptep)
3973 continue;
3974 ptl = huge_pte_lock(h, mm, ptep);
3975 if (huge_pmd_unshare(mm, &address, ptep)) {
3976 pages++;
3977 spin_unlock(ptl);
3978 continue;
3979 }
3980 pte = huge_ptep_get(ptep);
3981 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
3982 spin_unlock(ptl);
3983 continue;
3984 }
3985 if (unlikely(is_hugetlb_entry_migration(pte))) {
3986 swp_entry_t entry = pte_to_swp_entry(pte);
3987
3988 if (is_write_migration_entry(entry)) {
3989 pte_t newpte;
3990
3991 make_migration_entry_read(&entry);
3992 newpte = swp_entry_to_pte(entry);
3993 set_huge_pte_at(mm, address, ptep, newpte);
3994 pages++;
3995 }
3996 spin_unlock(ptl);
3997 continue;
3998 }
3999 if (!huge_pte_none(pte)) {
4000 pte = huge_ptep_get_and_clear(mm, address, ptep);
4001 pte = pte_mkhuge(huge_pte_modify(pte, newprot));
4002 pte = arch_make_huge_pte(pte, vma, NULL, 0);
4003 set_huge_pte_at(mm, address, ptep, pte);
4004 pages++;
4005 }
4006 spin_unlock(ptl);
4007 }
	/*
	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
	 * may have cleared our pud entry and done put_page on the page table:
	 * once we release i_mmap_rwsem, another task can do the final put_page
	 * and that page table be reused and filled with junk.
	 */
4014 flush_hugetlb_tlb_range(vma, start, end);
4015 mmu_notifier_invalidate_range(mm, start, end);
4016 i_mmap_unlock_write(vma->vm_file->f_mapping);
4017 mmu_notifier_invalidate_range_end(mm, start, end);
4018
4019 return pages << h->order;
4020}
4021
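/*
 * Reserve huge pages for a mapping of [from, to) at mmap()/hugetlbfs setup
 * time.  Shared mappings charge only regions not already present in the
 * inode's reserve map; private mappings get their own map and reserve the
 * whole range.  The charge is taken from the subpool first and then from
 * the global pool.
 */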
4022int hugetlb_reserve_pages(struct inode *inode,
4023 long from, long to,
4024 struct vm_area_struct *vma,
4025 vm_flags_t vm_flags)
4026{
4027 long ret, chg;
4028 struct hstate *h = hstate_inode(inode);
4029 struct hugepage_subpool *spool = subpool_inode(inode);
4030 struct resv_map *resv_map;
4031 long gbl_reserve;
4032
	/*
	 * Only apply hugepage reservation if asked. At fault time, an
	 * attempt will be made for VM_NORESERVE to allocate a page
	 * without using reserves.
	 */
4038 if (vm_flags & VM_NORESERVE)
4039 return 0;
4040
	/*
	 * Shared mappings base their reservation on the number of pages that
	 * are already allocated on behalf of the file. Private mappings need
	 * to reserve the full area even if read-only as mprotect() may be
	 * called to make the mapping read-write. Assume !vma is a shm mapping.
	 */
4047 if (!vma || vma->vm_flags & VM_MAYSHARE) {
4048 resv_map = inode_resv_map(inode);
4049
4050 chg = region_chg(resv_map, from, to);
4051
4052 } else {
4053 resv_map = resv_map_alloc();
4054 if (!resv_map)
4055 return -ENOMEM;
4056
4057 chg = to - from;
4058
4059 set_vma_resv_map(vma, resv_map);
4060 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
4061 }
4062
4063 if (chg < 0) {
4064 ret = chg;
4065 goto out_err;
4066 }
4067
4068
4069
4070
4071
4072
4073 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
4074 if (gbl_reserve < 0) {
4075 ret = -ENOSPC;
4076 goto out_err;
4077 }
4078
4079
4080
4081
4082
4083 ret = hugetlb_acct_memory(h, gbl_reserve);
4084 if (ret < 0) {
4085
4086 (void)hugepage_subpool_put_pages(spool, chg);
4087 goto out_err;
4088 }
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101 if (!vma || vma->vm_flags & VM_MAYSHARE) {
4102 long add = region_add(resv_map, from, to);
4103
4104 if (unlikely(chg > add)) {
			/*
			 * Pages in this range were added to the reserve
			 * map between region_chg and region_add.  This
			 * indicates a race with alloc_huge_page.  Adjust
			 * for the difference.
			 */
4112 long rsv_adjust;
4113
4114 rsv_adjust = hugepage_subpool_put_pages(spool,
4115 chg - add);
4116 hugetlb_acct_memory(h, -rsv_adjust);
4117 }
4118 }
4119 return 0;
4120out_err:
4121 if (!vma || vma->vm_flags & VM_MAYSHARE)
4122 region_abort(resv_map, from, to);
4123 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
4124 kref_put(&resv_map->refs, resv_map_release);
4125 return ret;
4126}
4127
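/*
 * Called on hugetlbfs truncate/hole punch: remove reserve map entries for
 * [start, end), adjust the inode block count by the @freed pages, and give
 * the remaining reservation back to the subpool and the global pool.
 */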
4128long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
4129 long freed)
4130{
4131 struct hstate *h = hstate_inode(inode);
4132 struct resv_map *resv_map = inode_resv_map(inode);
4133 long chg = 0;
4134 struct hugepage_subpool *spool = subpool_inode(inode);
4135 long gbl_reserve;
4136
4137 if (resv_map) {
4138 chg = region_del(resv_map, start, end);
4139
4140
4141
4142
4143
4144 if (chg < 0)
4145 return chg;
4146 }
4147
4148 spin_lock(&inode->i_lock);
4149 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
4150 spin_unlock(&inode->i_lock);
4151
4152
4153
4154
4155
4156 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
4157 hugetlb_acct_memory(h, -gbl_reserve);
4158
4159 return 0;
4160}
4161
4162#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
4163static unsigned long page_table_shareable(struct vm_area_struct *svma,
4164 struct vm_area_struct *vma,
4165 unsigned long addr, pgoff_t idx)
4166{
4167 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
4168 svma->vm_start;
4169 unsigned long sbase = saddr & PUD_MASK;
4170 unsigned long s_end = sbase + PUD_SIZE;
4171
4172
4173 unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
4174 unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
4175
4176
4177
4178
4179
4180 if (pmd_index(addr) != pmd_index(saddr) ||
4181 vm_flags != svm_flags ||
4182 sbase < svma->vm_start || svma->vm_end < s_end)
4183 return 0;
4184
4185 return saddr;
4186}
4187
4188static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
4189{
4190 unsigned long base = addr & PUD_MASK;
4191 unsigned long end = base + PUD_SIZE;
4192
4193
4194
4195
4196 if (vma->vm_flags & VM_MAYSHARE &&
4197 vma->vm_start <= base && end <= vma->vm_end)
4198 return true;
4199 return false;
4200}
4201
/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * pmd that is not shared.
 */
4211pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4212{
4213 struct vm_area_struct *vma = find_vma(mm, addr);
4214 struct address_space *mapping = vma->vm_file->f_mapping;
4215 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
4216 vma->vm_pgoff;
4217 struct vm_area_struct *svma;
4218 unsigned long saddr;
4219 pte_t *spte = NULL;
4220 pte_t *pte;
4221 spinlock_t *ptl;
4222
4223 if (!vma_shareable(vma, addr))
4224 return (pte_t *)pmd_alloc(mm, pud, addr);
4225
4226 i_mmap_lock_write(mapping);
4227 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
4228 if (svma == vma)
4229 continue;
4230
4231 saddr = page_table_shareable(svma, vma, addr, idx);
4232 if (saddr) {
4233 spte = huge_pte_offset(svma->vm_mm, saddr);
4234 if (spte) {
4235 get_page(virt_to_page(spte));
4236 break;
4237 }
4238 }
4239 }
4240
4241 if (!spte)
4242 goto out;
4243
4244 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
4245 spin_lock(ptl);
4246 if (pud_none(*pud)) {
4247 pud_populate(mm, pud,
4248 (pmd_t *)((unsigned long)spte & PAGE_MASK));
4249 mm_inc_nr_pmds(mm);
4250 } else {
4251 put_page(virt_to_page(spte));
4252 }
4253 spin_unlock(ptl);
4254out:
4255 pte = (pte_t *)pmd_alloc(mm, pud, addr);
4256 i_mmap_unlock_write(mapping);
4257 return pte;
4258}
4259
/*
 * Unmap a huge page backed by a shared pte.
 *
 * The hugetlb pte page is refcounted at the time of mapping.  If the pte is
 * shared (page_count > 1), unmapping is achieved by clearing the pud and
 * dropping that reference.  If the count is 1, the pte page is not shared.
 *
 * Called with the page table lock held.
 *
 * Returns 1 if it successfully unmapped a shared pte page, 0 if the
 * underlying pte page is not shared or this was the last user.
 */
4272int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
4273{
4274 pgd_t *pgd = pgd_offset(mm, *addr);
4275 pud_t *pud = pud_offset(pgd, *addr);
4276
4277 BUG_ON(page_count(virt_to_page(ptep)) == 0);
4278 if (page_count(virt_to_page(ptep)) == 1)
4279 return 0;
4280
4281 pud_clear(pud);
4282 put_page(virt_to_page(ptep));
4283 mm_dec_nr_pmds(mm);
4284 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
4285 return 1;
4286}
4287#define want_pmd_share() (1)
4288#else
4289pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4290{
4291 return NULL;
4292}
4293
4294int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
4295{
4296 return 0;
4297}
4298#define want_pmd_share() (0)
4299#endif
4300
4301#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
4302pte_t *huge_pte_alloc(struct mm_struct *mm,
4303 unsigned long addr, unsigned long sz)
4304{
4305 pgd_t *pgd;
4306 pud_t *pud;
4307 pte_t *pte = NULL;
4308
4309 pgd = pgd_offset(mm, addr);
4310 pud = pud_alloc(mm, pgd, addr);
4311 if (pud) {
4312 if (sz == PUD_SIZE) {
4313 pte = (pte_t *)pud;
4314 } else {
4315 BUG_ON(sz != PMD_SIZE);
4316 if (want_pmd_share() && pud_none(*pud))
4317 pte = huge_pmd_share(mm, addr, pud);
4318 else
4319 pte = (pte_t *)pmd_alloc(mm, pud, addr);
4320 }
4321 }
4322 BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
4323
4324 return pte;
4325}
4326
4327pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
4328{
4329 pgd_t *pgd;
4330 pud_t *pud;
4331 pmd_t *pmd = NULL;
4332
4333 pgd = pgd_offset(mm, addr);
4334 if (pgd_present(*pgd)) {
4335 pud = pud_offset(pgd, addr);
4336 if (pud_present(*pud)) {
4337 if (pud_huge(*pud))
4338 return (pte_t *)pud;
4339 pmd = pmd_offset(pud, addr);
4340 }
4341 }
4342 return (pte_t *) pmd;
4343}
4344
4345#endif
4346
4347
4348
4349
4350
4351struct page * __weak
4352follow_huge_addr(struct mm_struct *mm, unsigned long address,
4353 int write)
4354{
4355 return ERR_PTR(-EINVAL);
4356}
4357
4358struct page * __weak
4359follow_huge_pmd(struct mm_struct *mm, unsigned long address,
4360 pmd_t *pmd, int flags)
4361{
4362 struct page *page = NULL;
4363 spinlock_t *ptl;
4364retry:
4365 ptl = pmd_lockptr(mm, pmd);
4366 spin_lock(ptl);
4367
4368
4369
4370
4371 if (!pmd_huge(*pmd))
4372 goto out;
4373 if (pmd_present(*pmd)) {
4374 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
4375 if (flags & FOLL_GET)
4376 get_page(page);
4377 } else {
4378 if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
4379 spin_unlock(ptl);
4380 __migration_entry_wait(mm, (pte_t *)pmd, ptl);
4381 goto retry;
4382 }
4383
4384
4385
4386
4387 }
4388out:
4389 spin_unlock(ptl);
4390 return page;
4391}
4392
4393struct page * __weak
4394follow_huge_pud(struct mm_struct *mm, unsigned long address,
4395 pud_t *pud, int flags)
4396{
4397 if (flags & FOLL_GET)
4398 return NULL;
4399
4400 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
4401}
4402
4403#ifdef CONFIG_MEMORY_FAILURE
4404
4405
4406
4407
4408int dequeue_hwpoisoned_huge_page(struct page *hpage)
4409{
4410 struct hstate *h = page_hstate(hpage);
4411 int nid = page_to_nid(hpage);
4412 int ret = -EBUSY;
4413
4414 spin_lock(&hugetlb_lock);
4415
4416
4417
4418
4419 if (!page_huge_active(hpage) && !page_count(hpage)) {
4420
4421
4422
4423
4424
4425
4426 list_del_init(&hpage->lru);
4427 set_page_refcounted(hpage);
4428 h->free_huge_pages--;
4429 h->free_huge_pages_node[nid]--;
4430 ret = 0;
4431 }
4432 spin_unlock(&hugetlb_lock);
4433 return ret;
4434}
4435#endif
4436
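/*
 * Migration support: isolate_huge_page() moves an active huge page onto the
 * caller's list (taking a reference); putback_active_hugepage() returns an
 * isolated page to its hstate's active list and drops that reference.
 */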
4437bool isolate_huge_page(struct page *page, struct list_head *list)
4438{
4439 bool ret = true;
4440
4441 VM_BUG_ON_PAGE(!PageHead(page), page);
4442 spin_lock(&hugetlb_lock);
4443 if (!page_huge_active(page) || !get_page_unless_zero(page)) {
4444 ret = false;
4445 goto unlock;
4446 }
4447 clear_page_huge_active(page);
4448 list_move_tail(&page->lru, list);
4449unlock:
4450 spin_unlock(&hugetlb_lock);
4451 return ret;
4452}
4453
4454void putback_active_hugepage(struct page *page)
4455{
4456 VM_BUG_ON_PAGE(!PageHead(page), page);
4457 spin_lock(&hugetlb_lock);
4458 set_page_huge_active(page);
4459 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
4460 spin_unlock(&hugetlb_lock);
4461 put_page(page);
4462}
4463