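/*
 * NUMA memory policy support for processes and VMAs.
 *
 * Supported policies:
 *   MPOL_DEFAULT    - fall back to the surrounding scope (VMA, task, system)
 *   MPOL_PREFERRED  - prefer one node, fall back to others when it is full
 *   MPOL_BIND       - restrict allocations to a set of nodes
 *   MPOL_INTERLEAVE - spread allocations round-robin over a set of nodes
 *
 * The policy for an address is looked up from the VMA first (set with
 * mbind(2) or a filesystem's ->get_policy hook), then from the task policy
 * (set with set_mempolicy(2)), then from the system default policy.
 *
 * Illustrative user-space usage via the raw syscalls (not a complete
 * program; bits in the mask select nodes and maxnode bounds how many bits
 * the kernel reads):
 *
 *	unsigned long nodes = 0x3;
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes, 3);
 *	mbind(addr, len, MPOL_BIND, &nodes, 3, MPOL_MF_MOVE);
 */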
68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70#include <linux/mempolicy.h>
71#include <linux/pagewalk.h>
72#include <linux/highmem.h>
73#include <linux/hugetlb.h>
74#include <linux/kernel.h>
75#include <linux/sched.h>
76#include <linux/sched/mm.h>
77#include <linux/sched/numa_balancing.h>
78#include <linux/sched/task.h>
79#include <linux/nodemask.h>
80#include <linux/cpuset.h>
81#include <linux/slab.h>
82#include <linux/string.h>
83#include <linux/export.h>
84#include <linux/nsproxy.h>
85#include <linux/interrupt.h>
86#include <linux/init.h>
87#include <linux/compat.h>
88#include <linux/ptrace.h>
89#include <linux/swap.h>
90#include <linux/seq_file.h>
91#include <linux/proc_fs.h>
92#include <linux/migrate.h>
93#include <linux/ksm.h>
94#include <linux/rmap.h>
95#include <linux/security.h>
96#include <linux/syscalls.h>
97#include <linux/ctype.h>
98#include <linux/mm_inline.h>
99#include <linux/mmu_notifier.h>
100#include <linux/printk.h>
101#include <linux/swapops.h>
102
103#include <asm/tlbflush.h>
104#include <linux/uaccess.h>
105
106#include "internal.h"
107
108
109#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)
110#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)
111
112static struct kmem_cache *policy_cache;
113static struct kmem_cache *sn_cache;
114
115
116
117enum zone_type policy_zone = 0;
118
119
120
121
122static struct mempolicy default_policy = {
123 .refcnt = ATOMIC_INIT(1),
124 .mode = MPOL_PREFERRED,
125 .flags = MPOL_F_LOCAL,
126};
127
128static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129
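/*
 * numa_map_to_online_node - map an arbitrary node to an online node
 *
 * NUMA_NO_NODE is treated as node 0.  If the node is offline, return the
 * online node with the smallest node_distance() to it instead.
 */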
136int numa_map_to_online_node(int node)
137{
138 int min_node;
139
140 if (node == NUMA_NO_NODE)
141 node = 0;
142
143 min_node = node;
144 if (!node_online(node)) {
145 int min_dist = INT_MAX, dist, n;
146
147 for_each_online_node(n) {
148 dist = node_distance(node, n);
149 if (dist < min_dist) {
150 min_dist = dist;
151 min_node = n;
152 }
153 }
154 }
155
156 return min_node;
157}
158EXPORT_SYMBOL_GPL(numa_map_to_online_node);
159
160struct mempolicy *get_task_policy(struct task_struct *p)
161{
162 struct mempolicy *pol = p->mempolicy;
163 int node;
164
165 if (pol)
166 return pol;
167
168 node = numa_node_id();
169 if (node != NUMA_NO_NODE) {
170 pol = &preferred_node_policy[node];
171
172 if (pol->mode)
173 return pol;
174 }
175
176 return &default_policy;
177}
178
179static const struct mempolicy_operations {
180 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
181 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
182} mpol_ops[MPOL_MAX];
183
184static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
185{
186 return pol->flags & MPOL_MODE_FLAGS;
187}
188
189static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
190 const nodemask_t *rel)
191{
192 nodemask_t tmp;
193 nodes_fold(tmp, *orig, nodes_weight(*rel));
194 nodes_onto(*ret, tmp, *rel);
195}
196
197static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
198{
199 if (nodes_empty(*nodes))
200 return -EINVAL;
201 pol->v.nodes = *nodes;
202 return 0;
203}
204
205static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
206{
207 if (!nodes)
208 pol->flags |= MPOL_F_LOCAL;
209 else if (nodes_empty(*nodes))
210 return -EINVAL;
211 else
212 pol->v.preferred_node = first_node(*nodes);
213 return 0;
214}
215
216static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
217{
218 if (nodes_empty(*nodes))
219 return -EINVAL;
220 pol->v.nodes = *nodes;
221 return 0;
222}
223
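
/*
 * mpol_set_nodemask() is called after mpol_new() to set up the policy's
 * nodemask from the user-supplied @nodes, folded or intersected with the
 * current cpuset's allowed memory nodes.  It also records either the user
 * nodemask or the current cpuset mems_allowed for later rebinds.  @nsc
 * provides scratch nodemasks so large masks stay off the stack.
 *
 * Must be called holding the task's alloc_lock (task_lock()).
 */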
233static int mpol_set_nodemask(struct mempolicy *pol,
234 const nodemask_t *nodes, struct nodemask_scratch *nsc)
235{
236 int ret;
237
238
239 if (pol == NULL)
240 return 0;
241
242 nodes_and(nsc->mask1,
243 cpuset_current_mems_allowed, node_states[N_MEMORY]);
244
245 VM_BUG_ON(!nodes);
246 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
247 nodes = NULL;
248 else {
249 if (pol->flags & MPOL_F_RELATIVE_NODES)
250 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
251 else
252 nodes_and(nsc->mask2, *nodes, nsc->mask1);
253
254 if (mpol_store_user_nodemask(pol))
255 pol->w.user_nodemask = *nodes;
256 else
257 pol->w.cpuset_mems_allowed =
258 cpuset_current_mems_allowed;
259 }
260
261 if (nodes)
262 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
263 else
264 ret = mpol_ops[pol->mode].create(pol, NULL);
265 return ret;
266}
267
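/*
 * Allocate and initialize a new mempolicy.  Returns NULL for MPOL_DEFAULT,
 * an ERR_PTR() for invalid mode/flags/nodes combinations, otherwise a
 * policy with a single reference whose nodemask still has to be set up
 * with mpol_set_nodemask().  MPOL_LOCAL is converted to "preferred local".
 */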
272static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
273 nodemask_t *nodes)
274{
275 struct mempolicy *policy;
276
277 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
278 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
279
280 if (mode == MPOL_DEFAULT) {
281 if (nodes && !nodes_empty(*nodes))
282 return ERR_PTR(-EINVAL);
283 return NULL;
284 }
285 VM_BUG_ON(!nodes);
286
287
288
289
290
291
292 if (mode == MPOL_PREFERRED) {
293 if (nodes_empty(*nodes)) {
294 if (((flags & MPOL_F_STATIC_NODES) ||
295 (flags & MPOL_F_RELATIVE_NODES)))
296 return ERR_PTR(-EINVAL);
297 }
298 } else if (mode == MPOL_LOCAL) {
299 if (!nodes_empty(*nodes) ||
300 (flags & MPOL_F_STATIC_NODES) ||
301 (flags & MPOL_F_RELATIVE_NODES))
302 return ERR_PTR(-EINVAL);
303 mode = MPOL_PREFERRED;
304 } else if (nodes_empty(*nodes))
305 return ERR_PTR(-EINVAL);
306 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
307 if (!policy)
308 return ERR_PTR(-ENOMEM);
309 atomic_set(&policy->refcnt, 1);
310 policy->mode = mode;
311 policy->flags = flags;
312
313 return policy;
314}
315
316
317void __mpol_put(struct mempolicy *p)
318{
319 if (!atomic_dec_and_test(&p->refcnt))
320 return;
321 kmem_cache_free(policy_cache, p);
322}
323
324static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
325{
326}
327
328static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
329{
330 nodemask_t tmp;
331
332 if (pol->flags & MPOL_F_STATIC_NODES)
333 nodes_and(tmp, pol->w.user_nodemask, *nodes);
334 else if (pol->flags & MPOL_F_RELATIVE_NODES)
335 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
336 else {
337 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
338 *nodes);
339 pol->w.cpuset_mems_allowed = *nodes;
340 }
341
342 if (nodes_empty(tmp))
343 tmp = *nodes;
344
345 pol->v.nodes = tmp;
346}
347
348static void mpol_rebind_preferred(struct mempolicy *pol,
349 const nodemask_t *nodes)
350{
351 nodemask_t tmp;
352
353 if (pol->flags & MPOL_F_STATIC_NODES) {
354 int node = first_node(pol->w.user_nodemask);
355
356 if (node_isset(node, *nodes)) {
357 pol->v.preferred_node = node;
358 pol->flags &= ~MPOL_F_LOCAL;
359 } else
360 pol->flags |= MPOL_F_LOCAL;
361 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
362 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
363 pol->v.preferred_node = first_node(tmp);
364 } else if (!(pol->flags & MPOL_F_LOCAL)) {
365 pol->v.preferred_node = node_remap(pol->v.preferred_node,
366 pol->w.cpuset_mems_allowed,
367 *nodes);
368 pol->w.cpuset_mems_allowed = *nodes;
369 }
370}
371
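/*
 * mpol_rebind_policy - rebind @pol to a new set of allowed nodes, typically
 * because the owning cpuset's mems_allowed changed.  Nothing to do when the
 * policy does not remember a user nodemask and already tracks @newmask;
 * otherwise the per-mode rebind handler remaps the policy's nodes.
 */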
379static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
380{
381 if (!pol)
382 return;
383 if (!mpol_store_user_nodemask(pol) &&
384 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
385 return;
386
387 mpol_ops[pol->mode].rebind(pol, newmask);
388}
389
390
391
392
393
394
395
396
397void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
398{
399 mpol_rebind_policy(tsk->mempolicy, new);
400}
401
402
403
404
405
406
407
408void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
409{
410 struct vm_area_struct *vma;
411
412 down_write(&mm->mmap_sem);
413 for (vma = mm->mmap; vma; vma = vma->vm_next)
414 mpol_rebind_policy(vma->vm_policy, new);
415 up_write(&mm->mmap_sem);
416}
417
418static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
419 [MPOL_DEFAULT] = {
420 .rebind = mpol_rebind_default,
421 },
422 [MPOL_INTERLEAVE] = {
423 .create = mpol_new_interleave,
424 .rebind = mpol_rebind_nodemask,
425 },
426 [MPOL_PREFERRED] = {
427 .create = mpol_new_preferred,
428 .rebind = mpol_rebind_preferred,
429 },
430 [MPOL_BIND] = {
431 .create = mpol_new_bind,
432 .rebind = mpol_rebind_nodemask,
433 },
434};
435
436static int migrate_page_add(struct page *page, struct list_head *pagelist,
437 unsigned long flags);
438
439struct queue_pages {
440 struct list_head *pagelist;
441 unsigned long flags;
442 nodemask_t *nmask;
443 struct vm_area_struct *prev;
444};
445
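/*
 * Check whether the walk should act on @page: its node must be in
 * qp->nmask, or must *not* be in it when MPOL_MF_INVERT is set.
 */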
452static inline bool queue_pages_required(struct page *page,
453 struct queue_pages *qp)
454{
455 int nid = page_to_nid(page);
456 unsigned long flags = qp->flags;
457
458 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
459}
460
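/*
 * queue_pages_pmd() return values:
 * 0    - the THP was skipped or queued for migration successfully
 * 1    - the THP could not be isolated although MPOL_MF_MOVE* was requested
 * 2    - the huge zero page was split; the caller should scan the PTEs
 * -EIO - a PMD migration entry was found, or the THP fails the node test
 *        and no MPOL_MF_MOVE* flag was given
 */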
471static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
472 unsigned long end, struct mm_walk *walk)
473{
474 int ret = 0;
475 struct page *page;
476 struct queue_pages *qp = walk->private;
477 unsigned long flags;
478
479 if (unlikely(is_pmd_migration_entry(*pmd))) {
480 ret = -EIO;
481 goto unlock;
482 }
483 page = pmd_page(*pmd);
484 if (is_huge_zero_page(page)) {
485 spin_unlock(ptl);
486 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
487 ret = 2;
488 goto out;
489 }
490 if (!queue_pages_required(page, qp))
491 goto unlock;
492
493 flags = qp->flags;
494
495 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
496 if (!vma_migratable(walk->vma) ||
497 migrate_page_add(page, qp->pagelist, flags)) {
498 ret = 1;
499 goto unlock;
500 }
501 } else
502 ret = -EIO;
503unlock:
504 spin_unlock(ptl);
505out:
506 return ret;
507}
508
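/*
 * Scan the PTEs mapped by @pmd and queue misplaced pages for migration.
 * Return values:
 * 0    - all pages either pass the node test or were queued successfully
 * 1    - an unmovable page was found while MPOL_MF_MOVE* was requested
 * -EIO - a page fails the node test and no MPOL_MF_MOVE* flag was given
 * Results from queue_pages_pmd() are passed through when it handled a THP.
 */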
520static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
521 unsigned long end, struct mm_walk *walk)
522{
523 struct vm_area_struct *vma = walk->vma;
524 struct page *page;
525 struct queue_pages *qp = walk->private;
526 unsigned long flags = qp->flags;
527 int ret;
528 bool has_unmovable = false;
529 pte_t *pte, *mapped_pte;
530 spinlock_t *ptl;
531
532 ptl = pmd_trans_huge_lock(pmd, vma);
533 if (ptl) {
534 ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
535 if (ret != 2)
536 return ret;
537 }
538
539
540 if (pmd_trans_unstable(pmd))
541 return 0;
542
543 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
544 for (; addr != end; pte++, addr += PAGE_SIZE) {
545 if (!pte_present(*pte))
546 continue;
547 page = vm_normal_page(vma, addr, *pte);
548 if (!page)
549 continue;
550
551
552
553
554 if (PageReserved(page))
555 continue;
556 if (!queue_pages_required(page, qp))
557 continue;
558 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
559
560 if (!vma_migratable(vma)) {
561 has_unmovable = true;
562 break;
563 }
564
565
566
567
568
569
570 if (migrate_page_add(page, qp->pagelist, flags))
571 has_unmovable = true;
572 } else
573 break;
574 }
575 pte_unmap_unlock(mapped_pte, ptl);
576 cond_resched();
577
578 if (has_unmovable)
579 return 1;
580
581 return addr != end ? -EIO : 0;
582}
583
584static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
585 unsigned long addr, unsigned long end,
586 struct mm_walk *walk)
587{
588#ifdef CONFIG_HUGETLB_PAGE
589 struct queue_pages *qp = walk->private;
590 unsigned long flags = qp->flags;
591 struct page *page;
592 spinlock_t *ptl;
593 pte_t entry;
594
595 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
596 entry = huge_ptep_get(pte);
597 if (!pte_present(entry))
598 goto unlock;
599 page = pte_page(entry);
600 if (!queue_pages_required(page, qp))
601 goto unlock;
602
603 if (flags & (MPOL_MF_MOVE_ALL) ||
604 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
605 isolate_huge_page(page, qp->pagelist);
606unlock:
607 spin_unlock(ptl);
608#else
609 BUG();
610#endif
611 return 0;
612}
613
614#ifdef CONFIG_NUMA_BALANCING
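/*
 * change_prot_numa - mark the PTEs in [addr, end) with NUMA hinting
 * protections (PROT_NONE-style) so that later faults can record access
 * locality and possibly migrate the pages.  Returns the number of PTEs
 * updated, accounted as NUMA_PTE_UPDATES.
 */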
624unsigned long change_prot_numa(struct vm_area_struct *vma,
625 unsigned long addr, unsigned long end)
626{
627 int nr_updated;
628
629 nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
630 if (nr_updated)
631 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
632
633 return nr_updated;
634}
635#else
636static unsigned long change_prot_numa(struct vm_area_struct *vma,
637 unsigned long addr, unsigned long end)
638{
639 return 0;
640}
641#endif
642
643static int queue_pages_test_walk(unsigned long start, unsigned long end,
644 struct mm_walk *walk)
645{
646 struct vm_area_struct *vma = walk->vma;
647 struct queue_pages *qp = walk->private;
648 unsigned long endvma = vma->vm_end;
649 unsigned long flags = qp->flags;
650
651
652 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
653 if (!vma->vm_next && vma->vm_end < end)
654 return -EFAULT;
655 if (qp->prev && qp->prev->vm_end < vma->vm_start)
656 return -EFAULT;
657 }
658
659 qp->prev = vma;
660
661
662
663
664
665 if (!vma_migratable(vma) &&
666 !(flags & MPOL_MF_STRICT))
667 return 1;
668
669 if (endvma > end)
670 endvma = end;
671 if (vma->vm_start > start)
672 start = vma->vm_start;
673
674 if (flags & MPOL_MF_LAZY) {
675
676 if (!is_vm_hugetlb_page(vma) &&
677 (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
678 !(vma->vm_flags & VM_MIXEDMAP))
679 change_prot_numa(vma, start, endvma);
680 return 1;
681 }
682
683
684 if (flags & MPOL_MF_VALID)
685 return 0;
686 return 1;
687}
688
689static const struct mm_walk_ops queue_pages_walk_ops = {
690 .hugetlb_entry = queue_pages_hugetlb,
691 .pmd_entry = queue_pages_pte_range,
692 .test_walk = queue_pages_test_walk,
693};
694
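/*
 * Walk [start, end) and queue pages whose node placement matches
 * @nodes/@flags on @pagelist for migration.  Return values:
 * 0       - all misplaced pages were queued (or none were found)
 * 1       - an unmovable page was found while MPOL_MF_MOVE* was requested
 * -EFAULT - the range contains a hole and MPOL_MF_DISCONTIG_OK is not set
 * other negative errno from the entry handlers (e.g. -EIO for a strict
 * policy violation)
 */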
710static int
711queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
712 nodemask_t *nodes, unsigned long flags,
713 struct list_head *pagelist)
714{
715 struct queue_pages qp = {
716 .pagelist = pagelist,
717 .flags = flags,
718 .nmask = nodes,
719 .prev = NULL,
720 };
721
722 return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
723}
724
725
726
727
728
729static int vma_replace_policy(struct vm_area_struct *vma,
730 struct mempolicy *pol)
731{
732 int err;
733 struct mempolicy *old;
734 struct mempolicy *new;
735
736 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
737 vma->vm_start, vma->vm_end, vma->vm_pgoff,
738 vma->vm_ops, vma->vm_file,
739 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
740
741 new = mpol_dup(pol);
742 if (IS_ERR(new))
743 return PTR_ERR(new);
744
745 if (vma->vm_ops && vma->vm_ops->set_policy) {
746 err = vma->vm_ops->set_policy(vma, new);
747 if (err)
748 goto err_out;
749 }
750
751 old = vma->vm_policy;
752 vma->vm_policy = new;
753 mpol_put(old);
754
755 return 0;
756 err_out:
757 mpol_put(new);
758 return err;
759}
760
761
762static int mbind_range(struct mm_struct *mm, unsigned long start,
763 unsigned long end, struct mempolicy *new_pol)
764{
765 struct vm_area_struct *next;
766 struct vm_area_struct *prev;
767 struct vm_area_struct *vma;
768 int err = 0;
769 pgoff_t pgoff;
770 unsigned long vmstart;
771 unsigned long vmend;
772
773 vma = find_vma(mm, start);
774 if (!vma || vma->vm_start > start)
775 return -EFAULT;
776
777 prev = vma->vm_prev;
778 if (start > vma->vm_start)
779 prev = vma;
780
781 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
782 next = vma->vm_next;
783 vmstart = max(start, vma->vm_start);
784 vmend = min(end, vma->vm_end);
785
786 if (mpol_equal(vma_policy(vma), new_pol))
787 continue;
788
789 pgoff = vma->vm_pgoff +
790 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
791 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
792 vma->anon_vma, vma->vm_file, pgoff,
793 new_pol, vma->vm_userfaultfd_ctx);
794 if (prev) {
795 vma = prev;
796 next = vma->vm_next;
797 if (mpol_equal(vma_policy(vma), new_pol))
798 continue;
799
800 goto replace;
801 }
802 if (vma->vm_start != vmstart) {
803 err = split_vma(vma->vm_mm, vma, vmstart, 1);
804 if (err)
805 goto out;
806 }
807 if (vma->vm_end != vmend) {
808 err = split_vma(vma->vm_mm, vma, vmend, 0);
809 if (err)
810 goto out;
811 }
812 replace:
813 err = vma_replace_policy(vma, new_pol);
814 if (err)
815 goto out;
816 }
817
818 out:
819 return err;
820}
821
822
823static long do_set_mempolicy(unsigned short mode, unsigned short flags,
824 nodemask_t *nodes)
825{
826 struct mempolicy *new, *old;
827 NODEMASK_SCRATCH(scratch);
828 int ret;
829
830 if (!scratch)
831 return -ENOMEM;
832
833 new = mpol_new(mode, flags, nodes);
834 if (IS_ERR(new)) {
835 ret = PTR_ERR(new);
836 goto out;
837 }
838
839 task_lock(current);
840 ret = mpol_set_nodemask(new, nodes, scratch);
841 if (ret) {
842 task_unlock(current);
843 mpol_put(new);
844 goto out;
845 }
846 old = current->mempolicy;
847 current->mempolicy = new;
848 if (new && new->mode == MPOL_INTERLEAVE)
849 current->il_prev = MAX_NUMNODES-1;
850 task_unlock(current);
851 mpol_put(old);
852 ret = 0;
853out:
854 NODEMASK_SCRATCH_FREE(scratch);
855 return ret;
856}
857
858
859
860
861
862
863static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
864{
865 nodes_clear(*nodes);
866 if (p == &default_policy)
867 return;
868
869 switch (p->mode) {
870 case MPOL_BIND:
871
872 case MPOL_INTERLEAVE:
873 *nodes = p->v.nodes;
874 break;
875 case MPOL_PREFERRED:
876 if (!(p->flags & MPOL_F_LOCAL))
877 node_set(p->v.preferred_node, *nodes);
878
879 break;
880 default:
881 BUG();
882 }
883}
884
885static int lookup_node(struct mm_struct *mm, unsigned long addr)
886{
887 struct page *p = NULL;
888 int err;
889
890 int locked = 1;
891 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
892 if (err == 0) {
893
894 err = -EFAULT;
895 } else if (err > 0) {
896 err = page_to_nid(p);
897 put_page(p);
898 }
899 if (locked)
900 up_read(&mm->mmap_sem);
901 return err;
902}
903
904
905static long do_get_mempolicy(int *policy, nodemask_t *nmask,
906 unsigned long addr, unsigned long flags)
907{
908 int err;
909 struct mm_struct *mm = current->mm;
910 struct vm_area_struct *vma = NULL;
911 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
912
913 if (flags &
914 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
915 return -EINVAL;
916
917 if (flags & MPOL_F_MEMS_ALLOWED) {
918 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
919 return -EINVAL;
920 *policy = 0;
921 task_lock(current);
922 *nmask = cpuset_current_mems_allowed;
923 task_unlock(current);
924 return 0;
925 }
926
927 if (flags & MPOL_F_ADDR) {
928
929
930
931
932
933 down_read(&mm->mmap_sem);
934 vma = find_vma_intersection(mm, addr, addr+1);
935 if (!vma) {
936 up_read(&mm->mmap_sem);
937 return -EFAULT;
938 }
939 if (vma->vm_ops && vma->vm_ops->get_policy)
940 pol = vma->vm_ops->get_policy(vma, addr);
941 else
942 pol = vma->vm_policy;
943 } else if (addr)
944 return -EINVAL;
945
946 if (!pol)
947 pol = &default_policy;
948
949 if (flags & MPOL_F_NODE) {
950 if (flags & MPOL_F_ADDR) {
951
952
953
954
955
956
957 pol_refcount = pol;
958 vma = NULL;
959 mpol_get(pol);
960 err = lookup_node(mm, addr);
961 if (err < 0)
962 goto out;
963 *policy = err;
964 } else if (pol == current->mempolicy &&
965 pol->mode == MPOL_INTERLEAVE) {
966 *policy = next_node_in(current->il_prev, pol->v.nodes);
967 } else {
968 err = -EINVAL;
969 goto out;
970 }
971 } else {
972 *policy = pol == &default_policy ? MPOL_DEFAULT :
973 pol->mode;
974
975
976
977
978 *policy |= (pol->flags & MPOL_MODE_FLAGS);
979 }
980
981 err = 0;
982 if (nmask) {
983 if (mpol_store_user_nodemask(pol)) {
984 *nmask = pol->w.user_nodemask;
985 } else {
986 task_lock(current);
987 get_policy_nodemask(pol, nmask);
988 task_unlock(current);
989 }
990 }
991
992 out:
993 mpol_cond_put(pol);
994 if (vma)
995 up_read(&mm->mmap_sem);
996 if (pol_refcount)
997 mpol_put(pol_refcount);
998 return err;
999}
1000
1001#ifdef CONFIG_MIGRATION
1002
1003
1004
1005static int migrate_page_add(struct page *page, struct list_head *pagelist,
1006 unsigned long flags)
1007{
1008 struct page *head = compound_head(page);
1009
1010
1011
1012 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1013 if (!isolate_lru_page(head)) {
1014 list_add_tail(&head->lru, pagelist);
1015 mod_node_page_state(page_pgdat(head),
1016 NR_ISOLATED_ANON + page_is_file_lru(head),
1017 thp_nr_pages(head));
1018 } else if (flags & MPOL_MF_STRICT) {
1019
1020
1021
1022
1023
1024
1025
1026 return -EIO;
1027 }
1028 }
1029
1030 return 0;
1031}
1032
1033
1034struct page *alloc_new_node_page(struct page *page, unsigned long node)
1035{
1036 if (PageHuge(page))
1037 return alloc_huge_page_node(page_hstate(compound_head(page)),
1038 node);
1039 else if (PageTransHuge(page)) {
1040 struct page *thp;
1041
1042 thp = alloc_pages_node(node,
1043 (GFP_TRANSHUGE | __GFP_THISNODE),
1044 HPAGE_PMD_ORDER);
1045 if (!thp)
1046 return NULL;
1047 prep_transhuge_page(thp);
1048 return thp;
1049 } else
1050 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1051 __GFP_THISNODE, 0);
1052}
1053
1054
1055
1056
1057
1058static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1059 int flags)
1060{
1061 nodemask_t nmask;
1062 LIST_HEAD(pagelist);
1063 int err = 0;
1064
1065 nodes_clear(nmask);
1066 node_set(source, nmask);
1067
1068
1069
1070
1071
1072
1073 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1074 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1075 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1076
1077 if (!list_empty(&pagelist)) {
1078 err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1079 MIGRATE_SYNC, MR_SYSCALL);
1080 if (err)
1081 putback_movable_pages(&pagelist);
1082 }
1083
1084 return err;
1085}
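
/*
 * do_migrate_pages - move the pages of @mm that live on nodes in @from to
 * nodes in @to, trying to preserve the relative node layout.  Returns the
 * number of pages that could not be moved, or a negative errno.
 */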
1093int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1094 const nodemask_t *to, int flags)
1095{
1096 int busy = 0;
1097 int err;
1098 nodemask_t tmp;
1099
1100 err = migrate_prep();
1101 if (err)
1102 return err;
1103
1104 down_read(&mm->mmap_sem);
1105
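	/*
	 * Repeatedly pick a <source, dest> pair and migrate everything from
	 * source to dest.  A source whose destination (via node_remap()) is
	 * not itself still pending as a source is preferred, so we avoid
	 * piling pages onto a node that is about to be drained again.  Each
	 * chosen source is cleared from 'tmp' until no candidates remain.
	 */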
1137 tmp = *from;
1138 while (!nodes_empty(tmp)) {
1139 int s,d;
1140 int source = NUMA_NO_NODE;
1141 int dest = 0;
1142
1143 for_each_node_mask(s, tmp) {
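			/*
			 * If 'from' and 'to' have different weights, the
			 * relative node relationship cannot be preserved, so
			 * skip source nodes that are also destinations and
			 * drain the departing nodes first.  With equal
			 * weights such nodes are allowed; otherwise fully
			 * overlapping masks (e.g. swapping nodes 0 and 7)
			 * would never make progress.
			 */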
1160 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1161 (node_isset(s, *to)))
1162 continue;
1163
1164 d = node_remap(s, *from, *to);
1165 if (s == d)
1166 continue;
1167
1168 source = s;
1169 dest = d;
1170
1171
1172 if (!node_isset(dest, tmp))
1173 break;
1174 }
1175 if (source == NUMA_NO_NODE)
1176 break;
1177
1178 node_clear(source, tmp);
1179 err = migrate_to_node(mm, source, dest, flags);
1180 if (err > 0)
1181 busy += err;
1182 if (err < 0)
1183 break;
1184 }
1185 up_read(&mm->mmap_sem);
1186 if (err < 0)
1187 return err;
1188 return busy;
1189
1190}
1191
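/*
 * Allocation callback for mbind() page migration: find the VMA (searching
 * from @start) that maps @page and allocate the replacement page according
 * to that VMA's policy, with hugetlb and THP handled specially.
 */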
1199static struct page *new_page(struct page *page, unsigned long start)
1200{
1201 struct vm_area_struct *vma;
1202 unsigned long uninitialized_var(address);
1203
1204 vma = find_vma(current->mm, start);
1205 while (vma) {
1206 address = page_address_in_vma(page, vma);
1207 if (address != -EFAULT)
1208 break;
1209 vma = vma->vm_next;
1210 }
1211
1212 if (PageHuge(page)) {
1213 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1214 vma, address);
1215 } else if (PageTransHuge(page)) {
1216 struct page *thp;
1217
1218 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1219 HPAGE_PMD_ORDER);
1220 if (!thp)
1221 return NULL;
1222 prep_transhuge_page(thp);
1223 return thp;
1224 }
1225
1226
1227
1228 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1229 vma, address);
1230}
1231#else
1232
1233static int migrate_page_add(struct page *page, struct list_head *pagelist,
1234 unsigned long flags)
1235{
1236 return -EIO;
1237}
1238
1239int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1240 const nodemask_t *to, int flags)
1241{
1242 return -ENOSYS;
1243}
1244
1245static struct page *new_page(struct page *page, unsigned long start)
1246{
1247 return NULL;
1248}
1249#endif
1250
1251static long do_mbind(unsigned long start, unsigned long len,
1252 unsigned short mode, unsigned short mode_flags,
1253 nodemask_t *nmask, unsigned long flags)
1254{
1255 struct mm_struct *mm = current->mm;
1256 struct mempolicy *new;
1257 unsigned long end;
1258 int err;
1259 int ret;
1260 LIST_HEAD(pagelist);
1261
1262 if (flags & ~(unsigned long)MPOL_MF_VALID)
1263 return -EINVAL;
1264 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1265 return -EPERM;
1266
1267 if (start & ~PAGE_MASK)
1268 return -EINVAL;
1269
1270 if (mode == MPOL_DEFAULT)
1271 flags &= ~MPOL_MF_STRICT;
1272
1273 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1274 end = start + len;
1275
1276 if (end < start)
1277 return -EINVAL;
1278 if (end == start)
1279 return 0;
1280
1281 new = mpol_new(mode, mode_flags, nmask);
1282 if (IS_ERR(new))
1283 return PTR_ERR(new);
1284
1285 if (flags & MPOL_MF_LAZY)
1286 new->flags |= MPOL_F_MOF;
1287
1288
1289
1290
1291
1292 if (!new)
1293 flags |= MPOL_MF_DISCONTIG_OK;
1294
1295 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1296 start, start + len, mode, mode_flags,
1297 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1298
1299 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1300
1301 err = migrate_prep();
1302 if (err)
1303 goto mpol_out;
1304 }
1305 {
1306 NODEMASK_SCRATCH(scratch);
1307 if (scratch) {
1308 down_write(&mm->mmap_sem);
1309 task_lock(current);
1310 err = mpol_set_nodemask(new, nmask, scratch);
1311 task_unlock(current);
1312 if (err)
1313 up_write(&mm->mmap_sem);
1314 } else
1315 err = -ENOMEM;
1316 NODEMASK_SCRATCH_FREE(scratch);
1317 }
1318 if (err)
1319 goto mpol_out;
1320
1321 ret = queue_pages_range(mm, start, end, nmask,
1322 flags | MPOL_MF_INVERT, &pagelist);
1323
1324 if (ret < 0) {
1325 err = ret;
1326 goto up_out;
1327 }
1328
1329 err = mbind_range(mm, start, end, new);
1330
1331 if (!err) {
1332 int nr_failed = 0;
1333
1334 if (!list_empty(&pagelist)) {
1335 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1336 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1337 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1338 if (nr_failed)
1339 putback_movable_pages(&pagelist);
1340 }
1341
1342 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1343 err = -EIO;
1344 } else {
1345up_out:
1346 if (!list_empty(&pagelist))
1347 putback_movable_pages(&pagelist);
1348 }
1349
1350 up_write(&mm->mmap_sem);
1351mpol_out:
1352 mpol_put(new);
1353 return err;
1354}
1355
1356
1357
1358
1359
1360
1361static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1362 unsigned long maxnode)
1363{
1364 unsigned long k;
1365 unsigned long t;
1366 unsigned long nlongs;
1367 unsigned long endmask;
1368
1369 --maxnode;
1370 nodes_clear(*nodes);
1371 if (maxnode == 0 || !nmask)
1372 return 0;
1373 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1374 return -EINVAL;
1375
1376 nlongs = BITS_TO_LONGS(maxnode);
1377 if ((maxnode % BITS_PER_LONG) == 0)
1378 endmask = ~0UL;
1379 else
1380 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1392 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1393 if (get_user(t, nmask + k))
1394 return -EFAULT;
1395 if (k == nlongs - 1) {
1396 if (t & endmask)
1397 return -EINVAL;
1398 } else if (t)
1399 return -EINVAL;
1400 }
1401 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1402 endmask = ~0UL;
1403 }
1404
1405 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1406 unsigned long valid_mask = endmask;
1407
1408 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1409 if (get_user(t, nmask + nlongs - 1))
1410 return -EFAULT;
1411 if (t & valid_mask)
1412 return -EINVAL;
1413 }
1414
1415 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1416 return -EFAULT;
1417 nodes_addr(*nodes)[nlongs-1] &= endmask;
1418 return 0;
1419}
1420
1421
1422static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1423 nodemask_t *nodes)
1424{
1425 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1426 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1427
1428 if (copy > nbytes) {
1429 if (copy > PAGE_SIZE)
1430 return -EINVAL;
1431 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1432 return -EFAULT;
1433 copy = nbytes;
1434 }
1435 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1436}
1437
1438static long kernel_mbind(unsigned long start, unsigned long len,
1439 unsigned long mode, const unsigned long __user *nmask,
1440 unsigned long maxnode, unsigned int flags)
1441{
1442 nodemask_t nodes;
1443 int err;
1444 unsigned short mode_flags;
1445
1446 start = untagged_addr(start);
1447 mode_flags = mode & MPOL_MODE_FLAGS;
1448 mode &= ~MPOL_MODE_FLAGS;
1449 if (mode >= MPOL_MAX)
1450 return -EINVAL;
1451 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1452 (mode_flags & MPOL_F_RELATIVE_NODES))
1453 return -EINVAL;
1454 err = get_nodes(&nodes, nmask, maxnode);
1455 if (err)
1456 return err;
1457 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1458}
1459
1460SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1461 unsigned long, mode, const unsigned long __user *, nmask,
1462 unsigned long, maxnode, unsigned int, flags)
1463{
1464 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1465}
1466
1467
1468static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1469 unsigned long maxnode)
1470{
1471 int err;
1472 nodemask_t nodes;
1473 unsigned short flags;
1474
1475 flags = mode & MPOL_MODE_FLAGS;
1476 mode &= ~MPOL_MODE_FLAGS;
1477 if ((unsigned int)mode >= MPOL_MAX)
1478 return -EINVAL;
1479 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1480 return -EINVAL;
1481 err = get_nodes(&nodes, nmask, maxnode);
1482 if (err)
1483 return err;
1484 return do_set_mempolicy(mode, flags, &nodes);
1485}
1486
1487SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1488 unsigned long, maxnode)
1489{
1490 return kernel_set_mempolicy(mode, nmask, maxnode);
1491}
1492
1493static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1494 const unsigned long __user *old_nodes,
1495 const unsigned long __user *new_nodes)
1496{
1497 struct mm_struct *mm = NULL;
1498 struct task_struct *task;
1499 nodemask_t task_nodes;
1500 int err;
1501 nodemask_t *old;
1502 nodemask_t *new;
1503 NODEMASK_SCRATCH(scratch);
1504
1505 if (!scratch)
1506 return -ENOMEM;
1507
1508 old = &scratch->mask1;
1509 new = &scratch->mask2;
1510
1511 err = get_nodes(old, old_nodes, maxnode);
1512 if (err)
1513 goto out;
1514
1515 err = get_nodes(new, new_nodes, maxnode);
1516 if (err)
1517 goto out;
1518
1519
1520 rcu_read_lock();
1521 task = pid ? find_task_by_vpid(pid) : current;
1522 if (!task) {
1523 rcu_read_unlock();
1524 err = -ESRCH;
1525 goto out;
1526 }
1527 get_task_struct(task);
1528
1529 err = -EINVAL;
1530
1531
1532
1533
1534
1535 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1536 rcu_read_unlock();
1537 err = -EPERM;
1538 goto out_put;
1539 }
1540 rcu_read_unlock();
1541
1542 task_nodes = cpuset_mems_allowed(task);
1543
1544 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1545 err = -EPERM;
1546 goto out_put;
1547 }
1548
1549 task_nodes = cpuset_mems_allowed(current);
1550 nodes_and(*new, *new, task_nodes);
1551 if (nodes_empty(*new))
1552 goto out_put;
1553
1554 nodes_and(*new, *new, node_states[N_MEMORY]);
1555 if (nodes_empty(*new))
1556 goto out_put;
1557
1558 err = security_task_movememory(task);
1559 if (err)
1560 goto out_put;
1561
1562 mm = get_task_mm(task);
1563 put_task_struct(task);
1564
1565 if (!mm) {
1566 err = -EINVAL;
1567 goto out;
1568 }
1569
1570 err = do_migrate_pages(mm, old, new,
1571 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1572
1573 mmput(mm);
1574out:
1575 NODEMASK_SCRATCH_FREE(scratch);
1576
1577 return err;
1578
1579out_put:
1580 put_task_struct(task);
1581 goto out;
1582
1583}
1584
1585SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1586 const unsigned long __user *, old_nodes,
1587 const unsigned long __user *, new_nodes)
1588{
1589 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1590}
1591
1592
1593
1594static int kernel_get_mempolicy(int __user *policy,
1595 unsigned long __user *nmask,
1596 unsigned long maxnode,
1597 unsigned long addr,
1598 unsigned long flags)
1599{
1600 int err;
1601 int uninitialized_var(pval);
1602 nodemask_t nodes;
1603
1604 addr = untagged_addr(addr);
1605
1606 if (nmask != NULL && maxnode < nr_node_ids)
1607 return -EINVAL;
1608
1609 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1610
1611 if (err)
1612 return err;
1613
1614 if (policy && put_user(pval, policy))
1615 return -EFAULT;
1616
1617 if (nmask)
1618 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1619
1620 return err;
1621}
1622
1623SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1624 unsigned long __user *, nmask, unsigned long, maxnode,
1625 unsigned long, addr, unsigned long, flags)
1626{
1627 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1628}
1629
1630#ifdef CONFIG_COMPAT
1631
1632COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1633 compat_ulong_t __user *, nmask,
1634 compat_ulong_t, maxnode,
1635 compat_ulong_t, addr, compat_ulong_t, flags)
1636{
1637 long err;
1638 unsigned long __user *nm = NULL;
1639 unsigned long nr_bits, alloc_size;
1640 DECLARE_BITMAP(bm, MAX_NUMNODES);
1641
1642 nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1643 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1644
1645 if (nmask)
1646 nm = compat_alloc_user_space(alloc_size);
1647
1648 err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1649
1650 if (!err && nmask) {
1651 unsigned long copy_size;
1652 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1653 err = copy_from_user(bm, nm, copy_size);
1654
1655 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1656 err |= compat_put_bitmap(nmask, bm, nr_bits);
1657 }
1658
1659 return err;
1660}
1661
1662COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1663 compat_ulong_t, maxnode)
1664{
1665 unsigned long __user *nm = NULL;
1666 unsigned long nr_bits, alloc_size;
1667 DECLARE_BITMAP(bm, MAX_NUMNODES);
1668
1669 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1670 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1671
1672 if (nmask) {
1673 if (compat_get_bitmap(bm, nmask, nr_bits))
1674 return -EFAULT;
1675 nm = compat_alloc_user_space(alloc_size);
1676 if (copy_to_user(nm, bm, alloc_size))
1677 return -EFAULT;
1678 }
1679
1680 return kernel_set_mempolicy(mode, nm, nr_bits+1);
1681}
1682
1683COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1684 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1685 compat_ulong_t, maxnode, compat_ulong_t, flags)
1686{
1687 unsigned long __user *nm = NULL;
1688 unsigned long nr_bits, alloc_size;
1689 nodemask_t bm;
1690
1691 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1692 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1693
1694 if (nmask) {
1695 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1696 return -EFAULT;
1697 nm = compat_alloc_user_space(alloc_size);
1698 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1699 return -EFAULT;
1700 }
1701
1702 return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1703}
1704
1705COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1706 compat_ulong_t, maxnode,
1707 const compat_ulong_t __user *, old_nodes,
1708 const compat_ulong_t __user *, new_nodes)
1709{
1710 unsigned long __user *old = NULL;
1711 unsigned long __user *new = NULL;
1712 nodemask_t tmp_mask;
1713 unsigned long nr_bits;
1714 unsigned long size;
1715
1716 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1717 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1718 if (old_nodes) {
1719 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1720 return -EFAULT;
1721 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1722 if (new_nodes)
1723 new = old + size / sizeof(unsigned long);
1724 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1725 return -EFAULT;
1726 }
1727 if (new_nodes) {
1728 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1729 return -EFAULT;
1730 if (new == NULL)
1731 new = compat_alloc_user_space(size);
1732 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1733 return -EFAULT;
1734 }
1735 return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1736}
1737
1738#endif
1739
1740struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1741 unsigned long addr)
1742{
1743 struct mempolicy *pol = NULL;
1744
1745 if (vma) {
1746 if (vma->vm_ops && vma->vm_ops->get_policy) {
1747 pol = vma->vm_ops->get_policy(vma, addr);
1748 } else if (vma->vm_policy) {
1749 pol = vma->vm_policy;
1750
1751
1752
1753
1754
1755
1756
1757 if (mpol_needs_cond_ref(pol))
1758 mpol_get(pol);
1759 }
1760 }
1761
1762 return pol;
1763}
1764
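/*
 * get_vma_policy(@vma, @addr) - return the effective policy for @addr:
 * the VMA's own policy (via ->get_policy for shared mappings) if any,
 * otherwise the task policy, otherwise the system default.  Shared
 * policies (MPOL_F_SHARED) are returned with an extra reference taken;
 * drop it with mpol_cond_put() when done.
 */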
1777static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1778 unsigned long addr)
1779{
1780 struct mempolicy *pol = __get_vma_policy(vma, addr);
1781
1782 if (!pol)
1783 pol = get_task_policy(current);
1784
1785 return pol;
1786}
1787
1788bool vma_policy_mof(struct vm_area_struct *vma)
1789{
1790 struct mempolicy *pol;
1791
1792 if (vma->vm_ops && vma->vm_ops->get_policy) {
1793 bool ret = false;
1794
1795 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1796 if (pol && (pol->flags & MPOL_F_MOF))
1797 ret = true;
1798 mpol_cond_put(pol);
1799
1800 return ret;
1801 }
1802
1803 pol = vma->vm_policy;
1804 if (!pol)
1805 pol = get_task_policy(current);
1806
1807 return pol->flags & MPOL_F_MOF;
1808}
1809
1810static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1811{
1812 enum zone_type dynamic_policy_zone = policy_zone;
1813
1814 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1825 dynamic_policy_zone = ZONE_MOVABLE;
1826
1827 return zone >= dynamic_policy_zone;
1828}
1829
1830
1831
1832
1833
1834static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1835{
1836
1837 if (unlikely(policy->mode == MPOL_BIND) &&
1838 apply_policy_zone(policy, gfp_zone(gfp)) &&
1839 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1840 return &policy->v.nodes;
1841
1842 return NULL;
1843}
1844
1845
1846static int policy_node(gfp_t gfp, struct mempolicy *policy,
1847 int nd)
1848{
1849 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1850 nd = policy->v.preferred_node;
1851 else {
1852
1853
1854
1855
1856
1857 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1858 }
1859
1860 return nd;
1861}
1862
1863
1864static unsigned interleave_nodes(struct mempolicy *policy)
1865{
1866 unsigned next;
1867 struct task_struct *me = current;
1868
1869 next = next_node_in(me->il_prev, policy->v.nodes);
1870 if (next < MAX_NUMNODES)
1871 me->il_prev = next;
1872 return next;
1873}
1874
1875
1876
1877
1878
1879unsigned int mempolicy_slab_node(void)
1880{
1881 struct mempolicy *policy;
1882 int node = numa_mem_id();
1883
1884 if (in_interrupt())
1885 return node;
1886
1887 policy = current->mempolicy;
1888 if (!policy || policy->flags & MPOL_F_LOCAL)
1889 return node;
1890
1891 switch (policy->mode) {
1892 case MPOL_PREFERRED:
1893
1894
1895
1896 return policy->v.preferred_node;
1897
1898 case MPOL_INTERLEAVE:
1899 return interleave_nodes(policy);
1900
1901 case MPOL_BIND: {
1902 struct zoneref *z;
1903
1904
1905
1906
1907
1908 struct zonelist *zonelist;
1909 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1910 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1911 z = first_zones_zonelist(zonelist, highest_zoneidx,
1912 &policy->v.nodes);
1913 return z->zone ? zone_to_nid(z->zone) : node;
1914 }
1915
1916 default:
1917 BUG();
1918 }
1919}
1920
1921
1922
1923
1924
1925
1926static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1927{
1928 unsigned nnodes = nodes_weight(pol->v.nodes);
1929 unsigned target;
1930 int i;
1931 int nid;
1932
1933 if (!nnodes)
1934 return numa_node_id();
1935 target = (unsigned int)n % nnodes;
1936 nid = first_node(pol->v.nodes);
1937 for (i = 0; i < target; i++)
1938 nid = next_node(nid, pol->v.nodes);
1939 return nid;
1940}
1941
1942
1943static inline unsigned interleave_nid(struct mempolicy *pol,
1944 struct vm_area_struct *vma, unsigned long addr, int shift)
1945{
1946 if (vma) {
1947 unsigned long off;
1948
1949
1950
1951
1952
1953
1954
1955
1956 BUG_ON(shift < PAGE_SHIFT);
1957 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1958 off += (addr - vma->vm_start) >> shift;
1959 return offset_il_node(pol, off);
1960 } else
1961 return interleave_nodes(pol);
1962}
1963
1964#ifdef CONFIG_HUGETLBFS
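/*
 * huge_node - return the node id from which to allocate a huge page for
 * @addr in @vma.  The effective policy is stored in *@mpol (release with
 * mpol_cond_put()) and, for MPOL_BIND, its nodemask in *@nodemask so the
 * caller can filter the zonelist.
 */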
1980int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1981 struct mempolicy **mpol, nodemask_t **nodemask)
1982{
1983 int nid;
1984
1985 *mpol = get_vma_policy(vma, addr);
1986 *nodemask = NULL;
1987
1988 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1989 nid = interleave_nid(*mpol, vma, addr,
1990 huge_page_shift(hstate_vma(vma)));
1991 } else {
1992 nid = policy_node(gfp_flags, *mpol, numa_node_id());
1993 if ((*mpol)->mode == MPOL_BIND)
1994 *nodemask = &(*mpol)->v.nodes;
1995 }
1996 return nid;
1997}
1998
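/*
 * init_nodemask_of_mempolicy - extract the current task's policy nodes
 * into @mask: the single preferred (or local) node for MPOL_PREFERRED,
 * or the policy nodemask for MPOL_BIND / MPOL_INTERLEAVE.  Returns false
 * when the task has no mempolicy, i.e. uses the default policy.
 */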
2015bool init_nodemask_of_mempolicy(nodemask_t *mask)
2016{
2017 struct mempolicy *mempolicy;
2018 int nid;
2019
2020 if (!(mask && current->mempolicy))
2021 return false;
2022
2023 task_lock(current);
2024 mempolicy = current->mempolicy;
2025 switch (mempolicy->mode) {
2026 case MPOL_PREFERRED:
2027 if (mempolicy->flags & MPOL_F_LOCAL)
2028 nid = numa_node_id();
2029 else
2030 nid = mempolicy->v.preferred_node;
2031 init_nodemask_of_node(mask, nid);
2032 break;
2033
2034 case MPOL_BIND:
2035
2036 case MPOL_INTERLEAVE:
2037 *mask = mempolicy->v.nodes;
2038 break;
2039
2040 default:
2041 BUG();
2042 }
2043 task_unlock(current);
2044
2045 return true;
2046}
2047#endif
2048
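/*
 * mempolicy_nodemask_intersects - may @tsk's memory allocations end up on
 * any node in @mask?  Default and MPOL_PREFERRED policies only express a
 * preference and may fall back elsewhere, so they count as intersecting;
 * MPOL_BIND and MPOL_INTERLEAVE test their nodemask against @mask.
 */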
2059bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2060 const nodemask_t *mask)
2061{
2062 struct mempolicy *mempolicy;
2063 bool ret = true;
2064
2065 if (!mask)
2066 return ret;
2067 task_lock(tsk);
2068 mempolicy = tsk->mempolicy;
2069 if (!mempolicy)
2070 goto out;
2071
2072 switch (mempolicy->mode) {
2073 case MPOL_PREFERRED:
2074
2075
2076
2077
2078
2079
2080 break;
2081 case MPOL_BIND:
2082 case MPOL_INTERLEAVE:
2083 ret = nodes_intersects(mempolicy->v.nodes, *mask);
2084 break;
2085 default:
2086 BUG();
2087 }
2088out:
2089 task_unlock(tsk);
2090 return ret;
2091}
2092
2093
2094
2095static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2096 unsigned nid)
2097{
2098 struct page *page;
2099
2100 page = __alloc_pages(gfp, order, nid);
2101
2102 if (!static_branch_likely(&vm_numa_stat_key))
2103 return page;
2104 if (page && page_to_nid(page) == nid) {
2105 preempt_disable();
2106 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2107 preempt_enable();
2108 }
2109 return page;
2110}
2111
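/*
 * alloc_pages_vma - allocate a page for a user mapping, applying the NUMA
 * policy of @vma at @addr (or the task policy / system default).  @node is
 * the node hint for local allocation and @hugepage selects the THP
 * preferred-node behaviour.  The caller must hold mmap_sem for @vma.
 * Returns the allocated page or NULL.
 */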
2135struct page *
2136alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2137 unsigned long addr, int node, bool hugepage)
2138{
2139 struct mempolicy *pol;
2140 struct page *page;
2141 int preferred_nid;
2142 nodemask_t *nmask;
2143
2144 pol = get_vma_policy(vma, addr);
2145
2146 if (pol->mode == MPOL_INTERLEAVE) {
2147 unsigned nid;
2148
2149 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2150 mpol_cond_put(pol);
2151 page = alloc_page_interleave(gfp, order, nid);
2152 goto out;
2153 }
2154
2155 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2156 int hpage_node = node;
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168 if (pol->mode == MPOL_PREFERRED &&
2169 !(pol->flags & MPOL_F_LOCAL))
2170 hpage_node = pol->v.preferred_node;
2171
2172 nmask = policy_nodemask(gfp, pol);
2173 if (!nmask || node_isset(hpage_node, *nmask)) {
2174 mpol_cond_put(pol);
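
			/*
			 * Only pin the allocation to the preferred node with
			 * __GFP_THISNODE when direct reclaim is not allowed:
			 * combining __GFP_THISNODE with direct reclaim could
			 * thrash the local node (heavy reclaim or swapping)
			 * even though other nodes still have free memory.
			 */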
2202 if (!(gfp & __GFP_DIRECT_RECLAIM))
2203 gfp |= __GFP_THISNODE;
2204 page = __alloc_pages_node(hpage_node, gfp, order);
2205 goto out;
2206 }
2207 }
2208
2209 nmask = policy_nodemask(gfp, pol);
2210 preferred_nid = policy_node(gfp, pol, node);
2211 page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2212 mpol_cond_put(pol);
2213out:
2214 return page;
2215}
2216EXPORT_SYMBOL(alloc_pages_vma);
2217
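/*
 * alloc_pages_current - allocate 2^@order pages using the current task's
 * NUMA policy, or the system default policy when in interrupt context or
 * when __GFP_THISNODE is set.  Returns NULL when no page can be allocated.
 */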
2233struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2234{
2235 struct mempolicy *pol = &default_policy;
2236 struct page *page;
2237
2238 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2239 pol = get_task_policy(current);
2240
2241
2242
2243
2244
2245 if (pol->mode == MPOL_INTERLEAVE)
2246 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2247 else
2248 page = __alloc_pages_nodemask(gfp, order,
2249 policy_node(gfp, pol, numa_node_id()),
2250 policy_nodemask(gfp, pol));
2251
2252 return page;
2253}
2254EXPORT_SYMBOL(alloc_pages_current);
2255
2256int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2257{
2258 struct mempolicy *pol = mpol_dup(vma_policy(src));
2259
2260 if (IS_ERR(pol))
2261 return PTR_ERR(pol);
2262 dst->vm_policy = pol;
2263 return 0;
2264}
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278struct mempolicy *__mpol_dup(struct mempolicy *old)
2279{
2280 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2281
2282 if (!new)
2283 return ERR_PTR(-ENOMEM);
2284
2285
2286 if (old == current->mempolicy) {
2287 task_lock(current);
2288 *new = *old;
2289 task_unlock(current);
2290 } else
2291 *new = *old;
2292
2293 if (current_cpuset_is_being_rebound()) {
2294 nodemask_t mems = cpuset_mems_allowed(current);
2295 mpol_rebind_policy(new, &mems);
2296 }
2297 atomic_set(&new->refcnt, 1);
2298 return new;
2299}
2300
2301
2302bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2303{
2304 if (!a || !b)
2305 return false;
2306 if (a->mode != b->mode)
2307 return false;
2308 if (a->flags != b->flags)
2309 return false;
2310 if (mpol_store_user_nodemask(a))
2311 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2312 return false;
2313
2314 switch (a->mode) {
2315 case MPOL_BIND:
2316
2317 case MPOL_INTERLEAVE:
2318 return !!nodes_equal(a->v.nodes, b->v.nodes);
2319 case MPOL_PREFERRED:
2320
2321 if (a->flags & MPOL_F_LOCAL)
2322 return true;
2323 return a->v.preferred_node == b->v.preferred_node;
2324 default:
2325 BUG();
2326 return false;
2327 }
2328}
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343static struct sp_node *
2344sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2345{
2346 struct rb_node *n = sp->root.rb_node;
2347
2348 while (n) {
2349 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2350
2351 if (start >= p->end)
2352 n = n->rb_right;
2353 else if (end <= p->start)
2354 n = n->rb_left;
2355 else
2356 break;
2357 }
2358 if (!n)
2359 return NULL;
2360 for (;;) {
2361 struct sp_node *w = NULL;
2362 struct rb_node *prev = rb_prev(n);
2363 if (!prev)
2364 break;
2365 w = rb_entry(prev, struct sp_node, nd);
2366 if (w->end <= start)
2367 break;
2368 n = prev;
2369 }
2370 return rb_entry(n, struct sp_node, nd);
2371}
2372
2373
2374
2375
2376
2377static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2378{
2379 struct rb_node **p = &sp->root.rb_node;
2380 struct rb_node *parent = NULL;
2381 struct sp_node *nd;
2382
2383 while (*p) {
2384 parent = *p;
2385 nd = rb_entry(parent, struct sp_node, nd);
2386 if (new->start < nd->start)
2387 p = &(*p)->rb_left;
2388 else if (new->end > nd->end)
2389 p = &(*p)->rb_right;
2390 else
2391 BUG();
2392 }
2393 rb_link_node(&new->nd, parent, p);
2394 rb_insert_color(&new->nd, &sp->root);
2395 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2396 new->policy ? new->policy->mode : 0);
2397}
2398
2399
2400struct mempolicy *
2401mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2402{
2403 struct mempolicy *pol = NULL;
2404 struct sp_node *sn;
2405
2406 if (!sp->root.rb_node)
2407 return NULL;
2408 read_lock(&sp->lock);
2409 sn = sp_lookup(sp, idx, idx+1);
2410 if (sn) {
2411 mpol_get(sn->policy);
2412 pol = sn->policy;
2413 }
2414 read_unlock(&sp->lock);
2415 return pol;
2416}
2417
2418static void sp_free(struct sp_node *n)
2419{
2420 mpol_put(n->policy);
2421 kmem_cache_free(sn_cache, n);
2422}
2423
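/*
 * mpol_misplaced - check whether @page sits on the node that the policy
 * for (@vma, @addr) would choose.  Returns -1 (NUMA_NO_NODE) when the page
 * is placed acceptably or should not be migrated, otherwise the node id
 * the page should be moved to.  Called from the NUMA hinting fault path.
 */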
2441int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2442{
2443 struct mempolicy *pol;
2444 struct zoneref *z;
2445 int curnid = page_to_nid(page);
2446 unsigned long pgoff;
2447 int thiscpu = raw_smp_processor_id();
2448 int thisnid = cpu_to_node(thiscpu);
2449 int polnid = NUMA_NO_NODE;
2450 int ret = -1;
2451
2452 pol = get_vma_policy(vma, addr);
2453 if (!(pol->flags & MPOL_F_MOF))
2454 goto out;
2455
2456 switch (pol->mode) {
2457 case MPOL_INTERLEAVE:
2458 pgoff = vma->vm_pgoff;
2459 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2460 polnid = offset_il_node(pol, pgoff);
2461 break;
2462
2463 case MPOL_PREFERRED:
2464 if (pol->flags & MPOL_F_LOCAL)
2465 polnid = numa_node_id();
2466 else
2467 polnid = pol->v.preferred_node;
2468 break;
2469
2470 case MPOL_BIND:
2471
2472
2473
2474
2475
2476
2477
2478 if (node_isset(curnid, pol->v.nodes))
2479 goto out;
2480 z = first_zones_zonelist(
2481 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2482 gfp_zone(GFP_HIGHUSER),
2483 &pol->v.nodes);
2484 polnid = zone_to_nid(z->zone);
2485 break;
2486
2487 default:
2488 BUG();
2489 }
2490
2491
2492 if (pol->flags & MPOL_F_MORON) {
2493 polnid = thisnid;
2494
2495 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2496 goto out;
2497 }
2498
2499 if (curnid != polnid)
2500 ret = polnid;
2501out:
2502 mpol_cond_put(pol);
2503
2504 return ret;
2505}
2506
2507
2508
2509
2510
2511
2512
2513void mpol_put_task_policy(struct task_struct *task)
2514{
2515 struct mempolicy *pol;
2516
2517 task_lock(task);
2518 pol = task->mempolicy;
2519 task->mempolicy = NULL;
2520 task_unlock(task);
2521 mpol_put(pol);
2522}
2523
2524static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2525{
2526 pr_debug("deleting %lx-%lx\n", n->start, n->end);
2527 rb_erase(&n->nd, &sp->root);
2528 sp_free(n);
2529}
2530
2531static void sp_node_init(struct sp_node *node, unsigned long start,
2532 unsigned long end, struct mempolicy *pol)
2533{
2534 node->start = start;
2535 node->end = end;
2536 node->policy = pol;
2537}
2538
2539static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2540 struct mempolicy *pol)
2541{
2542 struct sp_node *n;
2543 struct mempolicy *newpol;
2544
2545 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2546 if (!n)
2547 return NULL;
2548
2549 newpol = mpol_dup(pol);
2550 if (IS_ERR(newpol)) {
2551 kmem_cache_free(sn_cache, n);
2552 return NULL;
2553 }
2554 newpol->flags |= MPOL_F_SHARED;
2555 sp_node_init(n, start, end, newpol);
2556
2557 return n;
2558}
2559
2560
2561static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2562 unsigned long end, struct sp_node *new)
2563{
2564 struct sp_node *n;
2565 struct sp_node *n_new = NULL;
2566 struct mempolicy *mpol_new = NULL;
2567 int ret = 0;
2568
2569restart:
2570 write_lock(&sp->lock);
2571 n = sp_lookup(sp, start, end);
2572
2573 while (n && n->start < end) {
2574 struct rb_node *next = rb_next(&n->nd);
2575 if (n->start >= start) {
2576 if (n->end <= end)
2577 sp_delete(sp, n);
2578 else
2579 n->start = end;
2580 } else {
2581
2582 if (n->end > end) {
2583 if (!n_new)
2584 goto alloc_new;
2585
2586 *mpol_new = *n->policy;
2587 atomic_set(&mpol_new->refcnt, 1);
2588 sp_node_init(n_new, end, n->end, mpol_new);
2589 n->end = start;
2590 sp_insert(sp, n_new);
2591 n_new = NULL;
2592 mpol_new = NULL;
2593 break;
2594 } else
2595 n->end = start;
2596 }
2597 if (!next)
2598 break;
2599 n = rb_entry(next, struct sp_node, nd);
2600 }
2601 if (new)
2602 sp_insert(sp, new);
2603 write_unlock(&sp->lock);
2604 ret = 0;
2605
2606err_out:
2607 if (mpol_new)
2608 mpol_put(mpol_new);
2609 if (n_new)
2610 kmem_cache_free(sn_cache, n_new);
2611
2612 return ret;
2613
2614alloc_new:
2615 write_unlock(&sp->lock);
2616 ret = -ENOMEM;
2617 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2618 if (!n_new)
2619 goto err_out;
2620 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2621 if (!mpol_new)
2622 goto err_out;
2623 goto restart;
2624}
2625
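/*
 * mpol_shared_policy_init - initialize the shared policy tree of an inode
 * and, when @mpol is non-NULL, install it for the whole range through a
 * pseudo VMA.  Consumes the caller's reference on @mpol.
 */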
2636void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2637{
2638 int ret;
2639
2640 sp->root = RB_ROOT;
2641 rwlock_init(&sp->lock);
2642
2643 if (mpol) {
2644 struct vm_area_struct pvma;
2645 struct mempolicy *new;
2646 NODEMASK_SCRATCH(scratch);
2647
2648 if (!scratch)
2649 goto put_mpol;
2650
2651 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2652 if (IS_ERR(new))
2653 goto free_scratch;
2654
2655 task_lock(current);
2656 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2657 task_unlock(current);
2658 if (ret)
2659 goto put_new;
2660
2661
2662 memset(&pvma, 0, sizeof(struct vm_area_struct));
2663 vma_init(&pvma, NULL);
2664 pvma.vm_end = TASK_SIZE;
2665 mpol_set_shared_policy(sp, &pvma, new);
2666
2667put_new:
2668 mpol_put(new);
2669free_scratch:
2670 NODEMASK_SCRATCH_FREE(scratch);
2671put_mpol:
2672 mpol_put(mpol);
2673 }
2674}
2675
2676int mpol_set_shared_policy(struct shared_policy *info,
2677 struct vm_area_struct *vma, struct mempolicy *npol)
2678{
2679 int err;
2680 struct sp_node *new = NULL;
2681 unsigned long sz = vma_pages(vma);
2682
2683 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2684 vma->vm_pgoff,
2685 sz, npol ? npol->mode : -1,
2686 npol ? npol->flags : -1,
2687 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2688
2689 if (npol) {
2690 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2691 if (!new)
2692 return -ENOMEM;
2693 }
2694 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2695 if (err && new)
2696 sp_free(new);
2697 return err;
2698}
2699
2700
2701void mpol_free_shared_policy(struct shared_policy *p)
2702{
2703 struct sp_node *n;
2704 struct rb_node *next;
2705
2706 if (!p->root.rb_node)
2707 return;
2708 write_lock(&p->lock);
2709 next = rb_first(&p->root);
2710 while (next) {
2711 n = rb_entry(next, struct sp_node, nd);
2712 next = rb_next(&n->nd);
2713 sp_delete(p, n);
2714 }
2715 write_unlock(&p->lock);
2716}
2717
2718#ifdef CONFIG_NUMA_BALANCING
2719static int __initdata numabalancing_override;
2720
2721static void __init check_numabalancing_enable(void)
2722{
2723 bool numabalancing_default = false;
2724
2725 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2726 numabalancing_default = true;
2727
2728
2729 if (numabalancing_override)
2730 set_numabalancing_state(numabalancing_override == 1);
2731
2732 if (num_online_nodes() > 1 && !numabalancing_override) {
2733 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2734 numabalancing_default ? "Enabling" : "Disabling");
2735 set_numabalancing_state(numabalancing_default);
2736 }
2737}
2738
2739static int __init setup_numabalancing(char *str)
2740{
2741 int ret = 0;
2742 if (!str)
2743 goto out;
2744
2745 if (!strcmp(str, "enable")) {
2746 numabalancing_override = 1;
2747 ret = 1;
2748 } else if (!strcmp(str, "disable")) {
2749 numabalancing_override = -1;
2750 ret = 1;
2751 }
2752out:
2753 if (!ret)
2754 pr_warn("Unable to parse numa_balancing=\n");
2755
2756 return ret;
2757}
2758__setup("numa_balancing=", setup_numabalancing);
2759#else
2760static inline void __init check_numabalancing_enable(void)
2761{
2762}
2763#endif
2764
2765
2766void __init numa_policy_init(void)
2767{
2768 nodemask_t interleave_nodes;
2769 unsigned long largest = 0;
2770 int nid, prefer = 0;
2771
2772 policy_cache = kmem_cache_create("numa_policy",
2773 sizeof(struct mempolicy),
2774 0, SLAB_PANIC, NULL);
2775
2776 sn_cache = kmem_cache_create("shared_policy_node",
2777 sizeof(struct sp_node),
2778 0, SLAB_PANIC, NULL);
2779
2780 for_each_node(nid) {
2781 preferred_node_policy[nid] = (struct mempolicy) {
2782 .refcnt = ATOMIC_INIT(1),
2783 .mode = MPOL_PREFERRED,
2784 .flags = MPOL_F_MOF | MPOL_F_MORON,
2785 .v = { .preferred_node = nid, },
2786 };
2787 }
2788
2789
2790
2791
2792
2793
2794 nodes_clear(interleave_nodes);
2795 for_each_node_state(nid, N_MEMORY) {
2796 unsigned long total_pages = node_present_pages(nid);
2797
2798
2799 if (largest < total_pages) {
2800 largest = total_pages;
2801 prefer = nid;
2802 }
2803
2804
2805 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2806 node_set(nid, interleave_nodes);
2807 }
2808
2809
2810 if (unlikely(nodes_empty(interleave_nodes)))
2811 node_set(prefer, interleave_nodes);
2812
2813 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2814 pr_err("%s: interleaving failed\n", __func__);
2815
2816 check_numabalancing_enable();
2817}
2818
2819
2820void numa_default_policy(void)
2821{
2822 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2823}
2824
2825
2826
2827
2828
2829
2830
2831
2832static const char * const policy_modes[] =
2833{
2834 [MPOL_DEFAULT] = "default",
2835 [MPOL_PREFERRED] = "prefer",
2836 [MPOL_BIND] = "bind",
2837 [MPOL_INTERLEAVE] = "interleave",
2838 [MPOL_LOCAL] = "local",
2839};
2840
2841
2842#ifdef CONFIG_TMPFS
2843
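/*
 * mpol_parse_str - parse a tmpfs mpol= mount option string into a policy.
 * Format: <mode>[=<flags>][:<nodelist>], e.g. "interleave=static:0-3".
 * On success the new policy is stored in *@mpol and 0 is returned;
 * 1 is returned on any parse error.
 */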
2853int mpol_parse_str(char *str, struct mempolicy **mpol)
2854{
2855 struct mempolicy *new = NULL;
2856 unsigned short mode;
2857 unsigned short mode_flags;
2858 nodemask_t nodes;
2859 char *nodelist = strchr(str, ':');
2860 char *flags = strchr(str, '=');
2861 int err = 1;
2862
2863 if (flags)
2864 *flags++ = '\0';
2865
2866 if (nodelist) {
2867
2868 *nodelist++ = '\0';
2869 if (nodelist_parse(nodelist, nodes))
2870 goto out;
2871 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2872 goto out;
2873 } else
2874 nodes_clear(nodes);
2875
2876 for (mode = 0; mode < MPOL_MAX; mode++) {
2877 if (!strcmp(str, policy_modes[mode])) {
2878 break;
2879 }
2880 }
2881 if (mode >= MPOL_MAX)
2882 goto out;
2883
2884 switch (mode) {
2885 case MPOL_PREFERRED:
2886
2887
2888
2889
2890
2891 if (nodelist) {
2892 char *rest = nodelist;
2893 while (isdigit(*rest))
2894 rest++;
2895 if (*rest)
2896 goto out;
2897 if (nodes_empty(nodes))
2898 goto out;
2899 }
2900 break;
2901 case MPOL_INTERLEAVE:
2902
2903
2904
2905 if (!nodelist)
2906 nodes = node_states[N_MEMORY];
2907 break;
2908 case MPOL_LOCAL:
2909
2910
2911
2912 if (nodelist)
2913 goto out;
2914 mode = MPOL_PREFERRED;
2915 break;
2916 case MPOL_DEFAULT:
2917
2918
2919
2920 if (!nodelist)
2921 err = 0;
2922 goto out;
2923 case MPOL_BIND:
2924
2925
2926
2927 if (!nodelist)
2928 goto out;
2929 }
2930
2931 mode_flags = 0;
2932 if (flags) {
2933
2934
2935
2936
2937 if (!strcmp(flags, "static"))
2938 mode_flags |= MPOL_F_STATIC_NODES;
2939 else if (!strcmp(flags, "relative"))
2940 mode_flags |= MPOL_F_RELATIVE_NODES;
2941 else
2942 goto out;
2943 }
2944
2945 new = mpol_new(mode, mode_flags, &nodes);
2946 if (IS_ERR(new))
2947 goto out;
2948
2949
2950
2951
2952
2953 if (mode != MPOL_PREFERRED)
2954 new->v.nodes = nodes;
2955 else if (nodelist)
2956 new->v.preferred_node = first_node(nodes);
2957 else
2958 new->flags |= MPOL_F_LOCAL;
2959
2960
2961
2962
2963
2964 new->w.user_nodemask = nodes;
2965
2966 err = 0;
2967
2968out:
2969
2970 if (nodelist)
2971 *--nodelist = ':';
2972 if (flags)
2973 *--flags = '=';
2974 if (!err)
2975 *mpol = new;
2976 return err;
2977}
2978#endif
2979
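/*
 * mpol_to_str - format @pol into @buffer (at most @maxlen bytes) using the
 * same <mode>[=<flags>][:<nodelist>] syntax that mpol_parse_str() accepts.
 * The output is truncated if @buffer is too small; a few dozen bytes are
 * enough in practice.
 */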
2990void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2991{
2992 char *p = buffer;
2993 nodemask_t nodes = NODE_MASK_NONE;
2994 unsigned short mode = MPOL_DEFAULT;
2995 unsigned short flags = 0;
2996
2997 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2998 mode = pol->mode;
2999 flags = pol->flags;
3000 }
3001
3002 switch (mode) {
3003 case MPOL_DEFAULT:
3004 break;
3005 case MPOL_PREFERRED:
3006 if (flags & MPOL_F_LOCAL)
3007 mode = MPOL_LOCAL;
3008 else
3009 node_set(pol->v.preferred_node, nodes);
3010 break;
3011 case MPOL_BIND:
3012 case MPOL_INTERLEAVE:
3013 nodes = pol->v.nodes;
3014 break;
3015 default:
3016 WARN_ON_ONCE(1);
3017 snprintf(p, maxlen, "unknown");
3018 return;
3019 }
3020
3021 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3022
3023 if (flags & MPOL_MODE_FLAGS) {
3024 p += snprintf(p, buffer + maxlen - p, "=");
3025
3026
3027
3028
3029 if (flags & MPOL_F_STATIC_NODES)
3030 p += snprintf(p, buffer + maxlen - p, "static");
3031 else if (flags & MPOL_F_RELATIVE_NODES)
3032 p += snprintf(p, buffer + maxlen - p, "relative");
3033 }
3034
3035 if (!nodes_empty(nodes))
3036 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3037 nodemask_pr_args(&nodes));
3038}
3039