/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * NUMA policies allow user space to give hints about which node(s)
 * memory should be allocated on, per process (set_mempolicy) or per
 * VMA (mbind).  Supported policy modes:
 *
 * preferred	Try a specific node first before normal fallback.
 * bind		Allocate memory only on a specific set of nodes, no fallback.
 * interleave	Allocate memory interleaved over a set of nodes.
 * default	Allocate on the local node first.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone.  An allocation for a zone below this is not policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/**
 * numa_map_to_online_node - Find closest online node
 * @node: Node id to start the search
 *
 * Lookup the closest online node by distance if @node is not online.
 */
int numa_map_to_online_node(int node)
{
	int min_node;

	if (node == NUMA_NO_NODE)
		node = 0;

	min_node = node;
	if (!node_online(node)) {
		int min_dist = INT_MAX, dist, n;

		for_each_online_node(n) {
			dist = node_distance(node, n);
			if (dist < min_dist) {
				min_dist = dist;
				min_node = n;
			}
		}
	}

	return min_node;
}
EXPORT_SYMBOL_GPL(numa_map_to_online_node);

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}
178
179static const struct mempolicy_operations {
180 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
181 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
182} mpol_ops[MPOL_MAX];
183
184static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
185{
186 return pol->flags & MPOL_MODE_FLAGS;
187}
188
189static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
190 const nodemask_t *rel)
191{
192 nodemask_t tmp;
193 nodes_fold(tmp, *orig, nodes_weight(*rel));
194 nodes_onto(*ret, tmp, *rel);
195}
196
197static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
198{
199 if (nodes_empty(*nodes))
200 return -EINVAL;
201 pol->v.nodes = *nodes;
202 return 0;
203}
204
205static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
206{
207 if (!nodes)
208 pol->flags |= MPOL_F_LOCAL;
209 else if (nodes_empty(*nodes))
210 return -EINVAL;
211 else
212 pol->v.preferred_node = first_node(*nodes);
213 return 0;
214}
215
216static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
217{
218 if (nodes_empty(*nodes))
219 return -EINVAL;
220 pol->v.nodes = *nodes;
221 return 0;
222}
223
224
225
226
227
228
229
230
231
232
233static int mpol_set_nodemask(struct mempolicy *pol,
234 const nodemask_t *nodes, struct nodemask_scratch *nsc)
235{
236 int ret;
237
238
239 if (pol == NULL)
240 return 0;
241
242 nodes_and(nsc->mask1,
243 cpuset_current_mems_allowed, node_states[N_MEMORY]);
244
245 VM_BUG_ON(!nodes);
246 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
247 nodes = NULL;
248 else {
249 if (pol->flags & MPOL_F_RELATIVE_NODES)
250 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
251 else
252 nodes_and(nsc->mask2, *nodes, nsc->mask1);
253
254 if (mpol_store_user_nodemask(pol))
255 pol->w.user_nodemask = *nodes;
256 else
257 pol->w.cpuset_mems_allowed =
258 cpuset_current_mems_allowed;
259 }
260
261 if (nodes)
262 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
263 else
264 ret = mpol_ops[pol->mode].create(pol, NULL);
265 return ret;
266}
267
268
269
270
271
272static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
273 nodemask_t *nodes)
274{
275 struct mempolicy *policy;
276
277 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
278 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
279
280 if (mode == MPOL_DEFAULT) {
281 if (nodes && !nodes_empty(*nodes))
282 return ERR_PTR(-EINVAL);
283 return NULL;
284 }
285 VM_BUG_ON(!nodes);
286
287
288
289
290
291
292 if (mode == MPOL_PREFERRED) {
293 if (nodes_empty(*nodes)) {
294 if (((flags & MPOL_F_STATIC_NODES) ||
295 (flags & MPOL_F_RELATIVE_NODES)))
296 return ERR_PTR(-EINVAL);
297 }
298 } else if (mode == MPOL_LOCAL) {
299 if (!nodes_empty(*nodes) ||
300 (flags & MPOL_F_STATIC_NODES) ||
301 (flags & MPOL_F_RELATIVE_NODES))
302 return ERR_PTR(-EINVAL);
303 mode = MPOL_PREFERRED;
304 } else if (nodes_empty(*nodes))
305 return ERR_PTR(-EINVAL);
306 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
307 if (!policy)
308 return ERR_PTR(-ENOMEM);
309 atomic_set(&policy->refcnt, 1);
310 policy->mode = mode;
311 policy->flags = flags;
312
313 return policy;
314}
315
316
317void __mpol_put(struct mempolicy *p)
318{
319 if (!atomic_dec_and_test(&p->refcnt))
320 return;
321 kmem_cache_free(policy_cache, p);
322}
323
324static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
325{
326}
327
328static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
329{
330 nodemask_t tmp;
331
332 if (pol->flags & MPOL_F_STATIC_NODES)
333 nodes_and(tmp, pol->w.user_nodemask, *nodes);
334 else if (pol->flags & MPOL_F_RELATIVE_NODES)
335 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
336 else {
337 nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
338 *nodes);
339 pol->w.cpuset_mems_allowed = *nodes;
340 }
341
342 if (nodes_empty(tmp))
343 tmp = *nodes;
344
345 pol->v.nodes = tmp;
346}
347
348static void mpol_rebind_preferred(struct mempolicy *pol,
349 const nodemask_t *nodes)
350{
351 nodemask_t tmp;
352
353 if (pol->flags & MPOL_F_STATIC_NODES) {
354 int node = first_node(pol->w.user_nodemask);
355
356 if (node_isset(node, *nodes)) {
357 pol->v.preferred_node = node;
358 pol->flags &= ~MPOL_F_LOCAL;
359 } else
360 pol->flags |= MPOL_F_LOCAL;
361 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
362 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
363 pol->v.preferred_node = first_node(tmp);
364 } else if (!(pol->flags & MPOL_F_LOCAL)) {
365 pol->v.preferred_node = node_remap(pol->v.preferred_node,
366 pol->w.cpuset_mems_allowed,
367 *nodes);
368 pol->w.cpuset_mems_allowed = *nodes;
369 }
370}
371
372
373
374
375
376
377
378
379static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
380{
381 if (!pol)
382 return;
383 if (!mpol_store_user_nodemask(pol) &&
384 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
385 return;
386
387 mpol_ops[pol->mode].rebind(pol, newmask);
388}
389
390
391
392
393
394
395
396
397void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
398{
399 mpol_rebind_policy(tsk->mempolicy, new);
400}
401
402
403
404
405
406
407
408void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
409{
410 struct vm_area_struct *vma;
411
412 mmap_write_lock(mm);
413 for (vma = mm->mmap; vma; vma = vma->vm_next)
414 mpol_rebind_policy(vma->vm_policy, new);
415 mmap_write_unlock(mm);
416}
417
418static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
419 [MPOL_DEFAULT] = {
420 .rebind = mpol_rebind_default,
421 },
422 [MPOL_INTERLEAVE] = {
423 .create = mpol_new_interleave,
424 .rebind = mpol_rebind_nodemask,
425 },
426 [MPOL_PREFERRED] = {
427 .create = mpol_new_preferred,
428 .rebind = mpol_rebind_preferred,
429 },
430 [MPOL_BIND] = {
431 .create = mpol_new_bind,
432 .rebind = mpol_rebind_nodemask,
433 },
434};
435
436static int migrate_page_add(struct page *page, struct list_head *pagelist,
437 unsigned long flags);
438
439struct queue_pages {
440 struct list_head *pagelist;
441 unsigned long flags;
442 nodemask_t *nmask;
443 struct vm_area_struct *prev;
444};
445
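/*
 * queue_pages_required() checks whether the page's node is valid in the
 * destination nodemask @qp->nmask, or inverts the check when MPOL_MF_INVERT
 * is set in @qp->flags.
 */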
452static inline bool queue_pages_required(struct page *page,
453 struct queue_pages *qp)
454{
455 int nid = page_to_nid(page);
456 unsigned long flags = qp->flags;
457
458 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
459}
460
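/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */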
471static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
472 unsigned long end, struct mm_walk *walk)
473{
474 int ret = 0;
475 struct page *page;
476 struct queue_pages *qp = walk->private;
477 unsigned long flags;
478
479 if (unlikely(is_pmd_migration_entry(*pmd))) {
480 ret = -EIO;
481 goto unlock;
482 }
483 page = pmd_page(*pmd);
484 if (is_huge_zero_page(page)) {
485 spin_unlock(ptl);
486 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
487 ret = 2;
488 goto out;
489 }
490 if (!queue_pages_required(page, qp))
491 goto unlock;
492
493 flags = qp->flags;
494
495 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
496 if (!vma_migratable(walk->vma) ||
497 migrate_page_add(page, qp->pagelist, flags)) {
498 ret = 1;
499 goto unlock;
500 }
501 } else
502 ret = -EIO;
503unlock:
504 spin_unlock(ptl);
505out:
506 return ret;
507}
508
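/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */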
520static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
521 unsigned long end, struct mm_walk *walk)
522{
523 struct vm_area_struct *vma = walk->vma;
524 struct page *page;
525 struct queue_pages *qp = walk->private;
526 unsigned long flags = qp->flags;
527 int ret;
528 bool has_unmovable = false;
529 pte_t *pte, *mapped_pte;
530 spinlock_t *ptl;
531
532 ptl = pmd_trans_huge_lock(pmd, vma);
533 if (ptl) {
534 ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
535 if (ret != 2)
536 return ret;
537 }
538
539
540 if (pmd_trans_unstable(pmd))
541 return 0;
542
543 mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
544 for (; addr != end; pte++, addr += PAGE_SIZE) {
545 if (!pte_present(*pte))
546 continue;
547 page = vm_normal_page(vma, addr, *pte);
548 if (!page)
549 continue;
550
551
552
553
554 if (PageReserved(page))
555 continue;
556 if (!queue_pages_required(page, qp))
557 continue;
558 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
559
560 if (!vma_migratable(vma)) {
561 has_unmovable = true;
562 break;
563 }
564
565
566
567
568
569
570 if (migrate_page_add(page, qp->pagelist, flags))
571 has_unmovable = true;
572 } else
573 break;
574 }
575 pte_unmap_unlock(mapped_pte, ptl);
576 cond_resched();
577
578 if (has_unmovable)
579 return 1;
580
581 return addr != end ? -EIO : 0;
582}
583
584static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
585 unsigned long addr, unsigned long end,
586 struct mm_walk *walk)
587{
588#ifdef CONFIG_HUGETLB_PAGE
589 struct queue_pages *qp = walk->private;
590 unsigned long flags = qp->flags;
591 struct page *page;
592 spinlock_t *ptl;
593 pte_t entry;
594
595 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
596 entry = huge_ptep_get(pte);
597 if (!pte_present(entry))
598 goto unlock;
599 page = pte_page(entry);
600 if (!queue_pages_required(page, qp))
601 goto unlock;
602
603 if (flags & (MPOL_MF_MOVE_ALL) ||
604 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
605 isolate_huge_page(page, qp->pagelist);
606unlock:
607 spin_unlock(ptl);
608#else
609 BUG();
610#endif
611 return 0;
612}
613
614#ifdef CONFIG_NUMA_BALANCING
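/*
 * This is used to mark a range of virtual addresses as inaccessible.
 * These are later cleared by a NUMA hinting fault.  Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This assumes that NUMA faults are handled using PROT_NONE.  If an
 * architecture makes a different choice, it will need further changes
 * to the core.
 */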
624unsigned long change_prot_numa(struct vm_area_struct *vma,
625 unsigned long addr, unsigned long end)
626{
627 int nr_updated;
628
629 nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
630 if (nr_updated)
631 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
632
633 return nr_updated;
634}
635#else
636static unsigned long change_prot_numa(struct vm_area_struct *vma,
637 unsigned long addr, unsigned long end)
638{
639 return 0;
640}
641#endif
642
643static int queue_pages_test_walk(unsigned long start, unsigned long end,
644 struct mm_walk *walk)
645{
646 struct vm_area_struct *vma = walk->vma;
647 struct queue_pages *qp = walk->private;
648 unsigned long endvma = vma->vm_end;
649 unsigned long flags = qp->flags;
650
651
652 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
653 if (!vma->vm_next && vma->vm_end < end)
654 return -EFAULT;
655 if (qp->prev && qp->prev->vm_end < vma->vm_start)
656 return -EFAULT;
657 }
658
659 qp->prev = vma;
660
661
662
663
664
665 if (!vma_migratable(vma) &&
666 !(flags & MPOL_MF_STRICT))
667 return 1;
668
669 if (endvma > end)
670 endvma = end;
671 if (vma->vm_start > start)
672 start = vma->vm_start;
673
674 if (flags & MPOL_MF_LAZY) {
675
676 if (!is_vm_hugetlb_page(vma) &&
677 (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
678 !(vma->vm_flags & VM_MIXEDMAP))
679 change_prot_numa(vma, start, endvma);
680 return 1;
681 }
682
683
684 if (flags & MPOL_MF_VALID)
685 return 0;
686 return 1;
687}
688
689static const struct mm_walk_ops queue_pages_walk_ops = {
690 .hugetlb_entry = queue_pages_hugetlb,
691 .pmd_entry = queue_pages_pte_range,
692 .test_walk = queue_pages_test_walk,
693};
694
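/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to @pagelist.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - e.g. misplaced pages with MPOL_MF_STRICT specified (-EIO), or
 *         the range is not fully covered by vmas (-EFAULT).
 */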
710static int
711queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
712 nodemask_t *nodes, unsigned long flags,
713 struct list_head *pagelist)
714{
715 struct queue_pages qp = {
716 .pagelist = pagelist,
717 .flags = flags,
718 .nmask = nodes,
719 .prev = NULL,
720 };
721
722 return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
723}
724
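/*
 * Apply policy to a single VMA.
 * This must be called with the mmap_lock held for writing.
 */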
729static int vma_replace_policy(struct vm_area_struct *vma,
730 struct mempolicy *pol)
731{
732 int err;
733 struct mempolicy *old;
734 struct mempolicy *new;
735
736 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
737 vma->vm_start, vma->vm_end, vma->vm_pgoff,
738 vma->vm_ops, vma->vm_file,
739 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
740
741 new = mpol_dup(pol);
742 if (IS_ERR(new))
743 return PTR_ERR(new);
744
745 if (vma->vm_ops && vma->vm_ops->set_policy) {
746 err = vma->vm_ops->set_policy(vma, new);
747 if (err)
748 goto err_out;
749 }
750
751 old = vma->vm_policy;
752 vma->vm_policy = new;
753 mpol_put(old);
754
755 return 0;
756 err_out:
757 mpol_put(new);
758 return err;
759}
760
761
762static int mbind_range(struct mm_struct *mm, unsigned long start,
763 unsigned long end, struct mempolicy *new_pol)
764{
765 struct vm_area_struct *next;
766 struct vm_area_struct *prev;
767 struct vm_area_struct *vma;
768 int err = 0;
769 pgoff_t pgoff;
770 unsigned long vmstart;
771 unsigned long vmend;
772
773 vma = find_vma(mm, start);
774 if (!vma || vma->vm_start > start)
775 return -EFAULT;
776
777 prev = vma->vm_prev;
778 if (start > vma->vm_start)
779 prev = vma;
780
781 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
782 next = vma->vm_next;
783 vmstart = max(start, vma->vm_start);
784 vmend = min(end, vma->vm_end);
785
786 if (mpol_equal(vma_policy(vma), new_pol))
787 continue;
788
789 pgoff = vma->vm_pgoff +
790 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
791 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
792 vma->anon_vma, vma->vm_file, pgoff,
793 new_pol, vma->vm_userfaultfd_ctx);
794 if (prev) {
795 vma = prev;
796 next = vma->vm_next;
797 if (mpol_equal(vma_policy(vma), new_pol))
798 continue;
799
800 goto replace;
801 }
802 if (vma->vm_start != vmstart) {
803 err = split_vma(vma->vm_mm, vma, vmstart, 1);
804 if (err)
805 goto out;
806 }
807 if (vma->vm_end != vmend) {
808 err = split_vma(vma->vm_mm, vma, vmend, 0);
809 if (err)
810 goto out;
811 }
812 replace:
813 err = vma_replace_policy(vma, new_pol);
814 if (err)
815 goto out;
816 }
817
818 out:
819 return err;
820}
821
822
823static long do_set_mempolicy(unsigned short mode, unsigned short flags,
824 nodemask_t *nodes)
825{
826 struct mempolicy *new, *old;
827 NODEMASK_SCRATCH(scratch);
828 int ret;
829
830 if (!scratch)
831 return -ENOMEM;
832
833 new = mpol_new(mode, flags, nodes);
834 if (IS_ERR(new)) {
835 ret = PTR_ERR(new);
836 goto out;
837 }
838
839 task_lock(current);
840 ret = mpol_set_nodemask(new, nodes, scratch);
841 if (ret) {
842 task_unlock(current);
843 mpol_put(new);
844 goto out;
845 }
846 old = current->mempolicy;
847 current->mempolicy = new;
848 if (new && new->mode == MPOL_INTERLEAVE)
849 current->il_prev = MAX_NUMNODES-1;
850 task_unlock(current);
851 mpol_put(old);
852 ret = 0;
853out:
854 NODEMASK_SCRATCH_FREE(scratch);
855 return ret;
856}
857
858
859
860
861
862
863static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
864{
865 nodes_clear(*nodes);
866 if (p == &default_policy)
867 return;
868
869 switch (p->mode) {
870 case MPOL_BIND:
871
872 case MPOL_INTERLEAVE:
873 *nodes = p->v.nodes;
874 break;
875 case MPOL_PREFERRED:
876 if (!(p->flags & MPOL_F_LOCAL))
877 node_set(p->v.preferred_node, *nodes);
878
879 break;
880 default:
881 BUG();
882 }
883}
884
885static int lookup_node(struct mm_struct *mm, unsigned long addr)
886{
887 struct page *p = NULL;
888 int err;
889
890 int locked = 1;
891 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
892 if (err > 0) {
893 err = page_to_nid(p);
894 put_page(p);
895 }
896 if (locked)
897 mmap_read_unlock(mm);
898 return err;
899}
900
901
902static long do_get_mempolicy(int *policy, nodemask_t *nmask,
903 unsigned long addr, unsigned long flags)
904{
905 int err;
906 struct mm_struct *mm = current->mm;
907 struct vm_area_struct *vma = NULL;
908 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
909
910 if (flags &
911 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
912 return -EINVAL;
913
914 if (flags & MPOL_F_MEMS_ALLOWED) {
915 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
916 return -EINVAL;
917 *policy = 0;
918 task_lock(current);
919 *nmask = cpuset_current_mems_allowed;
920 task_unlock(current);
921 return 0;
922 }
923
924 if (flags & MPOL_F_ADDR) {
925
926
927
928
929
930 mmap_read_lock(mm);
931 vma = find_vma_intersection(mm, addr, addr+1);
932 if (!vma) {
933 mmap_read_unlock(mm);
934 return -EFAULT;
935 }
936 if (vma->vm_ops && vma->vm_ops->get_policy)
937 pol = vma->vm_ops->get_policy(vma, addr);
938 else
939 pol = vma->vm_policy;
940 } else if (addr)
941 return -EINVAL;
942
943 if (!pol)
944 pol = &default_policy;
945
946 if (flags & MPOL_F_NODE) {
947 if (flags & MPOL_F_ADDR) {
948
949
950
951
952
953
954 pol_refcount = pol;
955 vma = NULL;
956 mpol_get(pol);
957 err = lookup_node(mm, addr);
958 if (err < 0)
959 goto out;
960 *policy = err;
961 } else if (pol == current->mempolicy &&
962 pol->mode == MPOL_INTERLEAVE) {
963 *policy = next_node_in(current->il_prev, pol->v.nodes);
964 } else {
965 err = -EINVAL;
966 goto out;
967 }
968 } else {
969 *policy = pol == &default_policy ? MPOL_DEFAULT :
970 pol->mode;
971
972
973
974
975 *policy |= (pol->flags & MPOL_MODE_FLAGS);
976 }
977
978 err = 0;
979 if (nmask) {
980 if (mpol_store_user_nodemask(pol)) {
981 *nmask = pol->w.user_nodemask;
982 } else {
983 task_lock(current);
984 get_policy_nodemask(pol, nmask);
985 task_unlock(current);
986 }
987 }
988
989 out:
990 mpol_cond_put(pol);
991 if (vma)
992 mmap_read_unlock(mm);
993 if (pol_refcount)
994 mpol_put(pol_refcount);
995 return err;
996}
997
998#ifdef CONFIG_MIGRATION
999
1000
1001
1002static int migrate_page_add(struct page *page, struct list_head *pagelist,
1003 unsigned long flags)
1004{
1005 struct page *head = compound_head(page);
1006
1007
1008
1009 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
1010 if (!isolate_lru_page(head)) {
1011 list_add_tail(&head->lru, pagelist);
1012 mod_node_page_state(page_pgdat(head),
1013 NR_ISOLATED_ANON + page_is_file_lru(head),
1014 thp_nr_pages(head));
1015 } else if (flags & MPOL_MF_STRICT) {
1016
1017
1018
1019
1020
1021
1022
1023 return -EIO;
1024 }
1025 }
1026
1027 return 0;
1028}
1029
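/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */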
1034static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1035 int flags)
1036{
1037 nodemask_t nmask;
1038 LIST_HEAD(pagelist);
1039 int err = 0;
1040 struct migration_target_control mtc = {
1041 .nid = dest,
1042 .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
1043 };
1044
1045 nodes_clear(nmask);
1046 node_set(source, nmask);
1047
1048
1049
1050
1051
1052
1053 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1054 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1055 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1056
1057 if (!list_empty(&pagelist)) {
1058 err = migrate_pages(&pagelist, alloc_migration_target, NULL,
1059 (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
1060 if (err)
1061 putback_movable_pages(&pagelist);
1062 }
1063
1064 return err;
1065}
1066
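/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */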
1073int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1074 const nodemask_t *to, int flags)
1075{
1076 int busy = 0;
1077 int err;
1078 nodemask_t tmp;
1079
1080 err = migrate_prep();
1081 if (err)
1082 return err;
1083
1084 mmap_read_lock(mm);
1085
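	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and migrate that <source, dest> pair of nodes.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fall back to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, that node migrates to
	 * itself, so no pages need to move.
	 *
	 * Preferring a dest node that is not also a remaining source node
	 * minimizes the risk of overloading the memory of a node before
	 * its own outgoing pages have been migrated away.
	 */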
1117 tmp = *from;
1118 while (!nodes_empty(tmp)) {
1119 int s,d;
1120 int source = NUMA_NO_NODE;
1121 int dest = 0;
1122
1123 for_each_node_mask(s, tmp) {
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1141 (node_isset(s, *to)))
1142 continue;
1143
1144 d = node_remap(s, *from, *to);
1145 if (s == d)
1146 continue;
1147
1148 source = s;
1149 dest = d;
1150
1151
1152 if (!node_isset(dest, tmp))
1153 break;
1154 }
1155 if (source == NUMA_NO_NODE)
1156 break;
1157
1158 node_clear(source, tmp);
1159 err = migrate_to_node(mm, source, dest, flags);
1160 if (err > 0)
1161 busy += err;
1162 if (err < 0)
1163 break;
1164 }
1165 mmap_read_unlock(mm);
1166 if (err < 0)
1167 return err;
1168 return busy;
1169
1170}
1171
1172
1173
1174
1175
1176
1177
1178
1179static struct page *new_page(struct page *page, unsigned long start)
1180{
1181 struct vm_area_struct *vma;
1182 unsigned long uninitialized_var(address);
1183
1184 vma = find_vma(current->mm, start);
1185 while (vma) {
1186 address = page_address_in_vma(page, vma);
1187 if (address != -EFAULT)
1188 break;
1189 vma = vma->vm_next;
1190 }
1191
1192 if (PageHuge(page)) {
1193 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1194 vma, address);
1195 } else if (PageTransHuge(page)) {
1196 struct page *thp;
1197
1198 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1199 HPAGE_PMD_ORDER);
1200 if (!thp)
1201 return NULL;
1202 prep_transhuge_page(thp);
1203 return thp;
1204 }
1205
1206
1207
1208 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1209 vma, address);
1210}
1211#else
1212
1213static int migrate_page_add(struct page *page, struct list_head *pagelist,
1214 unsigned long flags)
1215{
1216 return -EIO;
1217}
1218
1219int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1220 const nodemask_t *to, int flags)
1221{
1222 return -ENOSYS;
1223}
1224
1225static struct page *new_page(struct page *page, unsigned long start)
1226{
1227 return NULL;
1228}
1229#endif
1230
1231static long do_mbind(unsigned long start, unsigned long len,
1232 unsigned short mode, unsigned short mode_flags,
1233 nodemask_t *nmask, unsigned long flags)
1234{
1235 struct mm_struct *mm = current->mm;
1236 struct mempolicy *new;
1237 unsigned long end;
1238 int err;
1239 int ret;
1240 LIST_HEAD(pagelist);
1241
1242 if (flags & ~(unsigned long)MPOL_MF_VALID)
1243 return -EINVAL;
1244 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1245 return -EPERM;
1246
1247 if (start & ~PAGE_MASK)
1248 return -EINVAL;
1249
1250 if (mode == MPOL_DEFAULT)
1251 flags &= ~MPOL_MF_STRICT;
1252
1253 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1254 end = start + len;
1255
1256 if (end < start)
1257 return -EINVAL;
1258 if (end == start)
1259 return 0;
1260
1261 new = mpol_new(mode, mode_flags, nmask);
1262 if (IS_ERR(new))
1263 return PTR_ERR(new);
1264
1265 if (flags & MPOL_MF_LAZY)
1266 new->flags |= MPOL_F_MOF;
1267
1268
1269
1270
1271
1272 if (!new)
1273 flags |= MPOL_MF_DISCONTIG_OK;
1274
1275 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1276 start, start + len, mode, mode_flags,
1277 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1278
1279 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1280
1281 err = migrate_prep();
1282 if (err)
1283 goto mpol_out;
1284 }
1285 {
1286 NODEMASK_SCRATCH(scratch);
1287 if (scratch) {
1288 mmap_write_lock(mm);
1289 task_lock(current);
1290 err = mpol_set_nodemask(new, nmask, scratch);
1291 task_unlock(current);
1292 if (err)
1293 mmap_write_unlock(mm);
1294 } else
1295 err = -ENOMEM;
1296 NODEMASK_SCRATCH_FREE(scratch);
1297 }
1298 if (err)
1299 goto mpol_out;
1300
1301 ret = queue_pages_range(mm, start, end, nmask,
1302 flags | MPOL_MF_INVERT, &pagelist);
1303
1304 if (ret < 0) {
1305 err = ret;
1306 goto up_out;
1307 }
1308
1309 err = mbind_range(mm, start, end, new);
1310
1311 if (!err) {
1312 int nr_failed = 0;
1313
1314 if (!list_empty(&pagelist)) {
1315 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1316 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1317 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1318 if (nr_failed)
1319 putback_movable_pages(&pagelist);
1320 }
1321
1322 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1323 err = -EIO;
1324 } else {
1325up_out:
1326 if (!list_empty(&pagelist))
1327 putback_movable_pages(&pagelist);
1328 }
1329
1330 mmap_write_unlock(mm);
1331mpol_out:
1332 mpol_put(new);
1333 return err;
1334}
1335
1336
1337
1338
1339
1340
1341static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1342 unsigned long maxnode)
1343{
1344 unsigned long k;
1345 unsigned long t;
1346 unsigned long nlongs;
1347 unsigned long endmask;
1348
1349 --maxnode;
1350 nodes_clear(*nodes);
1351 if (maxnode == 0 || !nmask)
1352 return 0;
1353 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1354 return -EINVAL;
1355
1356 nlongs = BITS_TO_LONGS(maxnode);
1357 if ((maxnode % BITS_PER_LONG) == 0)
1358 endmask = ~0UL;
1359 else
1360 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1372 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1373 if (get_user(t, nmask + k))
1374 return -EFAULT;
1375 if (k == nlongs - 1) {
1376 if (t & endmask)
1377 return -EINVAL;
1378 } else if (t)
1379 return -EINVAL;
1380 }
1381 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1382 endmask = ~0UL;
1383 }
1384
1385 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1386 unsigned long valid_mask = endmask;
1387
1388 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1389 if (get_user(t, nmask + nlongs - 1))
1390 return -EFAULT;
1391 if (t & valid_mask)
1392 return -EINVAL;
1393 }
1394
1395 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1396 return -EFAULT;
1397 nodes_addr(*nodes)[nlongs-1] &= endmask;
1398 return 0;
1399}
1400
1401
1402static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1403 nodemask_t *nodes)
1404{
1405 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1406 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1407
1408 if (copy > nbytes) {
1409 if (copy > PAGE_SIZE)
1410 return -EINVAL;
1411 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1412 return -EFAULT;
1413 copy = nbytes;
1414 }
1415 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1416}
1417
1418static long kernel_mbind(unsigned long start, unsigned long len,
1419 unsigned long mode, const unsigned long __user *nmask,
1420 unsigned long maxnode, unsigned int flags)
1421{
1422 nodemask_t nodes;
1423 int err;
1424 unsigned short mode_flags;
1425
1426 start = untagged_addr(start);
1427 mode_flags = mode & MPOL_MODE_FLAGS;
1428 mode &= ~MPOL_MODE_FLAGS;
1429 if (mode >= MPOL_MAX)
1430 return -EINVAL;
1431 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1432 (mode_flags & MPOL_F_RELATIVE_NODES))
1433 return -EINVAL;
1434 err = get_nodes(&nodes, nmask, maxnode);
1435 if (err)
1436 return err;
1437 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1438}
1439
1440SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1441 unsigned long, mode, const unsigned long __user *, nmask,
1442 unsigned long, maxnode, unsigned int, flags)
1443{
1444 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1445}
1446
1447
1448static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1449 unsigned long maxnode)
1450{
1451 int err;
1452 nodemask_t nodes;
1453 unsigned short flags;
1454
1455 flags = mode & MPOL_MODE_FLAGS;
1456 mode &= ~MPOL_MODE_FLAGS;
1457 if ((unsigned int)mode >= MPOL_MAX)
1458 return -EINVAL;
1459 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1460 return -EINVAL;
1461 err = get_nodes(&nodes, nmask, maxnode);
1462 if (err)
1463 return err;
1464 return do_set_mempolicy(mode, flags, &nodes);
1465}
1466
1467SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1468 unsigned long, maxnode)
1469{
1470 return kernel_set_mempolicy(mode, nmask, maxnode);
1471}
1472
1473static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1474 const unsigned long __user *old_nodes,
1475 const unsigned long __user *new_nodes)
1476{
1477 struct mm_struct *mm = NULL;
1478 struct task_struct *task;
1479 nodemask_t task_nodes;
1480 int err;
1481 nodemask_t *old;
1482 nodemask_t *new;
1483 NODEMASK_SCRATCH(scratch);
1484
1485 if (!scratch)
1486 return -ENOMEM;
1487
1488 old = &scratch->mask1;
1489 new = &scratch->mask2;
1490
1491 err = get_nodes(old, old_nodes, maxnode);
1492 if (err)
1493 goto out;
1494
1495 err = get_nodes(new, new_nodes, maxnode);
1496 if (err)
1497 goto out;
1498
1499
1500 rcu_read_lock();
1501 task = pid ? find_task_by_vpid(pid) : current;
1502 if (!task) {
1503 rcu_read_unlock();
1504 err = -ESRCH;
1505 goto out;
1506 }
1507 get_task_struct(task);
1508
1509 err = -EINVAL;
1510
1511
1512
1513
1514
1515 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1516 rcu_read_unlock();
1517 err = -EPERM;
1518 goto out_put;
1519 }
1520 rcu_read_unlock();
1521
1522 task_nodes = cpuset_mems_allowed(task);
1523
1524 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1525 err = -EPERM;
1526 goto out_put;
1527 }
1528
1529 task_nodes = cpuset_mems_allowed(current);
1530 nodes_and(*new, *new, task_nodes);
1531 if (nodes_empty(*new))
1532 goto out_put;
1533
1534 nodes_and(*new, *new, node_states[N_MEMORY]);
1535 if (nodes_empty(*new))
1536 goto out_put;
1537
1538 err = security_task_movememory(task);
1539 if (err)
1540 goto out_put;
1541
1542 mm = get_task_mm(task);
1543 put_task_struct(task);
1544
1545 if (!mm) {
1546 err = -EINVAL;
1547 goto out;
1548 }
1549
1550 err = do_migrate_pages(mm, old, new,
1551 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1552
1553 mmput(mm);
1554out:
1555 NODEMASK_SCRATCH_FREE(scratch);
1556
1557 return err;
1558
1559out_put:
1560 put_task_struct(task);
1561 goto out;
1562
1563}
1564
1565SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1566 const unsigned long __user *, old_nodes,
1567 const unsigned long __user *, new_nodes)
1568{
1569 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1570}
1571
1572
1573
1574static int kernel_get_mempolicy(int __user *policy,
1575 unsigned long __user *nmask,
1576 unsigned long maxnode,
1577 unsigned long addr,
1578 unsigned long flags)
1579{
1580 int err;
1581 int uninitialized_var(pval);
1582 nodemask_t nodes;
1583
1584 addr = untagged_addr(addr);
1585
1586 if (nmask != NULL && maxnode < nr_node_ids)
1587 return -EINVAL;
1588
1589 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1590
1591 if (err)
1592 return err;
1593
1594 if (policy && put_user(pval, policy))
1595 return -EFAULT;
1596
1597 if (nmask)
1598 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1599
1600 return err;
1601}
1602
1603SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1604 unsigned long __user *, nmask, unsigned long, maxnode,
1605 unsigned long, addr, unsigned long, flags)
1606{
1607 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1608}
1609
1610#ifdef CONFIG_COMPAT
1611
1612COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1613 compat_ulong_t __user *, nmask,
1614 compat_ulong_t, maxnode,
1615 compat_ulong_t, addr, compat_ulong_t, flags)
1616{
1617 long err;
1618 unsigned long __user *nm = NULL;
1619 unsigned long nr_bits, alloc_size;
1620 DECLARE_BITMAP(bm, MAX_NUMNODES);
1621
1622 nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1623 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1624
1625 if (nmask)
1626 nm = compat_alloc_user_space(alloc_size);
1627
1628 err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1629
1630 if (!err && nmask) {
1631 unsigned long copy_size;
1632 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1633 err = copy_from_user(bm, nm, copy_size);
1634
1635 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1636 err |= compat_put_bitmap(nmask, bm, nr_bits);
1637 }
1638
1639 return err;
1640}
1641
1642COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1643 compat_ulong_t, maxnode)
1644{
1645 unsigned long __user *nm = NULL;
1646 unsigned long nr_bits, alloc_size;
1647 DECLARE_BITMAP(bm, MAX_NUMNODES);
1648
1649 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1650 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1651
1652 if (nmask) {
1653 if (compat_get_bitmap(bm, nmask, nr_bits))
1654 return -EFAULT;
1655 nm = compat_alloc_user_space(alloc_size);
1656 if (copy_to_user(nm, bm, alloc_size))
1657 return -EFAULT;
1658 }
1659
1660 return kernel_set_mempolicy(mode, nm, nr_bits+1);
1661}
1662
1663COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1664 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1665 compat_ulong_t, maxnode, compat_ulong_t, flags)
1666{
1667 unsigned long __user *nm = NULL;
1668 unsigned long nr_bits, alloc_size;
1669 nodemask_t bm;
1670
1671 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1672 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1673
1674 if (nmask) {
1675 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1676 return -EFAULT;
1677 nm = compat_alloc_user_space(alloc_size);
1678 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1679 return -EFAULT;
1680 }
1681
1682 return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1683}
1684
1685COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1686 compat_ulong_t, maxnode,
1687 const compat_ulong_t __user *, old_nodes,
1688 const compat_ulong_t __user *, new_nodes)
1689{
1690 unsigned long __user *old = NULL;
1691 unsigned long __user *new = NULL;
1692 nodemask_t tmp_mask;
1693 unsigned long nr_bits;
1694 unsigned long size;
1695
1696 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1697 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1698 if (old_nodes) {
1699 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1700 return -EFAULT;
1701 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1702 if (new_nodes)
1703 new = old + size / sizeof(unsigned long);
1704 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1705 return -EFAULT;
1706 }
1707 if (new_nodes) {
1708 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1709 return -EFAULT;
1710 if (new == NULL)
1711 new = compat_alloc_user_space(size);
1712 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1713 return -EFAULT;
1714 }
1715 return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1716}
1717
1718#endif
1719
1720struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1721 unsigned long addr)
1722{
1723 struct mempolicy *pol = NULL;
1724
1725 if (vma) {
1726 if (vma->vm_ops && vma->vm_ops->get_policy) {
1727 pol = vma->vm_ops->get_policy(vma, addr);
1728 } else if (vma->vm_policy) {
1729 pol = vma->vm_policy;
1730
1731
1732
1733
1734
1735
1736
1737 if (mpol_needs_cond_ref(pol))
1738 mpol_get(pol);
1739 }
1740 }
1741
1742 return pol;
1743}
1744
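/*
 * get_vma_policy(@vma, @addr)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */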
1757static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1758 unsigned long addr)
1759{
1760 struct mempolicy *pol = __get_vma_policy(vma, addr);
1761
1762 if (!pol)
1763 pol = get_task_policy(current);
1764
1765 return pol;
1766}
1767
1768bool vma_policy_mof(struct vm_area_struct *vma)
1769{
1770 struct mempolicy *pol;
1771
1772 if (vma->vm_ops && vma->vm_ops->get_policy) {
1773 bool ret = false;
1774
1775 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1776 if (pol && (pol->flags & MPOL_F_MOF))
1777 ret = true;
1778 mpol_cond_put(pol);
1779
1780 return ret;
1781 }
1782
1783 pol = vma->vm_policy;
1784 if (!pol)
1785 pol = get_task_policy(current);
1786
1787 return pol->flags & MPOL_F_MOF;
1788}
1789
1790static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1791{
1792 enum zone_type dynamic_policy_zone = policy_zone;
1793
1794 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1805 dynamic_policy_zone = ZONE_MOVABLE;
1806
1807 return zone >= dynamic_policy_zone;
1808}
1809
1810
1811
1812
1813
1814static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1815{
1816
1817 if (unlikely(policy->mode == MPOL_BIND) &&
1818 apply_policy_zone(policy, gfp_zone(gfp)) &&
1819 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1820 return &policy->v.nodes;
1821
1822 return NULL;
1823}
1824
1825
1826static int policy_node(gfp_t gfp, struct mempolicy *policy,
1827 int nd)
1828{
1829 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1830 nd = policy->v.preferred_node;
1831 else {
1832
1833
1834
1835
1836
1837 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1838 }
1839
1840 return nd;
1841}
1842
1843
1844static unsigned interleave_nodes(struct mempolicy *policy)
1845{
1846 unsigned next;
1847 struct task_struct *me = current;
1848
1849 next = next_node_in(me->il_prev, policy->v.nodes);
1850 if (next < MAX_NUMNODES)
1851 me->il_prev = next;
1852 return next;
1853}
1854
1855
1856
1857
1858
1859unsigned int mempolicy_slab_node(void)
1860{
1861 struct mempolicy *policy;
1862 int node = numa_mem_id();
1863
1864 if (in_interrupt())
1865 return node;
1866
1867 policy = current->mempolicy;
1868 if (!policy || policy->flags & MPOL_F_LOCAL)
1869 return node;
1870
1871 switch (policy->mode) {
1872 case MPOL_PREFERRED:
1873
1874
1875
1876 return policy->v.preferred_node;
1877
1878 case MPOL_INTERLEAVE:
1879 return interleave_nodes(policy);
1880
1881 case MPOL_BIND: {
1882 struct zoneref *z;
1883
1884
1885
1886
1887
1888 struct zonelist *zonelist;
1889 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1890 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1891 z = first_zones_zonelist(zonelist, highest_zoneidx,
1892 &policy->v.nodes);
1893 return z->zone ? zone_to_nid(z->zone) : node;
1894 }
1895
1896 default:
1897 BUG();
1898 }
1899}
1900
1901
1902
1903
1904
1905
1906static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1907{
1908 unsigned nnodes = nodes_weight(pol->v.nodes);
1909 unsigned target;
1910 int i;
1911 int nid;
1912
1913 if (!nnodes)
1914 return numa_node_id();
1915 target = (unsigned int)n % nnodes;
1916 nid = first_node(pol->v.nodes);
1917 for (i = 0; i < target; i++)
1918 nid = next_node(nid, pol->v.nodes);
1919 return nid;
1920}
1921
1922
1923static inline unsigned interleave_nid(struct mempolicy *pol,
1924 struct vm_area_struct *vma, unsigned long addr, int shift)
1925{
1926 if (vma) {
1927 unsigned long off;
1928
1929
1930
1931
1932
1933
1934
1935
1936 BUG_ON(shift < PAGE_SHIFT);
1937 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1938 off += (addr - vma->vm_start) >> shift;
1939 return offset_il_node(pol, off);
1940 } else
1941 return interleave_nodes(pol);
1942}
1943
1944#ifdef CONFIG_HUGETLBFS
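/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is MPOL_BIND, returns a pointer to the
 * mempolicy's @nodemask for filtering the zonelist.
 */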
1960int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1961 struct mempolicy **mpol, nodemask_t **nodemask)
1962{
1963 int nid;
1964
1965 *mpol = get_vma_policy(vma, addr);
1966 *nodemask = NULL;
1967
1968 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1969 nid = interleave_nid(*mpol, vma, addr,
1970 huge_page_shift(hstate_vma(vma)));
1971 } else {
1972 nid = policy_node(gfp_flags, *mpol, numa_node_id());
1973 if ((*mpol)->mode == MPOL_BIND)
1974 *nodemask = &(*mpol)->v.nodes;
1975 }
1976 return nid;
1977}
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995bool init_nodemask_of_mempolicy(nodemask_t *mask)
1996{
1997 struct mempolicy *mempolicy;
1998 int nid;
1999
2000 if (!(mask && current->mempolicy))
2001 return false;
2002
2003 task_lock(current);
2004 mempolicy = current->mempolicy;
2005 switch (mempolicy->mode) {
2006 case MPOL_PREFERRED:
2007 if (mempolicy->flags & MPOL_F_LOCAL)
2008 nid = numa_node_id();
2009 else
2010 nid = mempolicy->v.preferred_node;
2011 init_nodemask_of_node(mask, nid);
2012 break;
2013
2014 case MPOL_BIND:
2015
2016 case MPOL_INTERLEAVE:
2017 *mask = mempolicy->v.nodes;
2018 break;
2019
2020 default:
2021 BUG();
2022 }
2023 task_unlock(current);
2024
2025 return true;
2026}
2027#endif
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2040 const nodemask_t *mask)
2041{
2042 struct mempolicy *mempolicy;
2043 bool ret = true;
2044
2045 if (!mask)
2046 return ret;
2047 task_lock(tsk);
2048 mempolicy = tsk->mempolicy;
2049 if (!mempolicy)
2050 goto out;
2051
2052 switch (mempolicy->mode) {
2053 case MPOL_PREFERRED:
2054
2055
2056
2057
2058
2059
2060 break;
2061 case MPOL_BIND:
2062 case MPOL_INTERLEAVE:
2063 ret = nodes_intersects(mempolicy->v.nodes, *mask);
2064 break;
2065 default:
2066 BUG();
2067 }
2068out:
2069 task_unlock(tsk);
2070 return ret;
2071}
2072
2073
2074
2075static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2076 unsigned nid)
2077{
2078 struct page *page;
2079
2080 page = __alloc_pages(gfp, order, nid);
2081
2082 if (!static_branch_likely(&vm_numa_stat_key))
2083 return page;
2084 if (page && page_to_nid(page) == nid) {
2085 preempt_disable();
2086 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2087 preempt_enable();
2088 }
2089 return page;
2090}
2091
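/**
 * alloc_pages_vma - Allocate a page for a VMA.
 *
 * @gfp:      GFP flags.
 * @order:    Order of the GFP allocation.
 * @vma:      Pointer to VMA or NULL if not available.
 * @addr:     Virtual address of the allocation.  Must be inside @vma.
 * @node:     Which node to prefer for allocation (modulo policy).
 * @hugepage: For hugepages try only the preferred node if possible.
 *
 * Allocate a page for a specific address in @vma, using the appropriate
 * NUMA policy.  When @vma is not NULL the caller must hold the mmap_lock
 * of the mm_struct of the VMA to prevent it from going away.  Should be
 * used for all allocations for pages that will be mapped into user space.
 *
 * Return: The page on success or NULL if allocation fails.
 */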
2115struct page *
2116alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2117 unsigned long addr, int node, bool hugepage)
2118{
2119 struct mempolicy *pol;
2120 struct page *page;
2121 int preferred_nid;
2122 nodemask_t *nmask;
2123
2124 pol = get_vma_policy(vma, addr);
2125
2126 if (pol->mode == MPOL_INTERLEAVE) {
2127 unsigned nid;
2128
2129 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2130 mpol_cond_put(pol);
2131 page = alloc_page_interleave(gfp, order, nid);
2132 goto out;
2133 }
2134
2135 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2136 int hpage_node = node;
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148 if (pol->mode == MPOL_PREFERRED &&
2149 !(pol->flags & MPOL_F_LOCAL))
2150 hpage_node = pol->v.preferred_node;
2151
2152 nmask = policy_nodemask(gfp, pol);
2153 if (!nmask || node_isset(hpage_node, *nmask)) {
2154 mpol_cond_put(pol);
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182 if (!(gfp & __GFP_DIRECT_RECLAIM))
2183 gfp |= __GFP_THISNODE;
2184 page = __alloc_pages_node(hpage_node, gfp, order);
2185 goto out;
2186 }
2187 }
2188
2189 nmask = policy_nodemask(gfp, pol);
2190 preferred_nid = policy_node(gfp, pol, node);
2191 page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2192 mpol_cond_put(pol);
2193out:
2194 return page;
2195}
2196EXPORT_SYMBOL(alloc_pages_vma);
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2214{
2215 struct mempolicy *pol = &default_policy;
2216 struct page *page;
2217
2218 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2219 pol = get_task_policy(current);
2220
2221
2222
2223
2224
2225 if (pol->mode == MPOL_INTERLEAVE)
2226 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2227 else
2228 page = __alloc_pages_nodemask(gfp, order,
2229 policy_node(gfp, pol, numa_node_id()),
2230 policy_nodemask(gfp, pol));
2231
2232 return page;
2233}
2234EXPORT_SYMBOL(alloc_pages_current);
2235
2236int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2237{
2238 struct mempolicy *pol = mpol_dup(vma_policy(src));
2239
2240 if (IS_ERR(pol))
2241 return PTR_ERR(pol);
2242 dst->vm_policy = pol;
2243 return 0;
2244}
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258struct mempolicy *__mpol_dup(struct mempolicy *old)
2259{
2260 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2261
2262 if (!new)
2263 return ERR_PTR(-ENOMEM);
2264
2265
2266 if (old == current->mempolicy) {
2267 task_lock(current);
2268 *new = *old;
2269 task_unlock(current);
2270 } else
2271 *new = *old;
2272
2273 if (current_cpuset_is_being_rebound()) {
2274 nodemask_t mems = cpuset_mems_allowed(current);
2275 mpol_rebind_policy(new, &mems);
2276 }
2277 atomic_set(&new->refcnt, 1);
2278 return new;
2279}
2280
2281
2282bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2283{
2284 if (!a || !b)
2285 return false;
2286 if (a->mode != b->mode)
2287 return false;
2288 if (a->flags != b->flags)
2289 return false;
2290 if (mpol_store_user_nodemask(a))
2291 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2292 return false;
2293
2294 switch (a->mode) {
2295 case MPOL_BIND:
2296
2297 case MPOL_INTERLEAVE:
2298 return !!nodes_equal(a->v.nodes, b->v.nodes);
2299 case MPOL_PREFERRED:
2300
2301 if (a->flags & MPOL_F_LOCAL)
2302 return true;
2303 return a->v.preferred_node == b->v.preferred_node;
2304 default:
2305 BUG();
2306 return false;
2307 }
2308}
2309
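/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * reading or for writing.
 */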
2323static struct sp_node *
2324sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2325{
2326 struct rb_node *n = sp->root.rb_node;
2327
2328 while (n) {
2329 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2330
2331 if (start >= p->end)
2332 n = n->rb_right;
2333 else if (end <= p->start)
2334 n = n->rb_left;
2335 else
2336 break;
2337 }
2338 if (!n)
2339 return NULL;
2340 for (;;) {
2341 struct sp_node *w = NULL;
2342 struct rb_node *prev = rb_prev(n);
2343 if (!prev)
2344 break;
2345 w = rb_entry(prev, struct sp_node, nd);
2346 if (w->end <= start)
2347 break;
2348 n = prev;
2349 }
2350 return rb_entry(n, struct sp_node, nd);
2351}
2352
2353
2354
2355
2356
2357static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2358{
2359 struct rb_node **p = &sp->root.rb_node;
2360 struct rb_node *parent = NULL;
2361 struct sp_node *nd;
2362
2363 while (*p) {
2364 parent = *p;
2365 nd = rb_entry(parent, struct sp_node, nd);
2366 if (new->start < nd->start)
2367 p = &(*p)->rb_left;
2368 else if (new->end > nd->end)
2369 p = &(*p)->rb_right;
2370 else
2371 BUG();
2372 }
2373 rb_link_node(&new->nd, parent, p);
2374 rb_insert_color(&new->nd, &sp->root);
2375 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2376 new->policy ? new->policy->mode : 0);
2377}
2378
2379
2380struct mempolicy *
2381mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2382{
2383 struct mempolicy *pol = NULL;
2384 struct sp_node *sn;
2385
2386 if (!sp->root.rb_node)
2387 return NULL;
2388 read_lock(&sp->lock);
2389 sn = sp_lookup(sp, idx, idx+1);
2390 if (sn) {
2391 mpol_get(sn->policy);
2392 pol = sn->policy;
2393 }
2394 read_unlock(&sp->lock);
2395 return pol;
2396}
2397
2398static void sp_free(struct sp_node *n)
2399{
2400 mpol_put(n->policy);
2401 kmem_cache_free(sn_cache, n);
2402}
2403
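/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.
 *
 * Returns:
 *	-1	- not misplaced, page is in the right node
 *	node	- node id where the page should be
 *
 * Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 */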
2421int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2422{
2423 struct mempolicy *pol;
2424 struct zoneref *z;
2425 int curnid = page_to_nid(page);
2426 unsigned long pgoff;
2427 int thiscpu = raw_smp_processor_id();
2428 int thisnid = cpu_to_node(thiscpu);
2429 int polnid = NUMA_NO_NODE;
2430 int ret = -1;
2431
2432 pol = get_vma_policy(vma, addr);
2433 if (!(pol->flags & MPOL_F_MOF))
2434 goto out;
2435
2436 switch (pol->mode) {
2437 case MPOL_INTERLEAVE:
2438 pgoff = vma->vm_pgoff;
2439 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2440 polnid = offset_il_node(pol, pgoff);
2441 break;
2442
2443 case MPOL_PREFERRED:
2444 if (pol->flags & MPOL_F_LOCAL)
2445 polnid = numa_node_id();
2446 else
2447 polnid = pol->v.preferred_node;
2448 break;
2449
2450 case MPOL_BIND:
2451
2452
2453
2454
2455
2456
2457
2458 if (node_isset(curnid, pol->v.nodes))
2459 goto out;
2460 z = first_zones_zonelist(
2461 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2462 gfp_zone(GFP_HIGHUSER),
2463 &pol->v.nodes);
2464 polnid = zone_to_nid(z->zone);
2465 break;
2466
2467 default:
2468 BUG();
2469 }
2470
2471
2472 if (pol->flags & MPOL_F_MORON) {
2473 polnid = thisnid;
2474
2475 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2476 goto out;
2477 }
2478
2479 if (curnid != polnid)
2480 ret = polnid;
2481out:
2482 mpol_cond_put(pol);
2483
2484 return ret;
2485}
2486
2487
2488
2489
2490
2491
2492
2493void mpol_put_task_policy(struct task_struct *task)
2494{
2495 struct mempolicy *pol;
2496
2497 task_lock(task);
2498 pol = task->mempolicy;
2499 task->mempolicy = NULL;
2500 task_unlock(task);
2501 mpol_put(pol);
2502}
2503
2504static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2505{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2507 rb_erase(&n->nd, &sp->root);
2508 sp_free(n);
2509}
2510
2511static void sp_node_init(struct sp_node *node, unsigned long start,
2512 unsigned long end, struct mempolicy *pol)
2513{
2514 node->start = start;
2515 node->end = end;
2516 node->policy = pol;
2517}
2518
2519static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2520 struct mempolicy *pol)
2521{
2522 struct sp_node *n;
2523 struct mempolicy *newpol;
2524
2525 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2526 if (!n)
2527 return NULL;
2528
2529 newpol = mpol_dup(pol);
2530 if (IS_ERR(newpol)) {
2531 kmem_cache_free(sn_cache, n);
2532 return NULL;
2533 }
2534 newpol->flags |= MPOL_F_SHARED;
2535 sp_node_init(n, start, end, newpol);
2536
2537 return n;
2538}
2539
2540
2541static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2542 unsigned long end, struct sp_node *new)
2543{
2544 struct sp_node *n;
2545 struct sp_node *n_new = NULL;
2546 struct mempolicy *mpol_new = NULL;
2547 int ret = 0;
2548
2549restart:
2550 write_lock(&sp->lock);
2551 n = sp_lookup(sp, start, end);
2552
2553 while (n && n->start < end) {
2554 struct rb_node *next = rb_next(&n->nd);
2555 if (n->start >= start) {
2556 if (n->end <= end)
2557 sp_delete(sp, n);
2558 else
2559 n->start = end;
2560 } else {
2561
2562 if (n->end > end) {
2563 if (!n_new)
2564 goto alloc_new;
2565
2566 *mpol_new = *n->policy;
2567 atomic_set(&mpol_new->refcnt, 1);
2568 sp_node_init(n_new, end, n->end, mpol_new);
2569 n->end = start;
2570 sp_insert(sp, n_new);
2571 n_new = NULL;
2572 mpol_new = NULL;
2573 break;
2574 } else
2575 n->end = start;
2576 }
2577 if (!next)
2578 break;
2579 n = rb_entry(next, struct sp_node, nd);
2580 }
2581 if (new)
2582 sp_insert(sp, new);
2583 write_unlock(&sp->lock);
2584 ret = 0;
2585
2586err_out:
2587 if (mpol_new)
2588 mpol_put(mpol_new);
2589 if (n_new)
2590 kmem_cache_free(sn_cache, n_new);
2591
2592 return ret;
2593
2594alloc_new:
2595 write_unlock(&sp->lock);
2596 ret = -ENOMEM;
2597 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2598 if (!n_new)
2599 goto err_out;
2600 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2601 if (!mpol_new)
2602 goto err_out;
2603 goto restart;
2604}
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2617{
2618 int ret;
2619
2620 sp->root = RB_ROOT;
2621 rwlock_init(&sp->lock);
2622
2623 if (mpol) {
2624 struct vm_area_struct pvma;
2625 struct mempolicy *new;
2626 NODEMASK_SCRATCH(scratch);
2627
2628 if (!scratch)
2629 goto put_mpol;
2630
2631 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2632 if (IS_ERR(new))
2633 goto free_scratch;
2634
2635 task_lock(current);
2636 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2637 task_unlock(current);
2638 if (ret)
2639 goto put_new;
2640
2641
2642 memset(&pvma, 0, sizeof(struct vm_area_struct));
2643 vma_init(&pvma, NULL);
2644 pvma.vm_end = TASK_SIZE;
2645 mpol_set_shared_policy(sp, &pvma, new);
2646
2647put_new:
2648 mpol_put(new);
2649free_scratch:
2650 NODEMASK_SCRATCH_FREE(scratch);
2651put_mpol:
2652 mpol_put(mpol);
2653 }
2654}
2655
2656int mpol_set_shared_policy(struct shared_policy *info,
2657 struct vm_area_struct *vma, struct mempolicy *npol)
2658{
2659 int err;
2660 struct sp_node *new = NULL;
2661 unsigned long sz = vma_pages(vma);
2662
2663 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2664 vma->vm_pgoff,
2665 sz, npol ? npol->mode : -1,
2666 npol ? npol->flags : -1,
2667 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2668
2669 if (npol) {
2670 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2671 if (!new)
2672 return -ENOMEM;
2673 }
2674 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2675 if (err && new)
2676 sp_free(new);
2677 return err;
2678}
2679
2680
2681void mpol_free_shared_policy(struct shared_policy *p)
2682{
2683 struct sp_node *n;
2684 struct rb_node *next;
2685
2686 if (!p->root.rb_node)
2687 return;
2688 write_lock(&p->lock);
2689 next = rb_first(&p->root);
2690 while (next) {
2691 n = rb_entry(next, struct sp_node, nd);
2692 next = rb_next(&n->nd);
2693 sp_delete(p, n);
2694 }
2695 write_unlock(&p->lock);
2696}
2697
2698#ifdef CONFIG_NUMA_BALANCING
2699static int __initdata numabalancing_override;
2700
2701static void __init check_numabalancing_enable(void)
2702{
2703 bool numabalancing_default = false;
2704
2705 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2706 numabalancing_default = true;
2707
2708
2709 if (numabalancing_override)
2710 set_numabalancing_state(numabalancing_override == 1);
2711
2712 if (num_online_nodes() > 1 && !numabalancing_override) {
2713 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2714 numabalancing_default ? "Enabling" : "Disabling");
2715 set_numabalancing_state(numabalancing_default);
2716 }
2717}
2718
2719static int __init setup_numabalancing(char *str)
2720{
2721 int ret = 0;
2722 if (!str)
2723 goto out;
2724
2725 if (!strcmp(str, "enable")) {
2726 numabalancing_override = 1;
2727 ret = 1;
2728 } else if (!strcmp(str, "disable")) {
2729 numabalancing_override = -1;
2730 ret = 1;
2731 }
2732out:
2733 if (!ret)
2734 pr_warn("Unable to parse numa_balancing=\n");
2735
2736 return ret;
2737}
2738__setup("numa_balancing=", setup_numabalancing);
2739#else
2740static inline void __init check_numabalancing_enable(void)
2741{
2742}
2743#endif
2744
2745
2746void __init numa_policy_init(void)
2747{
2748 nodemask_t interleave_nodes;
2749 unsigned long largest = 0;
2750 int nid, prefer = 0;
2751
2752 policy_cache = kmem_cache_create("numa_policy",
2753 sizeof(struct mempolicy),
2754 0, SLAB_PANIC, NULL);
2755
2756 sn_cache = kmem_cache_create("shared_policy_node",
2757 sizeof(struct sp_node),
2758 0, SLAB_PANIC, NULL);
2759
2760 for_each_node(nid) {
2761 preferred_node_policy[nid] = (struct mempolicy) {
2762 .refcnt = ATOMIC_INIT(1),
2763 .mode = MPOL_PREFERRED,
2764 .flags = MPOL_F_MOF | MPOL_F_MORON,
2765 .v = { .preferred_node = nid, },
2766 };
2767 }
2768
2769
2770
2771
2772
2773
2774 nodes_clear(interleave_nodes);
2775 for_each_node_state(nid, N_MEMORY) {
2776 unsigned long total_pages = node_present_pages(nid);
2777
2778
2779 if (largest < total_pages) {
2780 largest = total_pages;
2781 prefer = nid;
2782 }
2783
2784
2785 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2786 node_set(nid, interleave_nodes);
2787 }
2788
2789
2790 if (unlikely(nodes_empty(interleave_nodes)))
2791 node_set(prefer, interleave_nodes);
2792
2793 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2794 pr_err("%s: interleaving failed\n", __func__);
2795
2796 check_numabalancing_enable();
2797}
2798
2799
2800void numa_default_policy(void)
2801{
2802 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2803}
2804
2805
2806
2807
2808
2809
2810
2811
2812static const char * const policy_modes[] =
2813{
2814 [MPOL_DEFAULT] = "default",
2815 [MPOL_PREFERRED] = "prefer",
2816 [MPOL_BIND] = "bind",
2817 [MPOL_INTERLEAVE] = "interleave",
2818 [MPOL_LOCAL] = "local",
2819};
2820
2821
2822#ifdef CONFIG_TMPFS
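/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * On success, returns 0, else 1.
 */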
2833int mpol_parse_str(char *str, struct mempolicy **mpol)
2834{
2835 struct mempolicy *new = NULL;
2836 unsigned short mode;
2837 unsigned short mode_flags;
2838 nodemask_t nodes;
2839 char *nodelist = strchr(str, ':');
2840 char *flags = strchr(str, '=');
2841 int err = 1;
2842
2843 if (flags)
2844 *flags++ = '\0';
2845
2846 if (nodelist) {
2847
2848 *nodelist++ = '\0';
2849 if (nodelist_parse(nodelist, nodes))
2850 goto out;
2851 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2852 goto out;
2853 } else
2854 nodes_clear(nodes);
2855
2856 for (mode = 0; mode < MPOL_MAX; mode++) {
2857 if (!strcmp(str, policy_modes[mode])) {
2858 break;
2859 }
2860 }
2861 if (mode >= MPOL_MAX)
2862 goto out;
2863
2864 switch (mode) {
2865 case MPOL_PREFERRED:
2866
2867
2868
2869
2870
2871 if (nodelist) {
2872 char *rest = nodelist;
2873 while (isdigit(*rest))
2874 rest++;
2875 if (*rest)
2876 goto out;
2877 if (nodes_empty(nodes))
2878 goto out;
2879 }
2880 break;
2881 case MPOL_INTERLEAVE:
2882
2883
2884
2885 if (!nodelist)
2886 nodes = node_states[N_MEMORY];
2887 break;
2888 case MPOL_LOCAL:
2889
2890
2891
2892 if (nodelist)
2893 goto out;
2894 mode = MPOL_PREFERRED;
2895 break;
2896 case MPOL_DEFAULT:
2897
2898
2899
2900 if (!nodelist)
2901 err = 0;
2902 goto out;
2903 case MPOL_BIND:
2904
2905
2906
2907 if (!nodelist)
2908 goto out;
2909 }
2910
2911 mode_flags = 0;
2912 if (flags) {
2913
2914
2915
2916
2917 if (!strcmp(flags, "static"))
2918 mode_flags |= MPOL_F_STATIC_NODES;
2919 else if (!strcmp(flags, "relative"))
2920 mode_flags |= MPOL_F_RELATIVE_NODES;
2921 else
2922 goto out;
2923 }
2924
2925 new = mpol_new(mode, mode_flags, &nodes);
2926 if (IS_ERR(new))
2927 goto out;
2928
2929
2930
2931
2932
2933 if (mode != MPOL_PREFERRED)
2934 new->v.nodes = nodes;
2935 else if (nodelist)
2936 new->v.preferred_node = first_node(nodes);
2937 else
2938 new->flags |= MPOL_F_LOCAL;
2939
2940
2941
2942
2943
2944 new->w.user_nodemask = nodes;
2945
2946 err = 0;
2947
2948out:
2949
2950 if (nodelist)
2951 *--nodelist = ':';
2952 if (flags)
2953 *--flags = '=';
2954 if (!err)
2955 *mpol = new;
2956 return err;
2957}
2958#endif
2959
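/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", plus
 * the longest flag, "relative", and to display at least a few node ids.
 */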
2970void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2971{
2972 char *p = buffer;
2973 nodemask_t nodes = NODE_MASK_NONE;
2974 unsigned short mode = MPOL_DEFAULT;
2975 unsigned short flags = 0;
2976
2977 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2978 mode = pol->mode;
2979 flags = pol->flags;
2980 }
2981
2982 switch (mode) {
2983 case MPOL_DEFAULT:
2984 break;
2985 case MPOL_PREFERRED:
2986 if (flags & MPOL_F_LOCAL)
2987 mode = MPOL_LOCAL;
2988 else
2989 node_set(pol->v.preferred_node, nodes);
2990 break;
2991 case MPOL_BIND:
2992 case MPOL_INTERLEAVE:
2993 nodes = pol->v.nodes;
2994 break;
2995 default:
2996 WARN_ON_ONCE(1);
2997 snprintf(p, maxlen, "unknown");
2998 return;
2999 }
3000
3001 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
3002
3003 if (flags & MPOL_MODE_FLAGS) {
3004 p += snprintf(p, buffer + maxlen - p, "=");
3005
3006
3007
3008
3009 if (flags & MPOL_F_STATIC_NODES)
3010 p += snprintf(p, buffer + maxlen - p, "static");
3011 else if (flags & MPOL_F_RELATIVE_NODES)
3012 p += snprintf(p, buffer + maxlen - p, "relative");
3013 }
3014
3015 if (!nodes_empty(nodes))
3016 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
3017 nodemask_pr_args(&nodes));
3018}
3019