/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.  Supported policies:
 *
 * interleave	Allocate memory interleaved over a set of nodes, with normal
 *		fallback if it fails.  For VMA based allocations the offset
 *		into the backing object or VMA is used to determine the node.
 * bind		Only allocate memory on a specific set of nodes, no fallback.
 * preferred	Try a specific node first before normal fallback.
 * default	Allocate on the local node first.
 *
 * Per-VMA policies, set with mbind(), take priority over the process policy
 * for the pages they cover.  Policies for shared memory objects are kept in
 * a red-black tree attached to the inode (see the shared policy support at
 * the bottom of this file).
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <linux/uaccess.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone.  A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];
129
130struct mempolicy *get_task_policy(struct task_struct *p)
131{
132 struct mempolicy *pol = p->mempolicy;
133 int node;
134
135 if (pol)
136 return pol;
137
138 node = numa_node_id();
139 if (node != NUMA_NO_NODE) {
140 pol = &preferred_node_policy[node];
141
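		/* preferred_node_policy[] is not set up until numa_policy_init() */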
142 if (pol->mode)
143 return pol;
144 }
145
146 return &default_policy;
147}
148
149static const struct mempolicy_operations {
150 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
151 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
152} mpol_ops[MPOL_MAX];
153
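/*
 * Return true if the policy carries a user-supplied nodemask that must be
 * remembered across cpuset rebinds, i.e. MPOL_F_STATIC_NODES or
 * MPOL_F_RELATIVE_NODES was specified.
 */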
154static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
155{
156 return pol->flags & MPOL_MODE_FLAGS;
157}
158
159static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
160 const nodemask_t *rel)
161{
162 nodemask_t tmp;
163 nodes_fold(tmp, *orig, nodes_weight(*rel));
164 nodes_onto(*ret, tmp, *rel);
165}
166
167static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
168{
169 if (nodes_empty(*nodes))
170 return -EINVAL;
171 pol->v.nodes = *nodes;
172 return 0;
173}
174
175static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
176{
177 if (!nodes)
178 pol->flags |= MPOL_F_LOCAL;
179 else if (nodes_empty(*nodes))
180 return -EINVAL;
181 else
182 pol->v.preferred_node = first_node(*nodes);
183 return 0;
184}
185
186static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
187{
188 if (nodes_empty(*nodes))
189 return -EINVAL;
190 pol->v.nodes = *nodes;
191 return 0;
192}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
203static int mpol_set_nodemask(struct mempolicy *pol,
204 const nodemask_t *nodes, struct nodemask_scratch *nsc)
205{
206 int ret;
207
208
209 if (pol == NULL)
210 return 0;
211
212 nodes_and(nsc->mask1,
213 cpuset_current_mems_allowed, node_states[N_MEMORY]);
214
215 VM_BUG_ON(!nodes);
216 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
217 nodes = NULL;
218 else {
219 if (pol->flags & MPOL_F_RELATIVE_NODES)
220 mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
221 else
222 nodes_and(nsc->mask2, *nodes, nsc->mask1);
223
224 if (mpol_store_user_nodemask(pol))
225 pol->w.user_nodemask = *nodes;
226 else
227 pol->w.cpuset_mems_allowed =
228 cpuset_current_mems_allowed;
229 }
230
231 if (nodes)
232 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
233 else
234 ret = mpol_ops[pol->mode].create(pol, NULL);
235 return ret;
236}

/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
242static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
243 nodemask_t *nodes)
244{
245 struct mempolicy *policy;
246
247 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
248 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
249
250 if (mode == MPOL_DEFAULT) {
251 if (nodes && !nodes_empty(*nodes))
252 return ERR_PTR(-EINVAL);
253 return NULL;
254 }
255 VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
262 if (mode == MPOL_PREFERRED) {
263 if (nodes_empty(*nodes)) {
264 if (((flags & MPOL_F_STATIC_NODES) ||
265 (flags & MPOL_F_RELATIVE_NODES)))
266 return ERR_PTR(-EINVAL);
267 }
268 } else if (mode == MPOL_LOCAL) {
269 if (!nodes_empty(*nodes) ||
270 (flags & MPOL_F_STATIC_NODES) ||
271 (flags & MPOL_F_RELATIVE_NODES))
272 return ERR_PTR(-EINVAL);
273 mode = MPOL_PREFERRED;
274 } else if (nodes_empty(*nodes))
275 return ERR_PTR(-EINVAL);
276 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
277 if (!policy)
278 return ERR_PTR(-ENOMEM);
279 atomic_set(&policy->refcnt, 1);
280 policy->mode = mode;
281 policy->flags = flags;
282
283 return policy;
284}

/* Slow path of a mpol destructor. */
287void __mpol_put(struct mempolicy *p)
288{
289 if (!atomic_dec_and_test(&p->refcnt))
290 return;
291 kmem_cache_free(policy_cache, p);
292}
293
294static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
295{
296}
297
298static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
299{
300 nodemask_t tmp;
301
302 if (pol->flags & MPOL_F_STATIC_NODES)
303 nodes_and(tmp, pol->w.user_nodemask, *nodes);
304 else if (pol->flags & MPOL_F_RELATIVE_NODES)
305 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
306 else {
		nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
309 pol->w.cpuset_mems_allowed = *nodes;
310 }
311
312 if (nodes_empty(tmp))
313 tmp = *nodes;
314
315 pol->v.nodes = tmp;
316}
317
318static void mpol_rebind_preferred(struct mempolicy *pol,
319 const nodemask_t *nodes)
320{
321 nodemask_t tmp;
322
323 if (pol->flags & MPOL_F_STATIC_NODES) {
324 int node = first_node(pol->w.user_nodemask);
325
326 if (node_isset(node, *nodes)) {
327 pol->v.preferred_node = node;
328 pol->flags &= ~MPOL_F_LOCAL;
329 } else
330 pol->flags |= MPOL_F_LOCAL;
331 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
332 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
333 pol->v.preferred_node = first_node(tmp);
334 } else if (!(pol->flags & MPOL_F_LOCAL)) {
335 pol->v.preferred_node = node_remap(pol->v.preferred_node,
336 pol->w.cpuset_mems_allowed,
337 *nodes);
338 pol->w.cpuset_mems_allowed = *nodes;
339 }
340}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Per-vma policies are protected by mmap_sem.  Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
349static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
350{
351 if (!pol)
352 return;
353 if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
354 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
355 return;
356
357 mpol_ops[pol->mode].rebind(pol, newmask);
358}

/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
367void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
368{
369 mpol_rebind_policy(tsk->mempolicy, new);
370}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */
378void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
379{
380 struct vm_area_struct *vma;
381
382 down_write(&mm->mmap_sem);
383 for (vma = mm->mmap; vma; vma = vma->vm_next)
384 mpol_rebind_policy(vma->vm_policy, new);
385 up_write(&mm->mmap_sem);
386}
387
388static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
389 [MPOL_DEFAULT] = {
390 .rebind = mpol_rebind_default,
391 },
392 [MPOL_INTERLEAVE] = {
393 .create = mpol_new_interleave,
394 .rebind = mpol_rebind_nodemask,
395 },
396 [MPOL_PREFERRED] = {
397 .create = mpol_new_preferred,
398 .rebind = mpol_rebind_preferred,
399 },
400 [MPOL_BIND] = {
401 .create = mpol_new_bind,
402 .rebind = mpol_rebind_nodemask,
403 },
404};
405
406static int migrate_page_add(struct page *page, struct list_head *pagelist,
407 unsigned long flags);
408
409struct queue_pages {
410 struct list_head *pagelist;
411 unsigned long flags;
412 nodemask_t *nmask;
413 unsigned long start;
414 unsigned long end;
415 struct vm_area_struct *first;
416};

/*
 * Check if the page's nid is in qp->nmask.
 *
 * If MPOL_MF_INVERT is set in qp->flags, check if the page's nid is
 * in the invert of qp->nmask.
 */
424static inline bool queue_pages_required(struct page *page,
425 struct queue_pages *qp)
426{
427 int nid = page_to_nid(page);
428 unsigned long flags = qp->flags;
429
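	/* Match against the nodemask, or against its inverse for MPOL_MF_INVERT */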
430 return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
431}

/*
 * queue_pages_pmd() has four possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 2 - THP was split.
 * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
 *        existing page was already on a node that does not follow the
 *        policy.
 */
443static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
444 unsigned long end, struct mm_walk *walk)
445{
446 int ret = 0;
447 struct page *page;
448 struct queue_pages *qp = walk->private;
449 unsigned long flags;
450
451 if (unlikely(is_pmd_migration_entry(*pmd))) {
452 ret = -EIO;
453 goto unlock;
454 }
455 page = pmd_page(*pmd);
456 if (is_huge_zero_page(page)) {
457 spin_unlock(ptl);
458 __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
459 ret = 2;
460 goto out;
461 }
462 if (!queue_pages_required(page, qp))
463 goto unlock;
464
465 flags = qp->flags;
466
467 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
468 if (!vma_migratable(walk->vma) ||
469 migrate_page_add(page, qp->pagelist, flags)) {
470 ret = 1;
471 goto unlock;
472 }
473 } else
474 ret = -EIO;
475unlock:
476 spin_unlock(ptl);
477out:
478 return ret;
479}

/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 *
 * queue_pages_pte_range() has three possible return values:
 * 0 - pages are placed on the right node or queued successfully.
 * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
 *        on a node that does not follow the policy.
 */
492static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
493 unsigned long end, struct mm_walk *walk)
494{
495 struct vm_area_struct *vma = walk->vma;
496 struct page *page;
497 struct queue_pages *qp = walk->private;
498 unsigned long flags = qp->flags;
499 int ret;
500 bool has_unmovable = false;
501 pte_t *pte;
502 spinlock_t *ptl;
503
504 ptl = pmd_trans_huge_lock(pmd, vma);
505 if (ptl) {
506 ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
507 if (ret != 2)
508 return ret;
509 }
510
511
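	/* THP was split, fall through to the pte walk */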
512 if (pmd_trans_unstable(pmd))
513 return 0;
514
515 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
516 for (; addr != end; pte++, addr += PAGE_SIZE) {
517 if (!pte_present(*pte))
518 continue;
519 page = vm_normal_page(vma, addr, *pte);
520 if (!page)
521 continue;
522
523
524
525
526 if (PageReserved(page))
527 continue;
528 if (!queue_pages_required(page, qp))
529 continue;
530 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
531
532 if (!vma_migratable(vma)) {
533 has_unmovable = true;
534 break;
535 }

			/*
			 * Do not abort immediately since there may be
			 * temporary off LRU pages in the range.  MPOL_MF_MOVE*
			 * need migrate other LRU pages.
			 */
542 if (migrate_page_add(page, qp->pagelist, flags))
543 has_unmovable = true;
544 } else
545 break;
546 }
547 pte_unmap_unlock(pte - 1, ptl);
548 cond_resched();
549
550 if (has_unmovable)
551 return 1;
552
553 return addr != end ? -EIO : 0;
554}
555
556static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
557 unsigned long addr, unsigned long end,
558 struct mm_walk *walk)
559{
560#ifdef CONFIG_HUGETLB_PAGE
561 struct queue_pages *qp = walk->private;
562 unsigned long flags = qp->flags;
563 struct page *page;
564 spinlock_t *ptl;
565 pte_t entry;
566
567 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
568 entry = huge_ptep_get(pte);
569 if (!pte_present(entry))
570 goto unlock;
571 page = pte_page(entry);
572 if (!queue_pages_required(page, qp))
573 goto unlock;
574
575 if (flags & (MPOL_MF_MOVE_ALL) ||
576 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
577 isolate_huge_page(page, qp->pagelist);
578unlock:
579 spin_unlock(ptl);
580#else
581 BUG();
582#endif
583 return 0;
584}
585
586#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault.  Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE.  If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
596unsigned long change_prot_numa(struct vm_area_struct *vma,
597 unsigned long addr, unsigned long end)
598{
599 int nr_updated;
600
601 nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
602 if (nr_updated)
603 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
604
605 return nr_updated;
606}
607#else
608static unsigned long change_prot_numa(struct vm_area_struct *vma,
609 unsigned long addr, unsigned long end)
610{
611 return 0;
612}
613#endif
614
615static int queue_pages_test_walk(unsigned long start, unsigned long end,
616 struct mm_walk *walk)
617{
618 struct vm_area_struct *vma = walk->vma;
619 struct queue_pages *qp = walk->private;
620 unsigned long endvma = vma->vm_end;
621 unsigned long flags = qp->flags;
622
623
624 VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
625
626 if (!qp->first) {
627 qp->first = vma;
628 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
629 (qp->start < vma->vm_start))
630
631 return -EFAULT;
632 }
633 if (!(flags & MPOL_MF_DISCONTIG_OK) &&
634 ((vma->vm_end < qp->end) &&
635 (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
636
637 return -EFAULT;
638
639
640
641
642
643 if (!vma_migratable(vma) &&
644 !(flags & MPOL_MF_STRICT))
645 return 1;
646
647 if (endvma > end)
648 endvma = end;
649
650 if (flags & MPOL_MF_LAZY) {
651
652 if (!is_vm_hugetlb_page(vma) &&
653 (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
654 !(vma->vm_flags & VM_MIXEDMAP))
655 change_prot_numa(vma, start, endvma);
656 return 1;
657 }
658
659
660 if (flags & MPOL_MF_VALID)
661 return 0;
662 return 1;
663}
664
665static const struct mm_walk_ops queue_pages_walk_ops = {
666 .hugetlb_entry = queue_pages_hugetlb,
667 .pmd_entry = queue_pages_pte_range,
668 .test_walk = queue_pages_test_walk,
669};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags,) it's isolated and queued to the pagelist which is
 * located in a separated list.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
686static int
687queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
688 nodemask_t *nodes, unsigned long flags,
689 struct list_head *pagelist)
690{
691 int err;
692 struct queue_pages qp = {
693 .pagelist = pagelist,
694 .flags = flags,
695 .nmask = nodes,
696 .start = start,
697 .end = end,
698 .first = NULL,
699 };
700
701 err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
702
703 if (!qp.first)
704
705 err = -EFAULT;
706
707 return err;
708}

/*
 * Apply policy to a single VMA.
 * This must be called with the mmap_sem held for writing.
 */
714static int vma_replace_policy(struct vm_area_struct *vma,
715 struct mempolicy *pol)
716{
717 int err;
718 struct mempolicy *old;
719 struct mempolicy *new;
720
721 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
722 vma->vm_start, vma->vm_end, vma->vm_pgoff,
723 vma->vm_ops, vma->vm_file,
724 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
725
726 new = mpol_dup(pol);
727 if (IS_ERR(new))
728 return PTR_ERR(new);
729
730 if (vma->vm_ops && vma->vm_ops->set_policy) {
731 err = vma->vm_ops->set_policy(vma, new);
732 if (err)
733 goto err_out;
734 }
735
736 old = vma->vm_policy;
737 vma->vm_policy = new;
738 mpol_put(old);
739
740 return 0;
741 err_out:
742 mpol_put(new);
743 return err;
744}

/* Step 2: apply policy to a range and do splits. */
747static int mbind_range(struct mm_struct *mm, unsigned long start,
748 unsigned long end, struct mempolicy *new_pol)
749{
750 struct vm_area_struct *next;
751 struct vm_area_struct *prev;
752 struct vm_area_struct *vma;
753 int err = 0;
754 pgoff_t pgoff;
755 unsigned long vmstart;
756 unsigned long vmend;
757
758 vma = find_vma(mm, start);
759 VM_BUG_ON(!vma);
760
761 prev = vma->vm_prev;
762 if (start > vma->vm_start)
763 prev = vma;
764
765 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
766 next = vma->vm_next;
767 vmstart = max(start, vma->vm_start);
768 vmend = min(end, vma->vm_end);
769
770 if (mpol_equal(vma_policy(vma), new_pol))
771 continue;
772
773 pgoff = vma->vm_pgoff +
774 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
775 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
776 vma->anon_vma, vma->vm_file, pgoff,
777 new_pol, vma->vm_userfaultfd_ctx);
778 if (prev) {
779 vma = prev;
780 next = vma->vm_next;
781 if (mpol_equal(vma_policy(vma), new_pol))
782 continue;
783
784 goto replace;
785 }
786 if (vma->vm_start != vmstart) {
787 err = split_vma(vma->vm_mm, vma, vmstart, 1);
788 if (err)
789 goto out;
790 }
791 if (vma->vm_end != vmend) {
792 err = split_vma(vma->vm_mm, vma, vmend, 0);
793 if (err)
794 goto out;
795 }
796 replace:
797 err = vma_replace_policy(vma, new_pol);
798 if (err)
799 goto out;
800 }
801
802 out:
803 return err;
804}

/* Set the process memory policy */
807static long do_set_mempolicy(unsigned short mode, unsigned short flags,
808 nodemask_t *nodes)
809{
810 struct mempolicy *new, *old;
811 NODEMASK_SCRATCH(scratch);
812 int ret;
813
814 if (!scratch)
815 return -ENOMEM;
816
817 new = mpol_new(mode, flags, nodes);
818 if (IS_ERR(new)) {
819 ret = PTR_ERR(new);
820 goto out;
821 }
822
823 task_lock(current);
824 ret = mpol_set_nodemask(new, nodes, scratch);
825 if (ret) {
826 task_unlock(current);
827 mpol_put(new);
828 goto out;
829 }
830 old = current->mempolicy;
831 current->mempolicy = new;
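	/* Reset interleave state so allocation restarts from the first node of the new mask */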
832 if (new && new->mode == MPOL_INTERLEAVE)
833 current->il_prev = MAX_NUMNODES-1;
834 task_unlock(current);
835 mpol_put(old);
836 ret = 0;
837out:
838 NODEMASK_SCRATCH_FREE(scratch);
839 return ret;
840}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
847static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
848{
849 nodes_clear(*nodes);
850 if (p == &default_policy)
851 return;
852
853 switch (p->mode) {
854 case MPOL_BIND:
855
856 case MPOL_INTERLEAVE:
857 *nodes = p->v.nodes;
858 break;
859 case MPOL_PREFERRED:
860 if (!(p->flags & MPOL_F_LOCAL))
861 node_set(p->v.preferred_node, *nodes);
862
863 break;
864 default:
865 BUG();
866 }
867}
868
869static int lookup_node(struct mm_struct *mm, unsigned long addr)
870{
871 struct page *p;
872 int err;
873
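	/* get_user_pages_locked() may drop mmap_sem; only up_read() if still held */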
874 int locked = 1;
875 err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
876 if (err >= 0) {
877 err = page_to_nid(p);
878 put_page(p);
879 }
880 if (locked)
881 up_read(&mm->mmap_sem);
882 return err;
883}

/* Retrieve NUMA policy */
886static long do_get_mempolicy(int *policy, nodemask_t *nmask,
887 unsigned long addr, unsigned long flags)
888{
889 int err;
890 struct mm_struct *mm = current->mm;
891 struct vm_area_struct *vma = NULL;
892 struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
893
894 if (flags &
895 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
896 return -EINVAL;
897
898 if (flags & MPOL_F_MEMS_ALLOWED) {
899 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
900 return -EINVAL;
901 *policy = 0;
902 task_lock(current);
903 *nmask = cpuset_current_mems_allowed;
904 task_unlock(current);
905 return 0;
906 }
907
908 if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
914 down_read(&mm->mmap_sem);
915 vma = find_vma_intersection(mm, addr, addr+1);
916 if (!vma) {
917 up_read(&mm->mmap_sem);
918 return -EFAULT;
919 }
920 if (vma->vm_ops && vma->vm_ops->get_policy)
921 pol = vma->vm_ops->get_policy(vma, addr);
922 else
923 pol = vma->vm_policy;
924 } else if (addr)
925 return -EINVAL;
926
927 if (!pol)
928 pol = &default_policy;
929
930 if (flags & MPOL_F_NODE) {
931 if (flags & MPOL_F_ADDR) {
			/*
			 * Take a refcount on the mpol, lookup_node()
			 * will drop the mmap_sem, so after calling
			 * lookup_node() only "pol" remains valid, "vma"
			 * is stale.
			 */
938 pol_refcount = pol;
939 vma = NULL;
940 mpol_get(pol);
941 err = lookup_node(mm, addr);
942 if (err < 0)
943 goto out;
944 *policy = err;
945 } else if (pol == current->mempolicy &&
946 pol->mode == MPOL_INTERLEAVE) {
947 *policy = next_node_in(current->il_prev, pol->v.nodes);
948 } else {
949 err = -EINVAL;
950 goto out;
951 }
952 } else {
953 *policy = pol == &default_policy ? MPOL_DEFAULT :
954 pol->mode;
955
956
957
958
959 *policy |= (pol->flags & MPOL_MODE_FLAGS);
960 }
961
962 err = 0;
963 if (nmask) {
964 if (mpol_store_user_nodemask(pol)) {
965 *nmask = pol->w.user_nodemask;
966 } else {
967 task_lock(current);
968 get_policy_nodemask(pol, nmask);
969 task_unlock(current);
970 }
971 }
972
973 out:
974 mpol_cond_put(pol);
975 if (vma)
976 up_read(&mm->mmap_sem);
977 if (pol_refcount)
978 mpol_put(pol_refcount);
979 return err;
980}
981
982#ifdef CONFIG_MIGRATION

/*
 * Try to isolate the given page (via its compound head) and add it to
 * @pagelist for migration.  Shared pages are only taken with
 * MPOL_MF_MOVE_ALL; an isolation failure is reported as -EIO only when
 * MPOL_MF_STRICT was requested.
 */
986static int migrate_page_add(struct page *page, struct list_head *pagelist,
987 unsigned long flags)
988{
989 struct page *head = compound_head(page);
990
991
992
993 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) {
994 if (!isolate_lru_page(head)) {
995 list_add_tail(&head->lru, pagelist);
996 mod_node_page_state(page_pgdat(head),
997 NR_ISOLATED_ANON + page_is_file_cache(head),
998 hpage_nr_pages(head));
999 } else if (flags & MPOL_MF_STRICT) {
1000
1001
1002
1003
1004
1005
1006
1007 return -EIO;
1008 }
1009 }
1010
1011 return 0;
1012}

/* page allocation callback for NUMA node migration */
1015struct page *alloc_new_node_page(struct page *page, unsigned long node)
1016{
1017 if (PageHuge(page))
1018 return alloc_huge_page_node(page_hstate(compound_head(page)),
1019 node);
1020 else if (PageTransHuge(page)) {
1021 struct page *thp;
1022
1023 thp = alloc_pages_node(node,
1024 (GFP_TRANSHUGE | __GFP_THISNODE),
1025 HPAGE_PMD_ORDER);
1026 if (!thp)
1027 return NULL;
1028 prep_transhuge_page(thp);
1029 return thp;
1030 } else
1031 return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
1032 __GFP_THISNODE, 0);
1033}
1034

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
1039static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1040 int flags)
1041{
1042 nodemask_t nmask;
1043 LIST_HEAD(pagelist);
1044 int err = 0;
1045
1046 nodes_clear(nmask);
1047 node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
1054 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1055 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1056 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1057
1058 if (!list_empty(&pagelist)) {
1059 err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
1060 MIGRATE_SYNC, MR_SYSCALL);
1061 if (err)
1062 putback_movable_pages(&pagelist);
1063 }
1064
1065 return err;
1066}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
1074int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1075 const nodemask_t *to, int flags)
1076{
1077 int busy = 0;
1078 int err;
1079 nodemask_t tmp;
1080
1081 err = migrate_prep();
1082 if (err)
1083 return err;
1084
1085 down_read(&mm->mmap_sem);
	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', there is nothing
	 * left to migrate.
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source from that same node.
	 */
1118 tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;
1123
1124 for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
			 */
1141 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1142 (node_isset(s, *to)))
1143 continue;
1144
1145 d = node_remap(s, *from, *to);
1146 if (s == d)
1147 continue;
1148
1149 source = s;
1150 dest = d;
1151
1152
1153 if (!node_isset(dest, tmp))
1154 break;
1155 }
1156 if (source == NUMA_NO_NODE)
1157 break;
1158
1159 node_clear(source, tmp);
1160 err = migrate_to_node(mm, source, dest, flags);
1161 if (err > 0)
1162 busy += err;
1163 if (err < 0)
1164 break;
1165 }
1166 up_read(&mm->mmap_sem);
1167 if (err < 0)
1168 return err;
1169 return busy;
1170
1171}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
1180static struct page *new_page(struct page *page, unsigned long start)
1181{
1182 struct vm_area_struct *vma;
1183 unsigned long uninitialized_var(address);
1184
1185 vma = find_vma(current->mm, start);
1186 while (vma) {
1187 address = page_address_in_vma(page, vma);
1188 if (address != -EFAULT)
1189 break;
1190 vma = vma->vm_next;
1191 }
1192
1193 if (PageHuge(page)) {
1194 return alloc_huge_page_vma(page_hstate(compound_head(page)),
1195 vma, address);
1196 } else if (PageTransHuge(page)) {
1197 struct page *thp;
1198
1199 thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
1200 HPAGE_PMD_ORDER);
1201 if (!thp)
1202 return NULL;
1203 prep_transhuge_page(thp);
1204 return thp;
1205 }
1206
1207
1208
1209 return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
1210 vma, address);
1211}
1212#else
1213
1214static int migrate_page_add(struct page *page, struct list_head *pagelist,
1215 unsigned long flags)
1216{
1217 return -EIO;
1218}
1219
1220int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1221 const nodemask_t *to, int flags)
1222{
1223 return -ENOSYS;
1224}
1225
1226static struct page *new_page(struct page *page, unsigned long start)
1227{
1228 return NULL;
1229}
1230#endif
1231
1232static long do_mbind(unsigned long start, unsigned long len,
1233 unsigned short mode, unsigned short mode_flags,
1234 nodemask_t *nmask, unsigned long flags)
1235{
1236 struct mm_struct *mm = current->mm;
1237 struct mempolicy *new;
1238 unsigned long end;
1239 int err;
1240 int ret;
1241 LIST_HEAD(pagelist);
1242
1243 if (flags & ~(unsigned long)MPOL_MF_VALID)
1244 return -EINVAL;
1245 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1246 return -EPERM;
1247
1248 if (start & ~PAGE_MASK)
1249 return -EINVAL;
1250
1251 if (mode == MPOL_DEFAULT)
1252 flags &= ~MPOL_MF_STRICT;
1253
1254 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1255 end = start + len;
1256
1257 if (end < start)
1258 return -EINVAL;
1259 if (end == start)
1260 return 0;
1261
1262 new = mpol_new(mode, mode_flags, nmask);
1263 if (IS_ERR(new))
1264 return PTR_ERR(new);
1265
1266 if (flags & MPOL_MF_LAZY)
1267 new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
1273 if (!new)
1274 flags |= MPOL_MF_DISCONTIG_OK;
1275
1276 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1277 start, start + len, mode, mode_flags,
1278 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1279
1280 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1281
1282 err = migrate_prep();
1283 if (err)
1284 goto mpol_out;
1285 }
1286 {
1287 NODEMASK_SCRATCH(scratch);
1288 if (scratch) {
1289 down_write(&mm->mmap_sem);
1290 task_lock(current);
1291 err = mpol_set_nodemask(new, nmask, scratch);
1292 task_unlock(current);
1293 if (err)
1294 up_write(&mm->mmap_sem);
1295 } else
1296 err = -ENOMEM;
1297 NODEMASK_SCRATCH_FREE(scratch);
1298 }
1299 if (err)
1300 goto mpol_out;
1301
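	/* MPOL_MF_INVERT: queue the pages that are *not* on an allowed node */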
1302 ret = queue_pages_range(mm, start, end, nmask,
1303 flags | MPOL_MF_INVERT, &pagelist);
1304
1305 if (ret < 0) {
1306 err = ret;
1307 goto up_out;
1308 }
1309
1310 err = mbind_range(mm, start, end, new);
1311
1312 if (!err) {
1313 int nr_failed = 0;
1314
1315 if (!list_empty(&pagelist)) {
1316 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1317 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1318 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1319 if (nr_failed)
1320 putback_movable_pages(&pagelist);
1321 }
1322
1323 if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
1324 err = -EIO;
1325 } else {
1326up_out:
1327 if (!list_empty(&pagelist))
1328 putback_movable_pages(&pagelist);
1329 }
1330
1331 up_write(&mm->mmap_sem);
1332mpol_out:
1333 mpol_put(new);
1334 return err;
1335}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
1342static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1343 unsigned long maxnode)
1344{
1345 unsigned long k;
1346 unsigned long t;
1347 unsigned long nlongs;
1348 unsigned long endmask;
1349
1350 --maxnode;
1351 nodes_clear(*nodes);
1352 if (maxnode == 0 || !nmask)
1353 return 0;
1354 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1355 return -EINVAL;
1356
1357 nlongs = BITS_TO_LONGS(maxnode);
1358 if ((maxnode % BITS_PER_LONG) == 0)
1359 endmask = ~0UL;
1360 else
1361 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero.
	 *
	 * If maxnode has more longs than MAX_NUMNODES, check the bits in
	 * that area first, and then check the remaining bits that are
	 * equal to or bigger than MAX_NUMNODES.  Otherwise, just check
	 * bits [MAX_NUMNODES, maxnode).
	 */
1372 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1373 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1374 if (get_user(t, nmask + k))
1375 return -EFAULT;
1376 if (k == nlongs - 1) {
1377 if (t & endmask)
1378 return -EINVAL;
1379 } else if (t)
1380 return -EINVAL;
1381 }
1382 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1383 endmask = ~0UL;
1384 }
1385
1386 if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
1387 unsigned long valid_mask = endmask;
1388
1389 valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
1390 if (get_user(t, nmask + nlongs - 1))
1391 return -EFAULT;
1392 if (t & valid_mask)
1393 return -EINVAL;
1394 }
1395
1396 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1397 return -EFAULT;
1398 nodes_addr(*nodes)[nlongs-1] &= endmask;
1399 return 0;
1400}

/* Copy a kernel node mask to user space */
1403static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1404 nodemask_t *nodes)
1405{
1406 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1407 unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
1408
1409 if (copy > nbytes) {
1410 if (copy > PAGE_SIZE)
1411 return -EINVAL;
1412 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1413 return -EFAULT;
1414 copy = nbytes;
1415 }
1416 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1417}
1418
1419static long kernel_mbind(unsigned long start, unsigned long len,
1420 unsigned long mode, const unsigned long __user *nmask,
1421 unsigned long maxnode, unsigned int flags)
1422{
1423 nodemask_t nodes;
1424 int err;
1425 unsigned short mode_flags;
1426
1427 start = untagged_addr(start);
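	/* The mode argument carries optional MPOL_F_* mode flags in its high bits */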
1428 mode_flags = mode & MPOL_MODE_FLAGS;
1429 mode &= ~MPOL_MODE_FLAGS;
1430 if (mode >= MPOL_MAX)
1431 return -EINVAL;
1432 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1433 (mode_flags & MPOL_F_RELATIVE_NODES))
1434 return -EINVAL;
1435 err = get_nodes(&nodes, nmask, maxnode);
1436 if (err)
1437 return err;
1438 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1439}
1440
1441SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1442 unsigned long, mode, const unsigned long __user *, nmask,
1443 unsigned long, maxnode, unsigned int, flags)
1444{
1445 return kernel_mbind(start, len, mode, nmask, maxnode, flags);
1446}

/* Set the process memory policy */
1449static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
1450 unsigned long maxnode)
1451{
1452 int err;
1453 nodemask_t nodes;
1454 unsigned short flags;
1455
1456 flags = mode & MPOL_MODE_FLAGS;
1457 mode &= ~MPOL_MODE_FLAGS;
1458 if ((unsigned int)mode >= MPOL_MAX)
1459 return -EINVAL;
1460 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1461 return -EINVAL;
1462 err = get_nodes(&nodes, nmask, maxnode);
1463 if (err)
1464 return err;
1465 return do_set_mempolicy(mode, flags, &nodes);
1466}
1467
1468SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1469 unsigned long, maxnode)
1470{
1471 return kernel_set_mempolicy(mode, nmask, maxnode);
1472}
1473
1474static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1475 const unsigned long __user *old_nodes,
1476 const unsigned long __user *new_nodes)
1477{
1478 struct mm_struct *mm = NULL;
1479 struct task_struct *task;
1480 nodemask_t task_nodes;
1481 int err;
1482 nodemask_t *old;
1483 nodemask_t *new;
1484 NODEMASK_SCRATCH(scratch);
1485
1486 if (!scratch)
1487 return -ENOMEM;
1488
1489 old = &scratch->mask1;
1490 new = &scratch->mask2;
1491
1492 err = get_nodes(old, old_nodes, maxnode);
1493 if (err)
1494 goto out;
1495
1496 err = get_nodes(new, new_nodes, maxnode);
1497 if (err)
1498 goto out;
1499
1500
1501 rcu_read_lock();
1502 task = pid ? find_task_by_vpid(pid) : current;
1503 if (!task) {
1504 rcu_read_unlock();
1505 err = -ESRCH;
1506 goto out;
1507 }
1508 get_task_struct(task);
1509
1510 err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process.  Use the regular "ptrace_may_access()" checks.
	 */
1516 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
1517 rcu_read_unlock();
1518 err = -EPERM;
1519 goto out_put;
1520 }
1521 rcu_read_unlock();
1522
1523 task_nodes = cpuset_mems_allowed(task);
1524
1525 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1526 err = -EPERM;
1527 goto out_put;
1528 }
1529
1530 task_nodes = cpuset_mems_allowed(current);
1531 nodes_and(*new, *new, task_nodes);
1532 if (nodes_empty(*new))
1533 goto out_put;
1534
1535 err = security_task_movememory(task);
1536 if (err)
1537 goto out_put;
1538
1539 mm = get_task_mm(task);
1540 put_task_struct(task);
1541
1542 if (!mm) {
1543 err = -EINVAL;
1544 goto out;
1545 }
1546
1547 err = do_migrate_pages(mm, old, new,
1548 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1549
1550 mmput(mm);
1551out:
1552 NODEMASK_SCRATCH_FREE(scratch);
1553
1554 return err;
1555
1556out_put:
1557 put_task_struct(task);
1558 goto out;
1559
1560}
1561
1562SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1563 const unsigned long __user *, old_nodes,
1564 const unsigned long __user *, new_nodes)
1565{
1566 return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
1567}
1568

/* Retrieve NUMA policy */
1571static int kernel_get_mempolicy(int __user *policy,
1572 unsigned long __user *nmask,
1573 unsigned long maxnode,
1574 unsigned long addr,
1575 unsigned long flags)
1576{
1577 int err;
1578 int uninitialized_var(pval);
1579 nodemask_t nodes;
1580
1581 addr = untagged_addr(addr);
1582
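	/* The user-supplied bitmap must be able to hold a bit for every possible node id */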
1583 if (nmask != NULL && maxnode < nr_node_ids)
1584 return -EINVAL;
1585
1586 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1587
1588 if (err)
1589 return err;
1590
1591 if (policy && put_user(pval, policy))
1592 return -EFAULT;
1593
1594 if (nmask)
1595 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1596
1597 return err;
1598}
1599
1600SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1601 unsigned long __user *, nmask, unsigned long, maxnode,
1602 unsigned long, addr, unsigned long, flags)
1603{
1604 return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
1605}
1606
1607#ifdef CONFIG_COMPAT
1608
1609COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1610 compat_ulong_t __user *, nmask,
1611 compat_ulong_t, maxnode,
1612 compat_ulong_t, addr, compat_ulong_t, flags)
1613{
1614 long err;
1615 unsigned long __user *nm = NULL;
1616 unsigned long nr_bits, alloc_size;
1617 DECLARE_BITMAP(bm, MAX_NUMNODES);
1618
1619 nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
1620 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1621
1622 if (nmask)
1623 nm = compat_alloc_user_space(alloc_size);
1624
1625 err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1626
1627 if (!err && nmask) {
1628 unsigned long copy_size;
1629 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1630 err = copy_from_user(bm, nm, copy_size);
1631
1632 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1633 err |= compat_put_bitmap(nmask, bm, nr_bits);
1634 }
1635
1636 return err;
1637}
1638
1639COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1640 compat_ulong_t, maxnode)
1641{
1642 unsigned long __user *nm = NULL;
1643 unsigned long nr_bits, alloc_size;
1644 DECLARE_BITMAP(bm, MAX_NUMNODES);
1645
1646 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1647 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1648
1649 if (nmask) {
1650 if (compat_get_bitmap(bm, nmask, nr_bits))
1651 return -EFAULT;
1652 nm = compat_alloc_user_space(alloc_size);
1653 if (copy_to_user(nm, bm, alloc_size))
1654 return -EFAULT;
1655 }
1656
1657 return kernel_set_mempolicy(mode, nm, nr_bits+1);
1658}
1659
1660COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1661 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1662 compat_ulong_t, maxnode, compat_ulong_t, flags)
1663{
1664 unsigned long __user *nm = NULL;
1665 unsigned long nr_bits, alloc_size;
1666 nodemask_t bm;
1667
1668 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1669 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1670
1671 if (nmask) {
1672 if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
1673 return -EFAULT;
1674 nm = compat_alloc_user_space(alloc_size);
1675 if (copy_to_user(nm, nodes_addr(bm), alloc_size))
1676 return -EFAULT;
1677 }
1678
1679 return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
1680}
1681
1682COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
1683 compat_ulong_t, maxnode,
1684 const compat_ulong_t __user *, old_nodes,
1685 const compat_ulong_t __user *, new_nodes)
1686{
1687 unsigned long __user *old = NULL;
1688 unsigned long __user *new = NULL;
1689 nodemask_t tmp_mask;
1690 unsigned long nr_bits;
1691 unsigned long size;
1692
1693 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
1694 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1695 if (old_nodes) {
1696 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1697 return -EFAULT;
1698 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1699 if (new_nodes)
1700 new = old + size / sizeof(unsigned long);
1701 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1702 return -EFAULT;
1703 }
1704 if (new_nodes) {
1705 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1706 return -EFAULT;
1707 if (new == NULL)
1708 new = compat_alloc_user_space(size);
1709 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1710 return -EFAULT;
1711 }
1712 return kernel_migrate_pages(pid, nr_bits + 1, old, new);
1713}
1714
1715#endif
1716
1717struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1718 unsigned long addr)
1719{
1720 struct mempolicy *pol = NULL;
1721
1722 if (vma) {
1723 if (vma->vm_ops && vma->vm_ops->get_policy) {
1724 pol = vma->vm_ops->get_policy(vma, addr);
1725 } else if (vma->vm_policy) {
1726 pol = vma->vm_policy;
1727
1728
1729
1730
1731
1732
1733
1734 if (mpol_needs_cond_ref(pol))
1735 mpol_get(pol);
1736 }
1737 }
1738
1739 return pol;
1740}

/*
 * get_vma_policy(@vma, @addr)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or system default policy, as necessary.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
1754static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1755 unsigned long addr)
1756{
1757 struct mempolicy *pol = __get_vma_policy(vma, addr);
1758
1759 if (!pol)
1760 pol = get_task_policy(current);
1761
1762 return pol;
1763}
1764
1765bool vma_policy_mof(struct vm_area_struct *vma)
1766{
1767 struct mempolicy *pol;
1768
1769 if (vma->vm_ops && vma->vm_ops->get_policy) {
1770 bool ret = false;
1771
1772 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1773 if (pol && (pol->flags & MPOL_F_MOF))
1774 ret = true;
1775 mpol_cond_put(pol);
1776
1777 return ret;
1778 }
1779
1780 pol = vma->vm_policy;
1781 if (!pol)
1782 pol = get_task_policy(current);
1783
1784 return pol->flags & MPOL_F_MOF;
1785}
1786
1787static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1788{
1789 enum zone_type dynamic_policy_zone = policy_zone;
1790
1791 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->v.nodes has movable memory only,
	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
	 *
	 * policy->v.nodes is intersected with node_states[N_MEMORY],
	 * so if the following test fails, it implies
	 * policy->v.nodes has movable memory only.
	 */
1801 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1802 dynamic_policy_zone = ZONE_MOVABLE;
1803
1804 return zone >= dynamic_policy_zone;
1805}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
1811static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1812{
1813
1814 if (unlikely(policy->mode == MPOL_BIND) &&
1815 apply_policy_zone(policy, gfp_zone(gfp)) &&
1816 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1817 return &policy->v.nodes;
1818
1819 return NULL;
1820}

/* Return the node id preferred by the given mempolicy, or the given id */
1823static int policy_node(gfp_t gfp, struct mempolicy *policy,
1824 int nd)
1825{
1826 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1827 nd = policy->v.preferred_node;
1828 else {
1829
1830
1831
1832
1833
1834 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1835 }
1836
1837 return nd;
1838}

/* Do dynamic interleaving for a process */
1841static unsigned interleave_nodes(struct mempolicy *policy)
1842{
1843 unsigned next;
1844 struct task_struct *me = current;
1845
1846 next = next_node_in(me->il_prev, policy->v.nodes);
1847 if (next < MAX_NUMNODES)
1848 me->il_prev = next;
1849 return next;
1850}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
1856unsigned int mempolicy_slab_node(void)
1857{
1858 struct mempolicy *policy;
1859 int node = numa_mem_id();
1860
1861 if (in_interrupt())
1862 return node;
1863
1864 policy = current->mempolicy;
1865 if (!policy || policy->flags & MPOL_F_LOCAL)
1866 return node;
1867
1868 switch (policy->mode) {
1869 case MPOL_PREFERRED:
1870
1871
1872
1873 return policy->v.preferred_node;
1874
1875 case MPOL_INTERLEAVE:
1876 return interleave_nodes(policy);
1877
1878 case MPOL_BIND: {
1879 struct zoneref *z;
1880
1881
1882
1883
1884
1885 struct zonelist *zonelist;
1886 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1887 zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
1888 z = first_zones_zonelist(zonelist, highest_zoneidx,
1889 &policy->v.nodes);
1890 return z->zone ? zone_to_nid(z->zone) : node;
1891 }
1892
1893 default:
1894 BUG();
1895 }
1896}
1897

/*
 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
 * number of present nodes.
 */
1903static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
1904{
1905 unsigned nnodes = nodes_weight(pol->v.nodes);
1906 unsigned target;
1907 int i;
1908 int nid;
1909
1910 if (!nnodes)
1911 return numa_node_id();
1912 target = (unsigned int)n % nnodes;
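	/* Walk forward 'target' steps through the allowed nodes to find the n'th one */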
1913 nid = first_node(pol->v.nodes);
1914 for (i = 0; i < target; i++)
1915 nid = next_node(nid, pol->v.nodes);
1916 return nid;
1917}
1918
1919
1920static inline unsigned interleave_nid(struct mempolicy *pol,
1921 struct vm_area_struct *vma, unsigned long addr, int shift)
1922{
1923 if (vma) {
1924 unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
1933 BUG_ON(shift < PAGE_SHIFT);
1934 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1935 off += (addr - vma->vm_start) >> shift;
1936 return offset_il_node(pol, off);
1937 } else
1938 return interleave_nodes(pol);
1939}
1940
1941#ifdef CONFIG_HUGETLBFS

/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'bind', returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 *
 * Must be protected by read_mems_allowed_begin()
 */
1957int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
1958 struct mempolicy **mpol, nodemask_t **nodemask)
1959{
1960 int nid;
1961
1962 *mpol = get_vma_policy(vma, addr);
1963 *nodemask = NULL;
1964
1965 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1966 nid = interleave_nid(*mpol, vma, addr,
1967 huge_page_shift(hstate_vma(vma)));
1968 } else {
1969 nid = policy_node(gfp_flags, *mpol, numa_node_id());
1970 if ((*mpol)->mode == MPOL_BIND)
1971 *nodemask = &(*mpol)->v.nodes;
1972 }
1973 return nid;
1974}
1975

/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
1992bool init_nodemask_of_mempolicy(nodemask_t *mask)
1993{
1994 struct mempolicy *mempolicy;
1995 int nid;
1996
1997 if (!(mask && current->mempolicy))
1998 return false;
1999
2000 task_lock(current);
2001 mempolicy = current->mempolicy;
2002 switch (mempolicy->mode) {
2003 case MPOL_PREFERRED:
2004 if (mempolicy->flags & MPOL_F_LOCAL)
2005 nid = numa_node_id();
2006 else
2007 nid = mempolicy->v.preferred_node;
2008 init_nodemask_of_node(mask, nid);
2009 break;
2010
2011 case MPOL_BIND:
2012
2013 case MPOL_INTERLEAVE:
2014 *mask = mempolicy->v.nodes;
2015 break;
2016
2017 default:
2018 BUG();
2019 }
2020 task_unlock(current);
2021
2022 return true;
2023}
2024#endif

/*
 * mempolicy_nodemask_intersects
 *
 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
 * policy.  Otherwise, check for intersection between mask and the policy
 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
 * policy, always return true since it may allocate elsewhere on fallback.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
2036bool mempolicy_nodemask_intersects(struct task_struct *tsk,
2037 const nodemask_t *mask)
2038{
2039 struct mempolicy *mempolicy;
2040 bool ret = true;
2041
2042 if (!mask)
2043 return ret;
2044 task_lock(tsk);
2045 mempolicy = tsk->mempolicy;
2046 if (!mempolicy)
2047 goto out;
2048
2049 switch (mempolicy->mode) {
2050 case MPOL_PREFERRED:
2051
2052
2053
2054
2055
2056
2057 break;
2058 case MPOL_BIND:
2059 case MPOL_INTERLEAVE:
2060 ret = nodes_intersects(mempolicy->v.nodes, *mask);
2061 break;
2062 default:
2063 BUG();
2064 }
2065out:
2066 task_unlock(tsk);
2067 return ret;
2068}
2069

/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
2072static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
2073 unsigned nid)
2074{
2075 struct page *page;
2076
2077 page = __alloc_pages(gfp, order, nid);
2078
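	/* Account NUMA_INTERLEAVE_HIT only when NUMA stats are enabled and the
	 * page really came from the requested node. */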
2079 if (!static_branch_likely(&vm_numa_stat_key))
2080 return page;
2081 if (page && page_to_nid(page) == nid) {
2082 preempt_disable();
2083 __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
2084 preempt_enable();
2085 }
2086 return page;
2087}
2088

/**
 * alloc_pages_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *      %GFP_USER    user allocation.
 *      %GFP_KERNEL  kernel allocations,
 *      %GFP_HIGHMEM highmem/user allocations,
 *      %GFP_FS      allocation should not call back into a file system.
 *      %GFP_ATOMIC  don't sleep.
 *
 * @order: Order of the GFP allocation.
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual Address of the allocation.  Must be inside the VMA.
 * @node: Which node to prefer for allocation (modulo policy).
 * @hugepage: for hugepages try only the preferred node if possible
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
 * mm_struct of the VMA to prevent it from going away.  Should be used for
 * all allocations for pages that will be mapped into user space.  Returns
 * NULL when no page can be allocated.
 */
2112struct page *
2113alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2114 unsigned long addr, int node, bool hugepage)
2115{
2116 struct mempolicy *pol;
2117 struct page *page;
2118 int preferred_nid;
2119 nodemask_t *nmask;
2120
2121 pol = get_vma_policy(vma, addr);
2122
2123 if (pol->mode == MPOL_INTERLEAVE) {
2124 unsigned nid;
2125
2126 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2127 mpol_cond_put(pol);
2128 page = alloc_page_interleave(gfp, order, nid);
2129 goto out;
2130 }
2131
2132 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
2133 int hpage_node = node;

		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
2145 if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
2146 hpage_node = pol->v.preferred_node;
2147
2148 nmask = policy_nodemask(gfp, pol);
2149 if (!nmask || node_isset(hpage_node, *nmask)) {
2150 mpol_cond_put(pol);

			/*
			 * First, try to allocate THP only on local node, but
			 * don't reclaim unnecessarily, just compact.
			 */
2155 page = __alloc_pages_node(hpage_node,
2156 gfp | __GFP_THISNODE | __GFP_NORETRY, order);

			/*
			 * If hugepage allocations are configured to always
			 * synchronous compact or the vma has been madvised
			 * to prefer hugepage backing, retry allowing remote
			 * memory with both reclaim and compact as well.
			 */
2164 if (!page && (gfp & __GFP_DIRECT_RECLAIM))
2165 page = __alloc_pages_node(hpage_node,
2166 gfp, order);
2167
2168 goto out;
2169 }
2170 }
2171
2172 nmask = policy_nodemask(gfp, pol);
2173 preferred_nid = policy_node(gfp, pol, node);
2174 page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
2175 mpol_cond_put(pol);
2176out:
2177 return page;
2178}
2179EXPORT_SYMBOL(alloc_pages_vma);
2180

/**
 * alloc_pages_current - Allocate pages.
 *
 * @gfp:
 *	%GFP_USER   user allocation,
 *	%GFP_KERNEL kernel allocation,
 *	%GFP_HIGHMEM highmem allocation,
 *	%GFP_FS     don't call back into a file system.
 *	%GFP_ATOMIC don't sleep.
 * @order: Power of two of allocation size in pages.  0 is a single page.
 *
 * Allocate a page from the kernel page pool, applying the current process
 * NUMA policy when not in interrupt context.
 * Returns NULL when no page can be allocated.
 */
2196struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2197{
2198 struct mempolicy *pol = &default_policy;
2199 struct page *page;
2200
2201 if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2202 pol = get_task_policy(current);

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
2208 if (pol->mode == MPOL_INTERLEAVE)
2209 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2210 else
2211 page = __alloc_pages_nodemask(gfp, order,
2212 policy_node(gfp, pol, numa_node_id()),
2213 policy_nodemask(gfp, pol));
2214
2215 return page;
2216}
2217EXPORT_SYMBOL(alloc_pages_current);
2218
2219int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2220{
2221 struct mempolicy *pol = mpol_dup(vma_policy(src));
2222
2223 if (IS_ERR(pol))
2224 return PTR_ERR(pol);
2225 dst->vm_policy = pol;
2226 return 0;
2227}
2228

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after its cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by the other task (the task that
 * changes cpuset's mems), so we needn't do rebind work for current task.
 */

/* Slow path of a mempolicy duplicate */
2241struct mempolicy *__mpol_dup(struct mempolicy *old)
2242{
2243 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2244
2245 if (!new)
2246 return ERR_PTR(-ENOMEM);
2247
2248
2249 if (old == current->mempolicy) {
2250 task_lock(current);
2251 *new = *old;
2252 task_unlock(current);
2253 } else
2254 *new = *old;
2255
2256 if (current_cpuset_is_being_rebound()) {
2257 nodemask_t mems = cpuset_mems_allowed(current);
2258 mpol_rebind_policy(new, &mems);
2259 }
2260 atomic_set(&new->refcnt, 1);
2261 return new;
2262}

/* Slow path of a mempolicy comparison */
2265bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2266{
2267 if (!a || !b)
2268 return false;
2269 if (a->mode != b->mode)
2270 return false;
2271 if (a->flags != b->flags)
2272 return false;
2273 if (mpol_store_user_nodemask(a))
2274 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2275 return false;
2276
2277 switch (a->mode) {
2278 case MPOL_BIND:
2279
2280 case MPOL_INTERLEAVE:
2281 return !!nodes_equal(a->v.nodes, b->v.nodes);
2282 case MPOL_PREFERRED:
2283
2284 if (a->flags & MPOL_F_LOCAL)
2285 return true;
2286 return a->v.preferred_node == b->v.preferred_node;
2287 default:
2288 BUG();
2289 return false;
2290 }
2291}
2292

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * lookup first element intersecting start-end.  Caller holds sp->lock for
 * either read or write access.
 */
2306static struct sp_node *
2307sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2308{
2309 struct rb_node *n = sp->root.rb_node;
2310
2311 while (n) {
2312 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2313
2314 if (start >= p->end)
2315 n = n->rb_right;
2316 else if (end <= p->start)
2317 n = n->rb_left;
2318 else
2319 break;
2320 }
2321 if (!n)
2322 return NULL;
2323 for (;;) {
2324 struct sp_node *w = NULL;
2325 struct rb_node *prev = rb_prev(n);
2326 if (!prev)
2327 break;
2328 w = rb_entry(prev, struct sp_node, nd);
2329 if (w->end <= start)
2330 break;
2331 n = prev;
2332 }
2333 return rb_entry(n, struct sp_node, nd);
2334}
2335
/*
 * Insert a new shared policy into the list.  Caller holds sp->lock for
 * writing.
 */
2340static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2341{
2342 struct rb_node **p = &sp->root.rb_node;
2343 struct rb_node *parent = NULL;
2344 struct sp_node *nd;
2345
2346 while (*p) {
2347 parent = *p;
2348 nd = rb_entry(parent, struct sp_node, nd);
2349 if (new->start < nd->start)
2350 p = &(*p)->rb_left;
2351 else if (new->end > nd->end)
2352 p = &(*p)->rb_right;
2353 else
2354 BUG();
2355 }
2356 rb_link_node(&new->nd, parent, p);
2357 rb_insert_color(&new->nd, &sp->root);
2358 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2359 new->policy ? new->policy->mode : 0);
2360}

/* Find shared policy intersecting idx */
2363struct mempolicy *
2364mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2365{
2366 struct mempolicy *pol = NULL;
2367 struct sp_node *sn;
2368
2369 if (!sp->root.rb_node)
2370 return NULL;
2371 read_lock(&sp->lock);
2372 sn = sp_lookup(sp, idx, idx+1);
2373 if (sn) {
2374 mpol_get(sn->policy);
2375 pol = sn->policy;
2376 }
2377 read_unlock(&sp->lock);
2378 return pol;
2379}
2380
2381static void sp_free(struct sp_node *n)
2382{
2383 mpol_put(n->policy);
2384 kmem_cache_free(sn_cache, n);
2385}

/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.
 *
 * Returns:
 *	-1	- not misplaced, page is in the right node
 *	node	- node id where the page should be
 *
 * Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 */
2404int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2405{
2406 struct mempolicy *pol;
2407 struct zoneref *z;
2408 int curnid = page_to_nid(page);
2409 unsigned long pgoff;
2410 int thiscpu = raw_smp_processor_id();
2411 int thisnid = cpu_to_node(thiscpu);
2412 int polnid = NUMA_NO_NODE;
2413 int ret = -1;
2414
2415 pol = get_vma_policy(vma, addr);
2416 if (!(pol->flags & MPOL_F_MOF))
2417 goto out;
2418
2419 switch (pol->mode) {
2420 case MPOL_INTERLEAVE:
2421 pgoff = vma->vm_pgoff;
2422 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2423 polnid = offset_il_node(pol, pgoff);
2424 break;
2425
2426 case MPOL_PREFERRED:
2427 if (pol->flags & MPOL_F_LOCAL)
2428 polnid = numa_node_id();
2429 else
2430 polnid = pol->v.preferred_node;
2431 break;
2432
2433 case MPOL_BIND:

		/*
		 * allows binding to multiple nodes.
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
2441 if (node_isset(curnid, pol->v.nodes))
2442 goto out;
2443 z = first_zones_zonelist(
2444 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2445 gfp_zone(GFP_HIGHUSER),
2446 &pol->v.nodes);
2447 polnid = zone_to_nid(z->zone);
2448 break;
2449
2450 default:
2451 BUG();
2452 }
2453
2454
2455 if (pol->flags & MPOL_F_MORON) {
2456 polnid = thisnid;
2457
2458 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2459 goto out;
2460 }
2461
2462 if (curnid != polnid)
2463 ret = polnid;
2464out:
2465 mpol_cond_put(pol);
2466
2467 return ret;
2468}
2469

/*
 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
 * dropped after task->mempolicy is set to NULL so that any allocation done as
 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
 * mempolicy.
 */
2476void mpol_put_task_policy(struct task_struct *task)
2477{
2478 struct mempolicy *pol;
2479
2480 task_lock(task);
2481 pol = task->mempolicy;
2482 task->mempolicy = NULL;
2483 task_unlock(task);
2484 mpol_put(pol);
2485}
2486
2487static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2488{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2490 rb_erase(&n->nd, &sp->root);
2491 sp_free(n);
2492}
2493
2494static void sp_node_init(struct sp_node *node, unsigned long start,
2495 unsigned long end, struct mempolicy *pol)
2496{
2497 node->start = start;
2498 node->end = end;
2499 node->policy = pol;
2500}
2501
2502static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2503 struct mempolicy *pol)
2504{
2505 struct sp_node *n;
2506 struct mempolicy *newpol;
2507
2508 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2509 if (!n)
2510 return NULL;
2511
2512 newpol = mpol_dup(pol);
2513 if (IS_ERR(newpol)) {
2514 kmem_cache_free(sn_cache, n);
2515 return NULL;
2516 }
2517 newpol->flags |= MPOL_F_SHARED;
2518 sp_node_init(n, start, end, newpol);
2519
2520 return n;
2521}
2522
/* Replace a policy range. */
2524static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2525 unsigned long end, struct sp_node *new)
2526{
2527 struct sp_node *n;
2528 struct sp_node *n_new = NULL;
2529 struct mempolicy *mpol_new = NULL;
2530 int ret = 0;
2531
2532restart:
2533 write_lock(&sp->lock);
2534 n = sp_lookup(sp, start, end);
2535
2536 while (n && n->start < end) {
2537 struct rb_node *next = rb_next(&n->nd);
2538 if (n->start >= start) {
2539 if (n->end <= end)
2540 sp_delete(sp, n);
2541 else
2542 n->start = end;
2543 } else {
2544
2545 if (n->end > end) {
2546 if (!n_new)
2547 goto alloc_new;
2548
2549 *mpol_new = *n->policy;
2550 atomic_set(&mpol_new->refcnt, 1);
2551 sp_node_init(n_new, end, n->end, mpol_new);
2552 n->end = start;
2553 sp_insert(sp, n_new);
2554 n_new = NULL;
2555 mpol_new = NULL;
2556 break;
2557 } else
2558 n->end = start;
2559 }
2560 if (!next)
2561 break;
2562 n = rb_entry(next, struct sp_node, nd);
2563 }
2564 if (new)
2565 sp_insert(sp, new);
2566 write_unlock(&sp->lock);
2567 ret = 0;
2568
2569err_out:
2570 if (mpol_new)
2571 mpol_put(mpol_new);
2572 if (n_new)
2573 kmem_cache_free(sn_cache, n_new);
2574
2575 return ret;
2576
2577alloc_new:
2578 write_unlock(&sp->lock);
2579 ret = -ENOMEM;
2580 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2581 if (!n_new)
2582 goto err_out;
2583 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2584 if (!mpol_new)
2585 goto err_out;
2586 goto restart;
2587}
2588

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 */
2599void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2600{
2601 int ret;
2602
2603 sp->root = RB_ROOT;
2604 rwlock_init(&sp->lock);
2605
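	/* A NULL mpol means the caller wants the default policy; nothing to install */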
2606 if (mpol) {
2607 struct vm_area_struct pvma;
2608 struct mempolicy *new;
2609 NODEMASK_SCRATCH(scratch);
2610
2611 if (!scratch)
2612 goto put_mpol;
2613
2614 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2615 if (IS_ERR(new))
2616 goto free_scratch;
2617
2618 task_lock(current);
2619 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2620 task_unlock(current);
2621 if (ret)
2622 goto put_new;
2623
2624
2625 vma_init(&pvma, NULL);
2626 pvma.vm_end = TASK_SIZE;
2627 mpol_set_shared_policy(sp, &pvma, new);
2628
2629put_new:
2630 mpol_put(new);
2631free_scratch:
2632 NODEMASK_SCRATCH_FREE(scratch);
2633put_mpol:
2634 mpol_put(mpol);
2635 }
2636}
2637
2638int mpol_set_shared_policy(struct shared_policy *info,
2639 struct vm_area_struct *vma, struct mempolicy *npol)
2640{
2641 int err;
2642 struct sp_node *new = NULL;
2643 unsigned long sz = vma_pages(vma);
2644
2645 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2646 vma->vm_pgoff,
2647 sz, npol ? npol->mode : -1,
2648 npol ? npol->flags : -1,
2649 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2650
2651 if (npol) {
2652 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2653 if (!new)
2654 return -ENOMEM;
2655 }
2656 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2657 if (err && new)
2658 sp_free(new);
2659 return err;
2660}
2661
/* Free a backing policy store on inode delete. */
2663void mpol_free_shared_policy(struct shared_policy *p)
2664{
2665 struct sp_node *n;
2666 struct rb_node *next;
2667
2668 if (!p->root.rb_node)
2669 return;
2670 write_lock(&p->lock);
2671 next = rb_first(&p->root);
2672 while (next) {
2673 n = rb_entry(next, struct sp_node, nd);
2674 next = rb_next(&n->nd);
2675 sp_delete(p, n);
2676 }
2677 write_unlock(&p->lock);
2678}
2679
2680#ifdef CONFIG_NUMA_BALANCING
2681static int __initdata numabalancing_override;
2682
2683static void __init check_numabalancing_enable(void)
2684{
2685 bool numabalancing_default = false;
2686
2687 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2688 numabalancing_default = true;
2689
2690
2691 if (numabalancing_override)
2692 set_numabalancing_state(numabalancing_override == 1);
2693
2694 if (num_online_nodes() > 1 && !numabalancing_override) {
2695 pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
2696 numabalancing_default ? "Enabling" : "Disabling");
2697 set_numabalancing_state(numabalancing_default);
2698 }
2699}
2700
2701static int __init setup_numabalancing(char *str)
2702{
2703 int ret = 0;
2704 if (!str)
2705 goto out;
2706
2707 if (!strcmp(str, "enable")) {
2708 numabalancing_override = 1;
2709 ret = 1;
2710 } else if (!strcmp(str, "disable")) {
2711 numabalancing_override = -1;
2712 ret = 1;
2713 }
2714out:
2715 if (!ret)
2716 pr_warn("Unable to parse numa_balancing=\n");
2717
2718 return ret;
2719}
2720__setup("numa_balancing=", setup_numabalancing);
2721#else
2722static inline void __init check_numabalancing_enable(void)
2723{
2724}
2725#endif
2726
2727
2728void __init numa_policy_init(void)
2729{
2730 nodemask_t interleave_nodes;
2731 unsigned long largest = 0;
2732 int nid, prefer = 0;
2733
2734 policy_cache = kmem_cache_create("numa_policy",
2735 sizeof(struct mempolicy),
2736 0, SLAB_PANIC, NULL);
2737
2738 sn_cache = kmem_cache_create("shared_policy_node",
2739 sizeof(struct sp_node),
2740 0, SLAB_PANIC, NULL);
2741
2742 for_each_node(nid) {
2743 preferred_node_policy[nid] = (struct mempolicy) {
2744 .refcnt = ATOMIC_INIT(1),
2745 .mode = MPOL_PREFERRED,
2746 .flags = MPOL_F_MOF | MPOL_F_MORON,
2747 .v = { .preferred_node = nid, },
2748 };
2749 }
2750
	/*
	 * Set interleaving policy on system init.  Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
2756 nodes_clear(interleave_nodes);
2757 for_each_node_state(nid, N_MEMORY) {
2758 unsigned long total_pages = node_present_pages(nid);
2759
2760
2761 if (largest < total_pages) {
2762 largest = total_pages;
2763 prefer = nid;
2764 }
2765
2766
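		/* Interleave only across nodes with at least 16MB of memory */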
2767 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2768 node_set(nid, interleave_nodes);
2769 }
2770
2771
2772 if (unlikely(nodes_empty(interleave_nodes)))
2773 node_set(prefer, interleave_nodes);
2774
2775 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2776 pr_err("%s: interleaving failed\n", __func__);
2777
2778 check_numabalancing_enable();
2779}
2780
2781
2782void numa_default_policy(void)
2783{
2784 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2785}
2786

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
 */
2794static const char * const policy_modes[] =
2795{
2796 [MPOL_DEFAULT] = "default",
2797 [MPOL_PREFERRED] = "prefer",
2798 [MPOL_BIND] = "bind",
2799 [MPOL_INTERLEAVE] = "interleave",
2800 [MPOL_LOCAL] = "local",
2801};
2802
2803
2804#ifdef CONFIG_TMPFS

/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * On success, returns 0, else 1
 */
2815int mpol_parse_str(char *str, struct mempolicy **mpol)
2816{
2817 struct mempolicy *new = NULL;
2818 unsigned short mode_flags;
2819 nodemask_t nodes;
2820 char *nodelist = strchr(str, ':');
2821 char *flags = strchr(str, '=');
2822 int err = 1, mode;
2823
2824 if (nodelist) {
2825
2826 *nodelist++ = '\0';
2827 if (nodelist_parse(nodelist, nodes))
2828 goto out;
2829 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2830 goto out;
2831 } else
2832 nodes_clear(nodes);
2833
2834 if (flags)
2835 *flags++ = '\0';
2836
2837 mode = match_string(policy_modes, MPOL_MAX, str);
2838 if (mode < 0)
2839 goto out;
2840
2841 switch (mode) {
2842 case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, if given at all
		 */
2846 if (nodelist) {
2847 char *rest = nodelist;
2848 while (isdigit(*rest))
2849 rest++;
2850 if (*rest)
2851 goto out;
2852 }
2853 break;
2854 case MPOL_INTERLEAVE:
2855
2856
2857
2858 if (!nodelist)
2859 nodes = node_states[N_MEMORY];
2860 break;
2861 case MPOL_LOCAL:
2862
2863
2864
2865 if (nodelist)
2866 goto out;
2867 mode = MPOL_PREFERRED;
2868 break;
2869 case MPOL_DEFAULT:
2870
2871
2872
2873 if (!nodelist)
2874 err = 0;
2875 goto out;
2876 case MPOL_BIND:
2877
2878
2879
2880 if (!nodelist)
2881 goto out;
2882 }
2883
2884 mode_flags = 0;
2885 if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
2890 if (!strcmp(flags, "static"))
2891 mode_flags |= MPOL_F_STATIC_NODES;
2892 else if (!strcmp(flags, "relative"))
2893 mode_flags |= MPOL_F_RELATIVE_NODES;
2894 else
2895 goto out;
2896 }
2897
2898 new = mpol_new(mode, mode_flags, &nodes);
2899 if (IS_ERR(new))
2900 goto out;
2901
2902
2903
2904
2905
2906 if (mode != MPOL_PREFERRED)
2907 new->v.nodes = nodes;
2908 else if (nodelist)
2909 new->v.preferred_node = first_node(nodes);
2910 else
2911 new->flags |= MPOL_F_LOCAL;
2912
2913
2914
2915
2916
2917 new->w.user_nodemask = nodes;
2918
2919 err = 0;
2920
2921out:
2922
2923 if (nodelist)
2924 *--nodelist = ':';
2925 if (flags)
2926 *--flags = '=';
2927 if (!err)
2928 *mpol = new;
2929 return err;
2930}
2931#endif
2932

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 32 for the longest mode name "interleave".
 */
2943void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2944{
2945 char *p = buffer;
2946 nodemask_t nodes = NODE_MASK_NONE;
2947 unsigned short mode = MPOL_DEFAULT;
2948 unsigned short flags = 0;
2949
2950 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2951 mode = pol->mode;
2952 flags = pol->flags;
2953 }
2954
2955 switch (mode) {
2956 case MPOL_DEFAULT:
2957 break;
2958 case MPOL_PREFERRED:
2959 if (flags & MPOL_F_LOCAL)
2960 mode = MPOL_LOCAL;
2961 else
2962 node_set(pol->v.preferred_node, nodes);
2963 break;
2964 case MPOL_BIND:
2965 case MPOL_INTERLEAVE:
2966 nodes = pol->v.nodes;
2967 break;
2968 default:
2969 WARN_ON_ONCE(1);
2970 snprintf(p, maxlen, "unknown");
2971 return;
2972 }
2973
2974 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2975
2976 if (flags & MPOL_MODE_FLAGS) {
2977 p += snprintf(p, buffer + maxlen - p, "=");
2978
2979
2980
2981
2982 if (flags & MPOL_F_STATIC_NODES)
2983 p += snprintf(p, buffer + maxlen - p, "static");
2984 else if (flags & MPOL_F_RELATIVE_NODES)
2985 p += snprintf(p, buffer + maxlen - p, "relative");
2986 }
2987
2988 if (!nodes_empty(nodes))
2989 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2990 nodemask_pr_args(&nodes));
2991}
2992