/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.  Four policies are supported, per task and per VMA:
 *
 * default	allocate on the local node, as the kernel normally does
 * preferred	try a specific node first, fall back to other nodes
 * bind		allocate only on a specific set of nodes
 * interleave	allocate pages round-robin over a set of nodes
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>

#include "internal.h"

/* Internal MPOL_MF_xxx flags, used by the page-queueing walk below. */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/*
 * Highest zone.  An allocation for a zone below this one is not
 * subject to policy.
 */
enum zone_type policy_zone = 0;

/*
 * Run-time "local allocation" fallback, used when no other policy applies.
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1),	/* never freed */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;
	int node;

	if (pol)
		return pol;

	node = numa_node_id();
	if (node != NUMA_NO_NODE) {
		pol = &preferred_node_policy[node];
		/* preferred_node_policy is not initialised early in boot */
		if (pol->mode)
			return pol;
	}

	return &default_policy;
}

static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read side has no lock protecting task->mempolicy, the
	 * write side rebinds the policy in two steps: step 1 sets all the
	 * newly allowed nodes (so a concurrent allocation always finds at
	 * least one valid node), and step 2 removes the no-longer-allowed
	 * nodes.  With a read-side lock held the rebind is done in one
	 * pass (MPOL_REBIND_ONCE).
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
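
/*
 * Worked example (illustrative): with a relative policy nodemask of
 * {0,2} and a cpuset allowing four nodes {4,5,6,7}, nodes_fold() first
 * folds {0,2} onto a 4-bit space (still {0,2}) and nodes_onto() then
 * maps those positions onto the allowed set, yielding {4,6} - i.e. the
 * 0th and 2nd nodes *relative to* the cpuset.
 */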

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

/*
 * mpol_set_nodemask() is called after mpol_new() to set up the policy's
 * nodemask, if any.  Must be called holding the task's alloc_lock,
 * which protects the intersection with the task's cpuset against
 * concurrent rebinding.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}

/*
 * This function just creates a new policy, does some check and simple
 * initialization.  You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED with an empty nodemask means local allocation,
	 * which cannot be combined with the static/relative flags.
	 * MPOL_LOCAL is implemented as "preferred" with an empty mask;
	 * all other modes require a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}

/* Slow path of a mempolicy destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
				enum mpol_rebind_step step)
{
}

/*
 * step:
 *	MPOL_REBIND_ONCE  - do the rebind work in one pass
 *	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 *	MPOL_REBIND_STEP2 - clean up the disallowed nodes
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
				 enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/*
		 * Remap the policy's nodes onto the new set.  For STEP1,
		 * remember the remapped result so STEP2 can finish the job;
		 * for STEP2 just adopt the new cpuset mask.
		 */
		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
			nodes_remap(tmp, pol->v.nodes,
					pol->w.cpuset_mems_allowed, *nodes);
			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
		} else if (step == MPOL_REBIND_STEP2) {
			tmp = pol->w.cpuset_mems_allowed;
			pol->w.cpuset_mems_allowed = *nodes;
		} else
			BUG();
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	if (step == MPOL_REBIND_STEP1)
		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
		pol->v.nodes = tmp;
	else
		BUG();

	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node_in(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}

static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes,
				  enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes.
 *
 * With the two-step protocol (see mpol_ops.rebind above), a policy in
 * the middle of a rebind carries MPOL_F_REBINDING between STEP1 and
 * STEP2; the checks below keep the two steps properly paired.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
				enum mpol_rebind_step step)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
		return;

	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
		BUG();

	if (step == MPOL_REBIND_STEP1)
		pol->flags |= MPOL_F_REBINDING;
	else if (step == MPOL_REBIND_STEP2)
		pol->flags &= ~MPOL_F_REBINDING;
	else if (step >= MPOL_REBIND_NSTEP)
		BUG();

	mpol_ops[pol->mode].rebind(pol, newmask, step);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires a task pointer.
 *
 * Called with the task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
			enum mpol_rebind_step step)
{
	mpol_rebind_policy(tsk->mempolicy, new, step);
}

/*
 * Rebind each vma in mm to the new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem for write.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
	up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);

struct queue_pages {
	struct list_head *pagelist;
	unsigned long flags;
	nodemask_t *nmask;
	struct vm_area_struct *prev;
};

/*
 * Scan through pages, checking if they satisfy the required conditions
 * (the queue_pages nodemask and flags), and move them to the pagelist
 * if they do.
 */
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
			unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct page *page;
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int nid, ret;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge(*pmd)) {
		ptl = pmd_lock(walk->mm, pmd);
		if (pmd_trans_huge(*pmd)) {
			page = pmd_page(*pmd);
			if (is_huge_zero_page(page)) {
				spin_unlock(ptl);
				split_huge_pmd(vma, pmd, addr);
			} else {
				get_page(page);
				spin_unlock(ptl);
				lock_page(page);
				ret = split_huge_page(page);
				unlock_page(page);
				put_page(page);
				if (ret)
					return 0;
			}
		} else {
			spin_unlock(ptl);
		}
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
			continue;
		if (PageTransCompound(page)) {
			get_page(page);
			pte_unmap_unlock(pte, ptl);
			lock_page(page);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			/* Failed to split -- skip. */
			if (ret) {
				pte = pte_offset_map_lock(walk->mm, pmd,
						addr, &ptl);
				continue;
			}
			goto retry;
		}

		migrate_page_add(page, qp->pagelist, flags);
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
			       unsigned long addr, unsigned long end,
			       struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct queue_pages *qp = walk->private;
	unsigned long flags = qp->flags;
	int nid;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	nid = page_to_nid(page);
	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
		goto unlock;
	/* With MPOL_MF_MOVE, we migrate only unshared hugepages. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
		isolate_huge_page(page, qp->pagelist);
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
	return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses as inaccessible.
 * The protections are later cleared by NUMA hinting faults and, based
 * on those faults, pages may be migrated for better NUMA placement.
 * This assumes NUMA hinting faults are handled via PROT_NONE.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	struct queue_pages *qp = walk->private;
	unsigned long endvma = vma->vm_end;
	unsigned long flags = qp->flags;

	if (!vma_migratable(vma))
		return 1;

	if (endvma > end)
		endvma = end;
	if (vma->vm_start > start)
		start = vma->vm_start;

	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
		if (!vma->vm_next && vma->vm_end < end)
			return -EFAULT;
		if (qp->prev && qp->prev->vm_end < vma->vm_start)
			return -EFAULT;
	}

	qp->prev = vma;

	if (flags & MPOL_MF_LAZY) {
		/* Similar to task_numa_work, skip inaccessible VMAs */
		if (!is_vm_hugetlb_page(vma) &&
			(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
			!(vma->vm_flags & VM_MIXEDMAP))
			change_prot_numa(vma, start, endvma);
		return 1;
	}

	/* queue pages from current vma */
	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
		return 0;
	return 1;
}

/*
 * Walk through the page tables and collect pages to be migrated.
 *
 * Pages found in the given range that are (or, with MPOL_MF_INVERT,
 * are not) on the nodes in @nodes are put on @pagelist, subject to
 * the passed MPOL_MF_xxx @flags.
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		nodemask_t *nodes, unsigned long flags,
		struct list_head *pagelist)
{
	struct queue_pages qp = {
		.pagelist = pagelist,
		.flags = flags,
		.nmask = nodes,
		.prev = NULL,
	};
	struct mm_walk queue_pages_walk = {
		.hugetlb_entry = queue_pages_hugetlb,
		.pmd_entry = queue_pages_pte_range,
		.test_walk = queue_pages_test_walk,
		.mm = mm,
		.private = &qp,
	};

	return walk_page_range(start, end, &queue_pages_walk);
}
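
/*
 * Illustrative sketch (not called anywhere; values are assumptions):
 * isolating, for later migration, every page in [start, end) that is
 * NOT on nodes {0,1} would look like
 *
 *	LIST_HEAD(pagelist);
 *	nodemask_t nm;
 *
 *	nodes_clear(nm);
 *	node_set(0, nm);
 *	node_set(1, nm);
 *	queue_pages_range(mm, start, end, &nm,
 *			  MPOL_MF_MOVE | MPOL_MF_INVERT, &pagelist);
 *
 * which is essentially what do_mbind() below does.
 */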

/*
 * Apply policy to a single VMA.
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
						struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new;	/* protected by mmap_sem */
	mpol_put(old);

	return 0;
 err_out:
	mpol_put(new);
	return err;
}

/* Step 2: apply policy to a range, merging and splitting VMAs as needed. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend   = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol, vma->vm_userfaultfd_ctx);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
 replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

 out:
	return err;
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query.
 *
 * Called with task's alloc_lock held.
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}

static int lookup_node(unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	if (vma) {
		up_read(&current->mm->mmap_sem);
		vma = NULL;
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

 out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
		if (!isolate_lru_page(page)) {
			list_add_tail(&page->lru, pagelist);
			inc_node_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
	}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					node);
	else
		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
						    __GFP_THISNODE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Around the whole address space.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
					MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fall back to migrating a
	 * remaining source to its mapped destination even if that
	 * destination is itself still a source; a later iteration will
	 * drain it in turn.
	 *
	 * Example (illustrative): moving from {0,1} to {1,2} migrates
	 * node 1 -> 2 first (2 is not a source), then node 0 -> 1, so
	 * pages are never parked on a node that still has to be drained.
	 */
	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {
			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However, if the number of source nodes is not equal
			 * to the number of destination nodes we can not
			 * preserve this node relative relationship.  In that
			 * case, skip copying memory from a node that is in
			 * the destination mask.
			 */
			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;
}

/*
 * Allocate a new page for page migration, based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains
 * @start; search forward from there if not.
 */
static struct page *new_page(struct page *page, unsigned long start, int **x)
{
	struct vm_area_struct *vma;
	unsigned long uninitialized_var(address);

	vma = find_vma(current->mm, start);
	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	if (PageHuge(page)) {
		BUG_ON(!vma);
		return alloc_huge_page_noerr(vma, address, 1);
	}
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_page(struct page *page, unsigned long start, int **x)
{
	return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			goto mpol_out;
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			down_write(&mm->mmap_sem);
			task_lock(current);
			err = mpol_set_nodemask(new, nmask, scratch);
			task_unlock(current);
			if (err)
				up_write(&mm->mmap_sem);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	err = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);
	if (!err)
		err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_page, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		if (nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	} else
		putback_movable_pages(&pagelist);

	up_write(&mm->mmap_sem);
 mpol_out:
	mpol_put(new);
	return err;
}

/*
 * User space interface with variable sized bitmaps for nodemasks.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/*
	 * When the user specified more nodes than supported just check
	 * if the non supported part is all zero.
	 */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
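
/*
 * Note on maxnode (illustrative): because of the decrement above, a
 * caller passing a single unsigned long mask on a 64-bit kernel would
 * use maxnode = 65 to make all 64 bits visible; maxnode = 64 exposes
 * only bits 0-62.
 */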

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned, flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}
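
/*
 * Illustrative userspace sketch (assumed values, not part of this
 * file; needs <numaif.h> from libnuma, link with -lnuma): bind a fresh
 * anonymous mapping to node 0 and move any pages already allocated
 * elsewhere.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = 1UL << 0;			// node 0 only
 *	mbind(p, len, MPOL_BIND, &mask,
 *	      8 * sizeof(mask) + 1, MPOL_MF_MOVE);
 */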

/* Set the process memory policy */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}
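
/*
 * Illustrative userspace sketch (assumed values): interleave all
 * future allocations of the calling task across nodes 0 and 1.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, 8 * sizeof(mask) + 1);
 */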

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	const struct cred *cred = current_cred(), *tcred;
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process.  The right exists if the process has administrative
	 * capabilities, superuser privileges or the same uid as the
	 * target process.
	 */
	tcred = __task_cred(task);
	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	if (!nodes_subset(*new, node_states[N_MEMORY])) {
		err = -EINVAL;
		goto out_put;
	}

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}
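
/*
 * Illustrative userspace sketch (assumed values): move every page the
 * process identified by 'pid' has on node 0 over to node 1.
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0;
 *	unsigned long to   = 1UL << 1;
 *	migrate_pages(pid, 8 * sizeof(from) + 1, &from, &to);
 */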

/* Retrieve NUMA policy */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
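
/*
 * Illustrative userspace sketch: ask which node backs the page at
 * address p (p is assumed to point into a mapped, already-touched
 * page).
 *
 *	#include <numaif.h>
 *
 *	int node = -1;
 *	get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR);
 *	// node now holds the NUMA node id of the page at p
 */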

#ifdef CONFIG_COMPAT

COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		       compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode,
		       compat_ulong_t, addr, compat_ulong_t, flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		unsigned long copy_size;
		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
		err = copy_from_user(bm, nm, copy_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(bm, nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, bm, alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode, compat_ulong_t, flags)
{
	long err = 0;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
		nm = compat_alloc_user_space(alloc_size);
		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
	}

	if (err)
		return -EFAULT;

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct mempolicy *pol = NULL;

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			pol = vma->vm_ops->get_policy(vma, addr);
		} else if (vma->vm_policy) {
			pol = vma->vm_policy;

			/*
			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
			 * a pseudo vma whose vma->vm_ops=NULL.  Take a
			 * reference count on these policies, which will be
			 * dropped by mpol_cond_put() later.
			 */
			if (mpol_needs_cond_ref(pol))
				mpol_get(pol);
		}
	}

	return pol;
}

/*
 * get_vma_policy(@vma, @addr)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to current->mempolicy or the system default policy, as
 * necessary.  Shared policies [those marked as MPOL_F_SHARED] require
 * an extra reference count taken on lookup, which the caller must drop
 * with mpol_cond_put() when done.
 */
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct mempolicy *pol = __get_vma_policy(vma, addr);

	if (!pol)
		pol = get_task_policy(current);

	return pol;
}

bool vma_policy_mof(struct vm_area_struct *vma)
{
	struct mempolicy *pol;

	if (vma->vm_ops && vma->vm_ops->get_policy) {
		bool ret = false;

		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
		if (pol && (pol->flags & MPOL_F_MOF))
			ret = true;
		mpol_cond_put(pol);

		return ret;
	}

	pol = vma->vm_policy;
	if (!pol)
		pol = get_task_policy(current);

	return pol->flags & MPOL_F_MOF;
}

static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->v.nodes has movable memory only,
	 * we apply policy only when gfp_zone(gfp) is ZONE_MOVABLE.
	 *
	 * policy->v.nodes is intersected with node_states[N_MEMORY],
	 * so if the following test fails, it implies
	 * policy->v.nodes has movable memory only.
	 */
	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation.
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->mode == MPOL_BIND) &&
			apply_policy_zone(policy, gfp_zone(gfp)) &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}

/* Return a zonelist indicated by gfp for node representing a mempolicy */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
	int nd)
{
	switch (policy->mode) {
	case MPOL_PREFERRED:
		if (!(policy->flags & MPOL_F_LOCAL))
			nd = policy->v.preferred_node;
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and
		 * the current node isn't part of the mask, we use the
		 * zonelist for the first node in the mask instead.
		 */
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	default:
		BUG();
	}
	return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node_in(nid, policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
	return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate
 * the next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	if (in_interrupt())
		return node;

	policy = current->mempolicy;
	if (!policy || policy->flags & MPOL_F_LOCAL)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
		return policy->v.preferred_node;

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		struct zoneref *z;

		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
		z = first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes);
		return z->zone ? z->zone->node : node;
	}

	default:
		BUG();
	}
}

/*
 * Do static interleaving for a VMA with a known offset @n.  Returns
 * the n'th node of the policy's nodemask, wrapping modulo the number
 * of nodes in it.
 */
static unsigned offset_il_node(struct mempolicy *pol,
			       struct vm_area_struct *vma, unsigned long n)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int i;
	int nid;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)n % nnodes;
	nid = first_node(pol->v.nodes);
	for (i = 0; i < target; i++)
		nid = next_node(nid, pol->v.nodes);
	return nid;
}

/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}
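
/*
 * Worked example (illustrative numbers): with an interleave nodemask
 * of {0,1,2} and 4 KiB pages (shift == PAGE_SHIFT), an anonymous vma
 * starting at 0x400000 with vm_pgoff == 0 maps address 0x402000
 * (page index 2) to offset_il_node(..., 2), i.e. 2 % 3 == 2, so the
 * page comes from node 2; the next page wraps back to node 0.
 */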

#ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND', returns a pointer to the
 * mempolicy's @nodemask for filtering the zonelist.
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct zonelist *zl;

	*mpol = get_vma_policy(vma, addr);
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
				huge_page_shift(hstate_vma(vma))), gfp_flags);
	} else {
		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
		if ((*mpol)->mode == MPOL_BIND)
			*nodemask = &(*mpol)->v.nodes;
	}
	return zl;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	int nid;

	if (!(mask && current->mempolicy))
		return false;

	task_lock(current);
	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
		if (mempolicy->flags & MPOL_F_LOCAL)
			nid = numa_node_id();
		else
			nid = mempolicy->v.preferred_node;
		init_nodemask_of_node(mask, nid);
		break;

	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*mask = mempolicy->v.nodes;
		break;

	default:
		BUG();
	}
	task_unlock(current);

	return true;
}
#endif /* CONFIG_HUGETLBFS */

/*
 * mempolicy_nodemask_intersects
 *
 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate it
 * intersects with the nodes in the mask.  Otherwise, check for
 * intersection between mask and the policy nodemask.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
					const nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	bool ret = true;

	if (!mask)
		return ret;
	task_lock(tsk);
	mempolicy = tsk->mempolicy;
	if (!mempolicy)
		goto out;

	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
		/*
		 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes
		 * to allocate from; they may fall back to other nodes when
		 * oom.  Thus, it's possible for tsk to have allocated memory
		 * from other nodes.
		 */
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		ret = nodes_intersects(mempolicy->v.nodes, *mask);
		break;
	default:
		BUG();
	}
out:
	task_unlock(tsk);
	return ret;
}

/*
 * Allocate a page in interleaved policy.  Own path because it needs
 * to do special accounting.
 */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}

/**
 * alloc_pages_vma - Allocate a page for a VMA.
 * @gfp: GFP flags, e.g. %GFP_USER for a user allocation.
 * @order: Order of the GFP allocation.
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation.  Must be inside the VMA.
 * @node: Which node to prefer for allocation (modulo policy).
 * @hugepage: For hugepages try only the preferred node if possible.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem
 * of the mm_struct of the VMA to prevent it from going away.  Should be
 * used for all allocations for pages that will be mapped into user
 * space.  Returns NULL when no page can be allocated.
 */
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr, int node, bool hugepage)
{
	struct mempolicy *pol;
	struct page *page;
	unsigned int cpuset_mems_cookie;
	struct zonelist *zl;
	nodemask_t *nmask;

retry_cpuset:
	pol = get_vma_policy(vma, addr);
	cpuset_mems_cookie = read_mems_allowed_begin();

	if (pol->mode == MPOL_INTERLEAVE) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
		mpol_cond_put(pol);
		page = alloc_page_interleave(gfp, order, nid);
		goto out;
	}

	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
		int hpage_node = node;

		/*
		 * For hugepage allocation and non-interleave policy which
		 * allows the current node (or other explicitly preferred
		 * node) we only try to allocate from the current/preferred
		 * node and don't fall back to other nodes, as the cost of
		 * remote accesses would likely offset THP benefits.
		 *
		 * If the policy is interleave, or does not allow the current
		 * node in its nodemask, we allocate the standard way.
		 */
		if (pol->mode == MPOL_PREFERRED &&
						!(pol->flags & MPOL_F_LOCAL))
			hpage_node = pol->v.preferred_node;

		nmask = policy_nodemask(gfp, pol);
		if (!nmask || node_isset(hpage_node, *nmask)) {
			mpol_cond_put(pol);
			page = __alloc_pages_node(hpage_node,
						gfp | __GFP_THISNODE, order);
			goto out;
		}
	}

	nmask = policy_nodemask(gfp, pol);
	zl = policy_zonelist(gfp, pol, node);
	mpol_cond_put(pol);
	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
out:
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;
	return page;
}

/**
 * alloc_pages_current - Allocate pages.
 * @gfp: GFP flags, e.g. %GFP_KERNEL for a kernel allocation.
 * @order: Power of two of allocation size in pages.  0 is a single page.
 *
 * Allocate a page from the kernel page pool.  When not in interrupt
 * context, apply the current process' NUMA policy.  Returns NULL when
 * no page can be allocated.
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;
	struct page *page;
	unsigned int cpuset_mems_cookie;

	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
	else
		page = __alloc_pages_nodemask(gfp, order,
				policy_zonelist(gfp, pol, numa_node_id()),
				policy_nodemask(gfp, pol));

	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;

	return page;
}
EXPORT_SYMBOL(alloc_pages_current);

int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
	struct mempolicy *pol = mpol_dup(vma_policy(src));

	if (IS_ERR(pol))
		return PTR_ERR(pol);
	dst->vm_policy = pol;
	return 0;
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is duplicating, relative to the mems_allowed
 * returned by cpuset_mems_allowed().  This keeps mempolicies cpuset
 * relative after the cpuset moves.  See further the comments in
 * kernel/cpuset.c's update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that
 * changes a cpuset's mems), so we need the task_lock'ed copy below.
 */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		if (new->flags & MPOL_F_REBINDING)
			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
		else
			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
	}
	atomic_set(&new->refcnt, 1);
	return new;
}

/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return false;
	if (a->mode != b->mode)
		return false;
	if (a->flags != b->flags)
		return false;
	if (mpol_store_user_nodemask(a))
		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
			return false;

	switch (a->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return !!nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		return a->v.preferred_node == b->v.preferred_node;
	default:
		BUG();
		return false;
	}
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * lookup first element intersecting start-end.  Caller holds sp->lock
 * for read (or write).
 */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
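
/*
 * Illustrative note: the backward scan above matters because several
 * stored ranges can intersect the query.  With stored nodes [2,4) and
 * [4,8), a lookup for [3,6) may first land on [4,8); walking rb_prev()
 * until the previous node ends at or before 'start' returns [2,4),
 * the *first* intersecting range, which shared_policy_replace() below
 * relies on.
 */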

/*
 * Insert a new shared policy into the tree.  Caller holds sp->lock for
 * writing.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->mode : 0);
}

/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	read_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	read_unlock(&sp->lock);
	return pol;
}

static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 *
 * Returns -1 if the page is in a node that is valid for this policy, or
 * a suitable node ID to allocate a replacement page from.
 */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol;
	struct zoneref *z;
	int curnid = page_to_nid(page);
	unsigned long pgoff;
	int thiscpu = raw_smp_processor_id();
	int thisnid = cpu_to_node(thiscpu);
	int polnid = -1;
	int ret = -1;

	BUG_ON(!vma);

	pol = get_vma_policy(vma, addr);
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		BUG_ON(addr >= vma->vm_end);
		BUG_ON(addr < vma->vm_start);

		pgoff = vma->vm_pgoff;
		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
		polnid = offset_il_node(pol, vma, pgoff);
		break;

	case MPOL_PREFERRED:
		if (pol->flags & MPOL_F_LOCAL)
			polnid = numa_node_id();
		else
			polnid = pol->v.preferred_node;
		break;

	case MPOL_BIND:
		/*
		 * allows binding to multiple nodes.
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->v.nodes))
			goto out;
		z = first_zones_zonelist(
				node_zonelist(numa_node_id(), GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->v.nodes);
		polnid = z->zone->node;
		break;

	default:
		BUG();
	}

	/* Migrate the page towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}

/*
 * Drop the (possibly final) reference to task->mempolicy.  It needs to
 * be dropped after task->mm is freed; called from do_exit().
 */
void mpol_put_task_policy(struct task_struct *task)
{
	struct mempolicy *pol;

	task_lock(task);
	pol = task->mempolicy;
	task->mempolicy = NULL;
	task_unlock(task);
	mpol_put(pol);
}

static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}

static void sp_node_init(struct sp_node *node, unsigned long start,
			unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}

static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n;
	struct mempolicy *newpol;

	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n)
		return NULL;

	newpol = mpol_dup(pol);
	if (IS_ERR(newpol)) {
		kmem_cache_free(sn_cache, n);
		return NULL;
	}
	newpol->flags |= MPOL_F_SHARED;
	sp_node_init(n, start, end, newpol);

	return n;
}

/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	write_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	write_unlock(&sp->lock);
	ret = 0;

err_out:
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	write_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	rwlock_init(&sp->lock);

	if (mpol) {
		struct vm_area_struct pvma;
		struct mempolicy *new;
		NODEMASK_SCRATCH(scratch);

		if (!scratch)
			goto put_mpol;
		/* contextualize the tmpfs mount point mempolicy */
		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(new))
			goto free_scratch; /* no valid nodemask intersection */

		task_lock(current);
		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_new;

		/* Create pseudo-vma that contains just the policy */
		memset(&pvma, 0, sizeof(struct vm_area_struct));
		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
		mpol_put(new);			/* drop initial ref */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}

int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->mode : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		sp_free(new);
	return err;
}

/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	write_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		sp_delete(p, n);
	}
	write_unlock(&p->lock);
}

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

static void __init check_numabalancing_enable(void)
{
	bool numabalancing_default = false;

	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
		numabalancing_default = true;

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override)
		set_numabalancing_state(numabalancing_override == 1);

	if (num_online_nodes() > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
			numabalancing_default ? "Enabling" : "Disabling");
		set_numabalancing_state(numabalancing_default);
	}
}

static int __init setup_numabalancing(char *str)
{
	int ret = 0;
	if (!str)
		goto out;

	if (!strcmp(str, "enable")) {
		numabalancing_override = 1;
		ret = 1;
	} else if (!strcmp(str, "disable")) {
		numabalancing_override = -1;
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("Unable to parse numa_balancing=\n");

	return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.v = { .preferred_node = nid, },
		};
	}

	/*
	 * Set interleaving policy for system init.  Interleaving is only
	 * enabled across suitably sized nodes (>= 16MB), or fall back to
	 * the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}

/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is implemented internally by MPOL_PREFERRED with the
 * MPOL_F_LOCAL flag.
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1;

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	for (mode = 0; mode < MPOL_MAX; mode++) {
		if (!strcmp(str, policy_modes[mode])) {
			break;
		}
	}
	if (mode >= MPOL_MAX)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and mountinfo.
	 */
	if (mode != MPOL_PREFERRED)
		new->v.nodes = nodes;
	else if (nodelist)
		new->v.preferred_node = first_node(nodes);
	else
		new->flags |= MPOL_F_LOCAL;

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
#endif /* CONFIG_TMPFS */
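
/*
 * Example inputs accepted by mpol_parse_str() above (illustrative
 * tmpfs mount options):
 *
 *	mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt
 *	mount -t tmpfs -o mpol=prefer=static:1 tmpfs /mnt
 *	mount -t tmpfs -o mpol=local tmpfs /mnt
 */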

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the
 * string.  Recommend a @maxlen of at least 32 for the longest mode,
 * flag and nodelist.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
		break;
	case MPOL_PREFERRED:
		if (flags & MPOL_F_LOCAL)
			mode = MPOL_LOCAL;
		else
			node_set(pol->v.preferred_node, nodes);
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Currently, the only defined flags are mutually exclusive
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");
	}

	if (!nodes_empty(nodes))
		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
			       nodemask_pr_args(&nodes));
}
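
/*
 * Example outputs (illustrative): an interleave policy over nodes 0-3
 * formats as "interleave:0-3"; a static preferred policy on node 1 as
 * "prefer=static:1"; a local policy simply as "local".
 */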
2866