/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

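/*
 * Illustrative note (not from the original source): these wrappers let
 * NUMA-only expressions appear unconditionally at call sites, e.g.
 *
 *	DO_NUMA(rmap_item->nid = nid);		// assignment only if CONFIG_NUMA
 *	root = root_stable_tree + NUMA(nid);	// index is 0 when !CONFIG_NUMA
 *
 * so the rest of the file needs no #ifdefs at each use.
 */
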
/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The stable tree node includes information required for reverse
 * mapping from a KSM page to virtual addresses that map this page.
 *
 * In order to avoid large latencies of the rmap walks on KSM pages,
 * KSM maintains two types of nodes in the stable tree:
 *
 * * the regular nodes that keep the reverse mapping structures in a
 *   linked list
 * * the "chains" that link nodes ("dups") that represent the same
 *   write protected memory content, but each "dup" corresponds to a
 *   different KSM page copy of that content
 *
 * Internally, the regular nodes, "dups" and "chains" are represented
 * using the same struct stable_node structure.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */
#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
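
/*
 * Worked example of the encoding (illustrative, not from the original
 * source): addresses are page aligned, so the low PAGE_SHIFT bits of
 * rmap_item->address are free to hold state.  An rmap_item for address
 * 0x7f1200034000 inserted into the unstable tree during full scan number
 * 5 stores 0x7f1200034105 (address | UNSTABLE_FLAG | seqnr); masking
 * with PAGE_MASK or ~KSM_FLAG_MASK recovers the virtual address.
 */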

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
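
/*
 * For illustration (not in the original source): the macro above expands
 * KSM_KMEM_CACHE(rmap_item, 0) to
 *
 *	kmem_cache_create("ksm_rmap_item", sizeof(struct rmap_item),
 *			  __alignof__(struct rmap_item), 0, NULL);
 *
 * i.e. one correctly sized and aligned slab cache per KSM structure.
 */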

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

static inline void stable_node_chain_add_dup(struct stable_node *dup,
					     struct stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is under
	 * pressure, which may lead to hung task warnings.  Adding __GFP_HIGH
	 * grants access to memory reserves, helping a hung task to survive.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit(): which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary loops.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mm,addr under 4GB at front of vma; but let's not worry about that.
 *
 * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.  We also do not want to enforce
 * protection keys here anyway.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	vm_fault_t ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the original page is no longer here.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM is possible if the mm is in a limited mem_cgroup;
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 * We might then have more kernel pages allocated than we're counting
	 * as nodes in the stable tree; but ksm_do_scan will retry break_cow
	 * on each pass, so should recover the page in due course.  The
	 * important thing is to not let VM_MERGEABLE be cleared while any
	 * such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
						   struct rb_root *root)
{
	struct stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = NUMA_NO_NODE; /* debug */
#endif
		ksm_stable_node_chains++;

		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree and at the same time remove the old
		 * stable node.
		 */
		rb_replace_node(&dup->node, &chain->node, root);

		/*
		 * Move the old stable node to the second dimension
		 * queued in the hlist_dup. The invariant is that all
		 * dup stable_nodes in the chain->hlist point to pages
		 * that are write protected and have the exact same
		 * content.
		 */
		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}

static inline void free_stable_node_chain(struct stable_node *chain,
					  struct rb_root *root)
{
	rb_erase(&chain->node, root);
	free_stable_node(chain);
	ksm_stable_node_chains--;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	/* check it's not STABLE_NODE_CHAIN or negative */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	/*
	 * We need the second aligned pointer of the migrate_nodes
	 * list_head to stay clear from the rb_parent_color union
	 * (aligned and different than any node) and also different
	 * from &migrate_nodes. This will verify that future list.h changes
	 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
	 */
#if defined(GCC_VERSION) && GCC_VERSION >= 40903
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}

enum get_ksm_page_flags {
	GET_KSM_PAGE_NOLOCK,
	GET_KSM_PAGE_LOCK,
	GET_KSM_PAGE_TRYLOCK
};

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unpleasant a wait.  Therefore ksm
 * prunes the stale stable_node: but no reference is taken on the page,
 * so this function must take its own reference and recheck carefully
 * that the page still belongs to this stable_node before trusting it.
 */
static struct page *get_ksm_page(struct stable_node *stable_node,
				 enum get_ksm_page_flags flags)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_ref_freeze().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * the same is in reuse_ksm_page() case; but if page is swapcache
	 * in migrate_page_move_mapping(), it might still be our page,
	 * in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * in the ref_freeze section of __remove_mapping(); but Anon
		 * page->mapping reset to NULL later, in free_pages_prepare().
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (flags == GET_KSM_PAGE_TRYLOCK) {
		if (!trylock_page(page)) {
			put_page(page);
			return ERR_PTR(-EBUSY);
		}
	} else if (flags == GET_KSM_PAGE_LOCK)
		lock_page(page);

	if (flags != GET_KSM_PAGE_NOLOCK) {
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
	return PageKsm(page) ? page_rmapping(page) : NULL;
}

static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * This should not happen: but if it does, just refuse to let
		 * merge_across_nodes be switched - there is no need to panic.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page to be recognized
		 * right up until it is freed; but the node is safe to remove.
		 * This page might be in a pagevec waiting to be freed,
		 * or it might be PageSwapCache (perhaps under writeback),
		 * or it might have been removed from swapcache a moment ago.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

static int remove_stable_node_chain(struct stable_node *stable_node,
				    struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
		up_read(&mm->mmap_sem);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = xxhash(addr, PAGE_SIZE, 0);
	kunmap_atomic(addr);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}
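
/*
 * Note for readers (an illustrative summary, not from the original source):
 * the checksum is never used to decide that two pages are equal - only
 * memcmp_pages() does that.  calc_checksum() exists to detect pages whose
 * contents are still changing between scans: cmp_and_merge_page() below
 * only considers the unstable tree once a page has hashed to the same
 * value on two successive full scans.
 */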

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
	};
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, mm, pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
						mm_tlb_flush_pending(mm)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racey and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only not changing it to point to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);

		if (pte_protnone(entry))
			entry = pte_mkclean(pte_clear_savedwrite(entry));
		else
			entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	*orig_pte = *pvmw.pte;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/*
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, false);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
					       vma->vm_page_prot));
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_sem */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

static __always_inline
bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
{
	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
	/*
	 * Check that at least one mapping still exists, otherwise
	 * there's no much point to merge and share with this
	 * stable_node, as the underlying tree_page of the other
	 * sharer is going to be freed soon.
	 */
	return stable_node->rmap_hlist_len &&
		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

static __always_inline
bool is_page_sharing_candidate(struct stable_node *stable_node)
{
	return __is_page_sharing_candidate(stable_node, 0);
}
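
/*
 * Worked example (illustrative, not from the original source): with the
 * default ksm_max_page_sharing of 256, a dup whose rmap_hlist_len is 255
 * still passes __is_page_sharing_candidate(dup, 0) but fails with offset
 * 1: the offset is the number of additional rmap_items the caller intends
 * to hang off this dup beyond the merge already underway.
 */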

static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
				    struct stable_node **_stable_node,
				    struct rb_root *root,
				    bool prune_stale_stable_nodes)
{
	struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
	struct hlist_node *hlist_safe;
	struct page *_tree_page, *tree_page = NULL;
	int nr = 0;
	int found_rmap_hlist_len;

	if (!prune_stale_stable_nodes ||
	    time_before(jiffies, stable_node->chain_prune_time +
			msecs_to_jiffies(
				ksm_stable_node_chains_prune_millisecs)))
		prune_stale_stable_nodes = false;
	else
		stable_node->chain_prune_time = jiffies;

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		cond_resched();
		/*
		 * We must walk all stable_node_dup to prune the stale
		 * stable nodes during lookup.
		 *
		 * get_ksm_page can drop the nodes from the
		 * stable_node->hlist if they point to freed pages
		 * (that's why we do a _safe walk). The "dup"
		 * stable_node parameter itself will be freed from
		 * under us if it returns NULL.
		 */
		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
		if (!_tree_page)
			continue;
		nr += 1;
		if (is_page_sharing_candidate(dup)) {
			if (!found ||
			    dup->rmap_hlist_len > found_rmap_hlist_len) {
				if (found)
					put_page(tree_page);
				found = dup;
				found_rmap_hlist_len = found->rmap_hlist_len;
				tree_page = _tree_page;

				/* skip put_page for found dup */
				if (!prune_stale_stable_nodes)
					break;
				continue;
			}
		}
		put_page(_tree_page);
	}

	if (found) {
		/*
		 * nr is counting all dups in the chain only if
		 * prune_stale_stable_nodes is true, otherwise we may
		 * break the loop at nr == 1 even if there are
		 * multiple entries.
		 */
		if (prune_stale_stable_nodes && nr == 1) {
			/*
			 * If there's not just one entry it would
			 * corrupt memory, better BUG_ON. In KSM
			 * context with no lock held it's not even
			 * fatal.
			 */
			BUG_ON(stable_node->hlist.first->next);

			/*
			 * There's just one entry and it is below the
			 * deduplication limit so drop the chain.
			 */
			rb_replace_node(&stable_node->node, &found->node,
					root);
			free_stable_node(stable_node);
			ksm_stable_node_chains--;
			ksm_stable_node_dups--;
			/*
			 * NOTE: the caller depends on the stable_node
			 * to be equal to stable_node_dup if the chain
			 * was collapsed.
			 */
			*_stable_node = found;
			/*
			 * Just for robustness: stable_node is
			 * otherwise left as a stale pointer after the
			 * chain collapse, so poison it here.
			 */
			stable_node = NULL;
		} else if (stable_node->hlist.first != &found->hlist_dup &&
			   __is_page_sharing_candidate(found, 1)) {
			/*
			 * If the found stable_node dup can accept one
			 * more future merge (in addition to the one
			 * that is underway) and is not at the head of
			 * the chain, put it there so next search will
			 * be quicker in the !prune_stale_stable_nodes
			 * case.
			 *
			 * NOTE: it would be inaccurate to use nr > 1
			 * instead of checking the hlist.first pointer
			 * directly, because in the
			 * prune_stale_stable_nodes case "nr" isn't
			 * the position of the found dup in the chain,
			 * but the total number of dups in the chain.
			 */
			hlist_del(&found->hlist_dup);
			hlist_add_head(&found->hlist_dup,
				       &stable_node->hlist);
		}
	}

	*_stable_node_dup = found;
	return tree_page;
}

static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
					       struct rb_root *root)
{
	if (!is_stable_node_chain(stable_node))
		return stable_node;
	if (hlist_empty(&stable_node->hlist)) {
		free_stable_node_chain(stable_node, root);
		return NULL;
	}
	return hlist_entry(stable_node->hlist.first,
			   typeof(*stable_node), hlist_dup);
}

/*
 * Like for get_ksm_page, this function can free the *_stable_node and
 * *_stable_node_dup if the returned tree_page is NULL.
 *
 * It can also free and overwrite *_stable_node with the found
 * stable_node_dup if the chain is collapsed (in which case
 * *_stable_node will be equal to *_stable_node_dup like if the chain
 * never existed). It's up to the caller to verify tree_page is not
 * NULL before dereferencing *_stable_node or *_stable_node_dup.
 *
 * *_stable_node_dup is really a second output parameter of this
 * function and will be overwritten in all cases, the caller doesn't
 * need to initialize it.
 */
static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
					struct stable_node **_stable_node,
					struct rb_root *root,
					bool prune_stale_stable_nodes)
{
	struct stable_node *stable_node = *_stable_node;
	if (!is_stable_node_chain(stable_node)) {
		if (is_page_sharing_candidate(stable_node)) {
			*_stable_node_dup = stable_node;
			return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
		}
		/*
		 * _stable_node_dup set to NULL means the stable_node
		 * reached the ksm_max_page_sharing limit.
		 */
		*_stable_node_dup = NULL;
		return NULL;
	}
	return stable_node_dup(_stable_node_dup, _stable_node, root,
			       prune_stale_stable_nodes);
}

static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
						struct stable_node **s_n,
						struct rb_root *root)
{
	return __stable_node_chain(s_n_d, s_n, root, true);
}

static __always_inline struct page *chain(struct stable_node **s_n_d,
					  struct stable_node *s_n,
					  struct rb_root *root)
{
	struct stable_node *old_stable_node = s_n;
	struct page *tree_page;

	tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
	/* not pruning dups so s_n cannot have changed */
	VM_BUG_ON(s_n != old_stable_node);
	return tree_page;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
static struct page *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
	struct stable_node *page_node;

	page_node = page_stable_node(page);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked */
		get_page(page);
		return page;
	}

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_stable_tree + nid;
again:
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		stable_node_any = NULL;
		tree_page = chain_prune(&stable_node_dup, &stable_node, root);
		/*
		 * NOTE: stable_node may have been freed by
		 * chain_prune() if the returned stable_node_dup is
		 * not NULL. stable_node_dup may have been inserted in
		 * the rbtree instead of the stable_node, so stable_node
		 * must not be dereferenced again unless tree_page is
		 * not NULL.
		 */
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the tree walk
			 * continue. All the stable_node dups in a
			 * given stable_node chain have the same
			 * content and they're write protected from
			 * each other.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			if (page_node) {
				VM_BUG_ON(page_node->head != &migrate_nodes);
				/*
				 * Test if the migrated page should be merged
				 * into a stable node dup. If the mapcount is
				 * 1 we can migrate it with another KSM page
				 * without adding it to the chain.
				 */
				if (page_mapcount(page) > 1)
					goto chain_append;
			}

			if (!stable_node_dup) {
				/*
				 * All the stable_node dups of this chain
				 * are at the ksm_max_page_sharing limit:
				 * behave as if the page were not in the
				 * stable tree, so the caller can grow a
				 * fresh dup through the unstable tree.
				 */
				return NULL;
			}

			/*
			 * Lock and unlock the stable_node's page (which
			 * might already have been migrated) so that page
			 * migration is sure to notice its raised count.
			 * It would be more elegant to return stable_node
			 * than kpage, but that involves more changes.
			 */
			tree_page = get_ksm_page(stable_node_dup,
						 GET_KSM_PAGE_TRYLOCK);

			if (PTR_ERR(tree_page) == -EBUSY)
				return ERR_PTR(-EBUSY);

			if (unlikely(!tree_page))
				/*
				 * The tree may have been rebalanced,
				 * so re-evaluate parent and new.
				 */
				goto again;
			unlock_page(tree_page);

			if (get_kpfn_nid(stable_node_dup->kpfn) !=
			    NUMA(stable_node_dup->nid)) {
				put_page(tree_page);
				goto replace;
			}
			return tree_page;
		}
	}

	if (!page_node)
		return NULL;

	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
out:
	if (is_page_sharing_candidate(page_node)) {
		get_page(page);
		return page;
	} else
		return NULL;

replace:
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* there is no chain */
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			rb_replace_node(&stable_node_dup->node,
					&page_node->node,
					root);
			if (is_page_sharing_candidate(page_node))
				get_page(page);
			else
				page = NULL;
		} else {
			rb_erase(&stable_node_dup->node, root);
			page = NULL;
		}
	} else {
		VM_BUG_ON(!is_stable_node_chain(stable_node));
		__stable_node_dup_del(stable_node_dup);
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			stable_node_chain_add_dup(page_node, stable_node);
			if (is_page_sharing_candidate(page_node))
				get_page(page);
			else
				page = NULL;
		} else {
			page = NULL;
		}
	}
	stable_node_dup->head = &migrate_nodes;
	list_add(&stable_node_dup->list, stable_node_dup->head);
	return page;

chain_append:
	/* stable_node_dup could be null if it reached the limit */
	if (!stable_node_dup)
		stable_node_dup = stable_node_any;
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* chain is missing so create it */
		stable_node = alloc_stable_node_chain(stable_node_dup,
						      root);
		if (!stable_node)
			return NULL;
	}
	/*
	 * Add this stable_node dup that was
	 * migrated to the stable_node chain
	 * of the current nid for this page
	 * content.
	 */
	VM_BUG_ON(!is_stable_node_chain(stable_node));
	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
	VM_BUG_ON(page_node->head != &migrate_nodes);
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	stable_node_chain_add_dup(page_node, stable_node);
	goto out;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
	bool need_chain = false;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
again:
	parent = NULL;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		stable_node_any = NULL;
		tree_page = chain(&stable_node_dup, stable_node, root);
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the tree walk
			 * continue. All the stable_node dups in a
			 * given stable_node chain have the same
			 * content and they're write protected from
			 * each other.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			need_chain = true;
			break;
		}
	}

	stable_node_dup = alloc_stable_node();
	if (!stable_node_dup)
		return NULL;

	INIT_HLIST_HEAD(&stable_node_dup->hlist);
	stable_node_dup->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node_dup);
	stable_node_dup->rmap_hlist_len = 0;
	DO_NUMA(stable_node_dup->nid = nid);
	if (!need_chain) {
		rb_link_node(&stable_node_dup->node, parent, new);
		rb_insert_color(&stable_node_dup->node, root);
	} else {
		if (!is_stable_node_chain(stable_node)) {
			struct stable_node *orig = stable_node;

			/* chain is missing so create it */
			stable_node = alloc_stable_node_chain(orig, root);
			if (!stable_node) {
				free_stable_node(stable_node_dup);
				return NULL;
			}
		}
		stable_node_chain_add_dup(stable_node_dup, stable_node);
	}

	return stable_node_dup;
}

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the unstable tree node of
 * identical content if found, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
{
	struct rb_node **new;
	struct rb_root *root;
	struct rb_node *parent = NULL;
	int nid;

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_unstable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (!tree_page)
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else if (!ksm_merge_across_nodes &&
			   page_to_nid(tree_page) != nid) {
			/*
			 * If tree_page has been migrated to another NUMA node,
			 * it will be flushed out and put in the right unstable
			 * tree next time: only merge with it when across_nodes.
			 */
			put_page(tree_page);
			return NULL;
		} else {
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	DO_NUMA(rmap_item->nid = nid);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, root);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node,
			       bool max_page_sharing_bypass)
{
	/*
	 * rmap won't find this mapping if we don't insert the
	 * rmap_item in the right stable_node duplicate: page
	 * migration and reclaim depend on every mapping of a ksm
	 * page being discoverable through its stable_node's hlist.
	 */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	stable_node->rmap_hlist_len++;
	if (!max_page_sharing_bypass)
		/* possibly non fatal but unexpected overflow, only warn */
		WARN_ON_ONCE(stable_node->rmap_hlist_len >
			     ksm_max_page_sharing);

	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;
}
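
/*
 * Accounting example (illustrative, not from the original source): if
 * three rmap_items hang off one stable node, the first one appended bumps
 * ksm_pages_shared (one ksm page now exists) and the other two bump
 * ksm_pages_sharing (two additional users of that page), matching the
 * pages_shared and pages_sharing counters exported through sysfs.
 */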

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;
	bool max_page_sharing_bypass = false;

	stable_node = page_stable_node(page);
	if (stable_node) {
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
		    NUMA(stable_node->nid)) {
			stable_node_dup_del(stable_node);
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
		/*
		 * If it's a KSM fork, allow it to go over the sharing limit
		 * without warnings.
		 */
		if (!is_page_sharing_candidate(stable_node))
			max_page_sharing_bypass = true;
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		if (PTR_ERR(kpage) == -EBUSY)
			return;

		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage),
					   max_page_sharing_bypass);
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	/*
	 * Same checksum as an empty page. We attempt to merge it with the
	 * appropriate zero page if the user enabled this via sysfs.
	 */
	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
		struct vm_area_struct *vma;

		down_read(&mm->mmap_sem);
		vma = find_mergeable_vma(mm, rmap_item->address);
		if (vma) {
			err = try_to_merge_one_page(vma, page,
					ZERO_PAGE(rmap_item->address));
		} else {
			/*
			 * If the vma is out of date, we do not need to
			 * continue.
			 */
			err = 0;
		}
		up_read(&mm->mmap_sem);
		/*
		 * In case of failure, the page was not really empty, so we
		 * need to continue. Otherwise we're done.
		 */
		if (!err)
			return;
	}
	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		bool split;

		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		/*
		 * If both pages we tried to merge belong to the same compound
		 * page, then we actually ended up increasing the reference
		 * count of the same compound page twice, and split_huge_page
		 * failed.
		 * Here we set a flag if that happened, and we use it later to
		 * try split_huge_page again. Since we call put_page right
		 * afterwards, the reference count will be correct and
		 * split_huge_page should succeed.
		 */
		split = PageTransCompound(page)
			&& compound_head(page) == compound_head(tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were successfully merged: insert new
			 * node in the stable tree and add both rmap_items.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node,
						   false);
				stable_tree_append(rmap_item, stable_node,
						   false);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		} else if (split) {
			/*
			 * We are here if we tried to merge two pages and
			 * failed because they both belonged to the same
			 * compound page. We will split the page now, but no
			 * merging will take place.
			 * We do not want to add the cost of a full lock; if
			 * the page is locked, it is better to skip it and
			 * perhaps try again later.
			 */
			if (!trylock_page(page))
				return;
			split_huge_page(page);
			unlock_page(page);
		}
	}
}

static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;
	int nid;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/*
		 * A number of pages can hang around indefinitely on per-cpu
		 * pagevecs, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		/*
		 * Whereas stale stable_nodes on the stable_tree itself
		 * get pruned in the regular course of stable_tree_search(),
		 * those moved out to the migrate_nodes list can accumulate:
		 * so prune them once before each full scan.
		 */
		if (!ksm_merge_across_nodes) {
			struct stable_node *stable_node, *next;
			struct page *page;

			list_for_each_entry_safe(stable_node, next,
						 &migrate_nodes, list) {
				page = get_ksm_page(stable_node,
						    GET_KSM_PAGE_NOLOCK);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		up_read(&mm->mmap_sem);
		/*
		 * up_read(&mm->mmap_sem) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
		 */
		spin_unlock(&ksm_mmlist_lock);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *uninitialized_var(page);

	while (scan_npages-- && likely(!freezing(current))) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		cmp_and_merge_page(page, rmap_item);
		put_page(page);
	}
}

static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
	unsigned int sleep_ms;

	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
			wait_event_interruptible_timeout(ksm_iter_wait,
				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
				msecs_to_jiffies(sleep_ms));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
				 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
				 VM_HUGETLB | VM_MIXEDMAP))
			return 0;		/* just ignore the advice */

		if (vma_is_dax(vma))
			return 0;

#ifdef VM_SAO
		if (*vm_flags & VM_SAO)
			return 0;
#endif
#ifdef VM_SPARC_ADI
		if (*vm_flags & VM_SPARC_ADI)
			return 0;
#endif

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
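
/*
 * Userspace usage sketch (illustrative, not part of this file): a process
 * opts an anonymous region into KSM scanning with madvise(2):
 *
 *	char *p = mmap(NULL, len, PROT_READ|PROT_WRITE,
 *		       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
 *	madvise(p, len, MADV_MERGEABLE);
 *
 * ksmd will only actually scan it while /sys/kernel/mm/ksm/run is 1.
 */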

int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
	else
		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	mmgrab(mm);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */
	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (anon_vma->root == vma->anon_vma->root &&
		 page->index == linear_page_index(vma, address)) {
		return page;		/* still no need to copy it */
	}
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__SetPageLocked(new_page);
	}

	return new_page;
}

void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int search_new_forks = 0;

	VM_BUG_ON_PAGE(!PageKsm(page), page);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	stable_node = page_stable_node(page);
	if (!stable_node)
		return;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		cond_resched();
		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			unsigned long addr;

			cond_resched();
			vma = vmac->vma;

			/* Ignore the stable/unstable/sqnr flags */
			addr = rmap_item->address & ~KSM_FLAG_MASK;

			if (addr < vma->vm_start || addr >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
			if (rwc->done && rwc->done(page)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
}

bool reuse_ksm_page(struct page *page,
		    struct vm_area_struct *vma,
		    unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
			WARN_ON(!page_mapped(page)) ||
			WARN_ON(!PageLocked(page))) {
		dump_page(page, "reuse_ksm_page");
		return false;
	}
#endif

	if (PageSwapCache(page) || !page_stable_node(page))
		return false;
	/* Prohibit parallel get_ksm_page() */
	if (!page_ref_freeze(page, 1))
		return false;

	page_move_anon_rmap(page, vma);
	page->index = linear_page_index(vma, address);
	page_ref_unfreeze(page, 1);

	return true;
}

#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
	struct stable_node *stable_node;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);

	stable_node = page_stable_node(newpage);
	if (stable_node) {
		VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
		stable_node->kpfn = page_to_pfn(newpage);
		/*
		 * newpage->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that oldpage->mapping
		 * has gone stale (or that PageSwapCache has been cleared).
		 */
		smp_wmb();
		set_page_stable_node(oldpage, NULL);
	}
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
			    TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}

static bool stable_node_dup_remove_range(struct stable_node *stable_node,
					 unsigned long start_pfn,
					 unsigned long end_pfn)
{
	if (stable_node->kpfn >= start_pfn &&
	    stable_node->kpfn < end_pfn) {
		/*
		 * Don't get_ksm_page, page has already gone:
		 * which is why we keep kpfn instead of page*
		 */
		remove_node_from_stable_tree(stable_node);
		return true;
	}
	return false;
}

static bool stable_node_chain_remove_range(struct stable_node *stable_node,
					   unsigned long start_pfn,
					   unsigned long end_pfn,
					   struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		return stable_node_dup_remove_range(stable_node, start_pfn,
						    end_pfn);
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
	}
	if (hlist_empty(&stable_node->hlist)) {
		free_stable_node_chain(stable_node, root);
		return true;
	} else
		return false;
}

static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct stable_node *stable_node, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct stable_node, node);
			if (stable_node_chain_remove_range(stable_node,
							   start_pfn, end_pfn,
							   root_stable_tree +
							   nid))
				node = rb_first(root_stable_tree + nid);
			else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}

static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		/* fallthrough */

	case MEM_CANCEL_OFFLINE:
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
#else
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)
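
/*
 * Illustrative note (not from the original source): for a given _name,
 * KSM_ATTR wires one read-write kobj_attribute to the matching
 * _show/_store pair, so KSM_ATTR(sleep_millisecs) below expands roughly
 * to:
 *
 *	static struct kobj_attribute sleep_millisecs_attr =
 *		__ATTR(sleep_millisecs, 0644,
 *		       sleep_millisecs_show, sleep_millisecs_store);
 */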

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;
	wake_up_interruptible(&ksm_iter_wait);

	return count;
}
KSM_ATTR(sleep_millisecs);
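
/*
 * Illustrative usage from userspace (assuming sysfs mounted at /sys):
 *
 *	echo 100 > /sys/kernel/mm/ksm/sleep_millisecs
 *
 * makes ksmd pause ~100ms between scan batches; the store above also
 * wakes ksm_iter_wait so a sleeping ksmd adopts the new interval
 * promptly.
 */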

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = kstrtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);
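
/*
 * Illustrative usage (not from the original source):
 *
 *	echo 1 > /sys/kernel/mm/ksm/run	  # start ksmd merging
 *	echo 0 > /sys/kernel/mm/ksm/run	  # stop ksmd, keep merged pages
 *	echo 2 > /sys/kernel/mm/ksm/run	  # stop ksmd and unmerge everything
 */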

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* let us assume that RB_ROOT is NULL is zero */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif
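
/*
 * Illustrative usage (not from the original source): merge_across_nodes
 * may only be toggled while nothing is merged, so a typical sequence is:
 *
 *	echo 2 > /sys/kernel/mm/ksm/run			# unmerge first
 *	echo 0 > /sys/kernel/mm/ksm/merge_across_nodes	# per-node trees
 *	echo 1 > /sys/kernel/mm/ksm/run			# restart ksmd
 *
 * Otherwise the store above fails with -EBUSY.
 */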

static ssize_t use_zero_pages_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_use_zero_pages);
}

static ssize_t use_zero_pages_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return -EINVAL;

	ksm_use_zero_pages = value;

	return count;
}
KSM_ATTR(use_zero_pages);
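
/*
 * Note (added for clarity, not from the original source): when enabled,
 * candidate pages whose checksum matches zero_checksum are merged with
 * the kernel zero page rather than with a regular KSM page, e.g.:
 *
 *	echo 1 > /sys/kernel/mm/ksm/use_zero_pages
 */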

static ssize_t max_page_sharing_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_max_page_sharing);
}

static ssize_t max_page_sharing_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	int knob;

	err = kstrtoint(buf, 10, &knob);
	if (err)
		return err;
	/*
	 * When a KSM page is created it is shared by 2 mappings. This
	 * being a signed comparison, it implicitly verifies it's not
	 * negative.
	 */
	if (knob < 2)
		return -EINVAL;

	if (READ_ONCE(ksm_max_page_sharing) == knob)
		return count;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_max_page_sharing != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else
			ksm_max_page_sharing = knob;
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(max_page_sharing);
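
/*
 * Illustrative usage (not from the original source): raising the limit
 * trades higher deduplication for costlier rmap walks per stable node;
 * like merge_across_nodes it can only change from an unmerged state:
 *
 *	echo 2   > /sys/kernel/mm/ksm/run
 *	echo 512 > /sys/kernel/mm/ksm/max_page_sharing
 *	echo 1   > /sys/kernel/mm/ksm/run
 */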

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);
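
/*
 * Illustrative reading of the counters above (not from the original
 * source): pages_sharing / pages_shared approximates how many extra
 * mappings each KSM page eliminates, e.g.:
 *
 *	grep . /sys/kernel/mm/ksm/pages_shared /sys/kernel/mm/ksm/pages_sharing
 */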

static ssize_t stable_node_dups_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);

static ssize_t stable_node_chains_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);

static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
					struct kobj_attribute *attr,
					char *buf)
{
	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}

static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_stable_node_chains_prune_millisecs = msecs;

	return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	NULL,
};

static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);
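
/*
 * Illustrative only (userspace, not part of this file): a process opts
 * an anonymous mapping into merging with MADV_MERGEABLE; once ksmd is
 * running (run=1 above), identical pages in such areas become merge
 * candidates.
 *
 *	#include <sys/mman.h>
 *	#include <string.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 64 << 20;
 *		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (p == MAP_FAILED)
 *			return 1;
 *		memset(p, 0x5a, len);	// many identical pages
 *		return madvise(p, len, MADV_MERGEABLE) ? 1 : 0;
 *	}
 */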