1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16#include <linux/errno.h>
17#include <linux/mm.h>
18#include <linux/fs.h>
19#include <linux/mman.h>
20#include <linux/sched.h>
21#include <linux/sched/mm.h>
22#include <linux/sched/coredump.h>
23#include <linux/rwsem.h>
24#include <linux/pagemap.h>
25#include <linux/rmap.h>
26#include <linux/spinlock.h>
27#include <linux/xxhash.h>
28#include <linux/delay.h>
29#include <linux/kthread.h>
30#include <linux/wait.h>
31#include <linux/slab.h>
32#include <linux/rbtree.h>
33#include <linux/memory.h>
34#include <linux/mmu_notifier.h>
35#include <linux/swap.h>
36#include <linux/ksm.h>
37#include <linux/hashtable.h>
38#include <linux/freezer.h>
39#include <linux/oom.h>
40#include <linux/numa.h>
41
42#include <asm/tlbflush.h>
43#include "internal.h"
44
/*
 * NUMA(x) evaluates to x when CONFIG_NUMA is set and to 0 otherwise.
 * DO_NUMA(x) runs x as a statement when CONFIG_NUMA is set and expands to
 * an empty statement otherwise.  They let the code below reference the
 * ->nid fields, which only exist under CONFIG_NUMA, without #ifdef clutter.
 */
#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/*
 * struct mm_slot - per-mm bookkeeping record
 * @link: node in mm_slots_hash (added by insert_to_mm_slots_hash())
 * @mm_list: linkage in the list whose head is ksm_mm_head.mm_list
 * @rmap_list: head of this mm's singly linked list of rmap_items
 * @mm: the mm this slot describes; also the hash lookup key
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};
126
127
128
129
130
131
132
133
134
135
/*
 * struct ksm_scan - cursor state shared by the code walking the mm_slot list
 * @mm_slot: the slot currently being processed (&ksm_mm_head when idle)
 * @address: presumably the next address to scan in that mm - TODO confirm,
 *	     it is not used in the code visible here
 * @rmap_list: presumably a link pointer into the current mm's rmap_item
 *	       list - TODO confirm against the scanner
 * @seqnr: sequence number; remove_rmap_item_from_tree() compares its low
 *	   byte with the low byte of rmap_item->address to compute an "age"
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};
142
143
144
145
146
147
148
149
150
151
152
153
154
/*
 * struct stable_node - a stable-tree node, a chain head, or a dup on a chain
 * @node: rb-tree linkage (for nodes and chains sitting in root_stable_tree)
 * @head: overlays @node; holds &migrate_nodes while the node is on the
 *	  migrate_nodes list, or STABLE_NODE_DUP_HEAD while it is a dup
 * @hlist_dup: overlays @node; linkage into a chain's @hlist when a dup
 * @list: overlays @node (and @hlist_dup); linkage into migrate_nodes
 * @hlist: for a chain, the list of its dups (via @hlist_dup); otherwise the
 *	   list of rmap_items sharing this page (via rmap_item.hlist)
 * @kpfn: page frame number of the ksm page (read in get_ksm_page())
 * @chain_prune_time: overlays @kpfn; jiffies stamp used only by chains
 * @rmap_hlist_len: number of rmap_items on @hlist, or the sentinel
 *		    STABLE_NODE_CHAIN when this node is a chain head
 * @nid: index into root_stable_tree[] (CONFIG_NUMA only)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when linked into the stable tree */
		struct {		/* when on migrate_nodes or a chain */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * A negative rmap_hlist_len can never be a real list length, so this
	 * value doubles as the "I am a chain" marker (is_stable_node_chain()).
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};
182
183
184
185
186
187
188
189
190
191
192
193
194
/*
 * struct rmap_item - tracks one virtual address of one mm
 * @rmap_list: next rmap_item in the owning mm_slot's singly linked list
 * @anon_vma: anon_vma reference held while the item is in the stable tree
 *	      (dropped with put_anon_vma() on removal)
 * @nid: overlays @anon_vma; selects root_unstable_tree[] while the item is
 *	 in the unstable tree (CONFIG_NUMA only)
 * @mm: the mm the address belongs to
 * @address: page-aligned virtual address; the bits below PAGE_MASK carry
 *	     STABLE_FLAG / UNSTABLE_FLAG and a sequence number
 * @oldchecksum: presumably the page checksum from a previous scan - TODO
 *		 confirm, it is not used in the code visible here
 * @node: rb-tree linkage while in the unstable tree
 * @head: overlays @node; the stable_node this item hangs off when stable
 * @hlist: overlays @node; linkage into that stable_node's hlist
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;			/* when unstable */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;
	unsigned int oldchecksum;
	union {
		struct rb_node node;		/* when unstable */
		struct {			/* when stable */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};
214
/*
 * Bits stored in the sub-page part of rmap_item->address (they are all
 * cleared again with "address &= PAGE_MASK" when an item leaves a tree).
 */
#define SEQNR_MASK	0x0ff	/* low byte: presumably the scan seqnr - TODO confirm */
#define UNSTABLE_FLAG	0x100	/* item is accounted in the unstable tree */
#define STABLE_FLAG	0x200	/* item is linked to a stable_node */
218
219
/*
 * Default single-entry tree arrays.  The root_* pointers are indexed by
 * node id ("root_stable_tree + nid"); they start out pointing at the
 * one-element arrays.
 */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/*
 * List of stable nodes that are currently not in any stable tree; such a
 * node is recognised by stable_node->head == &migrate_nodes.
 */
static LIST_HEAD(migrate_nodes);
/*
 * Sentinel stored in stable_node->head to mark a dup.  It is the address of
 * migrate_nodes.prev, so it can never equal &migrate_nodes itself.
 */
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
228
/* Hash table mapping an mm pointer to its mm_slot (see get_mm_slot()). */
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* Dummy slot whose mm_list is the head of the list of all real mm_slots. */
static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
/* Global scan cursor; initially parked on the list head. */
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

/* Slab caches for the three record types; created in ksm_slab_init(). */
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
242
243
/* Decremented when the last rmap_item leaves a stable node's hlist. */
static unsigned long ksm_pages_shared;

/* Decremented for every other rmap_item removed from a stable node's hlist. */
static unsigned long ksm_pages_sharing;

/* Decremented when an UNSTABLE_FLAG rmap_item is removed from its tree. */
static unsigned long ksm_pages_unshared;

/* Number of rmap_items currently allocated (alloc/free_rmap_item()). */
static unsigned long ksm_rmap_items;

/* Number of stable_node chain heads currently allocated. */
static unsigned long ksm_stable_node_chains;

/* Number of stable_node dups currently linked into chains. */
static unsigned long ksm_stable_node_dups;

/* Minimum interval, in milliseconds, between prune passes over one chain. */
static int ksm_stable_node_chains_prune_millisecs = 2000;

/* rmap_hlist_len limit above which a stable node stops taking new sharers. */
static int ksm_max_page_sharing = 256;

/* Presumably pages scanned per batch by the ksm thread - TODO confirm. */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Presumably the ksm thread's sleep between batches, in ms - TODO confirm. */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Presumably the checksum of an all-zero page - TODO confirm. */
static unsigned int zero_checksum __read_mostly;

/* Presumably enables merging with the shared zero page - TODO confirm. */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* When non-zero, get_kpfn_nid() maps every pfn to tree 0. */
static unsigned int ksm_merge_across_nodes = 1;
/* Number of valid entries in root_stable_tree[] / root_unstable_tree[]. */
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif
287
/*
 * Values for ksm_run.  NOTE(review): judging by the names these select
 * stop / merge / unmerge, with OFFLINE as an extra flag bit - confirm
 * against the code that reads ksm_run, which is not visible here.
 */
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
/* Held around updates of ksm_scan.mm_slot and mm_slot list/hash removal. */
static DEFINE_SPINLOCK(ksm_mmlist_lock);

/*
 * Create a slab cache named "ksm_<struct>" sized and aligned for
 * struct <struct>, with no constructor.
 */
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
303
304static int __init ksm_slab_init(void)
305{
306 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
307 if (!rmap_item_cache)
308 goto out;
309
310 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
311 if (!stable_node_cache)
312 goto out_free1;
313
314 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
315 if (!mm_slot_cache)
316 goto out_free2;
317
318 return 0;
319
320out_free2:
321 kmem_cache_destroy(stable_node_cache);
322out_free1:
323 kmem_cache_destroy(rmap_item_cache);
324out:
325 return -ENOMEM;
326}
327
328static void __init ksm_slab_free(void)
329{
330 kmem_cache_destroy(mm_slot_cache);
331 kmem_cache_destroy(stable_node_cache);
332 kmem_cache_destroy(rmap_item_cache);
333 mm_slot_cache = NULL;
334}
335
336static __always_inline bool is_stable_node_chain(struct stable_node *chain)
337{
338 return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
339}
340
341static __always_inline bool is_stable_node_dup(struct stable_node *dup)
342{
343 return dup->head == STABLE_NODE_DUP_HEAD;
344}
345
346static inline void stable_node_chain_add_dup(struct stable_node *dup,
347 struct stable_node *chain)
348{
349 VM_BUG_ON(is_stable_node_dup(dup));
350 dup->head = STABLE_NODE_DUP_HEAD;
351 VM_BUG_ON(!is_stable_node_chain(chain));
352 hlist_add_head(&dup->hlist_dup, &chain->hlist);
353 ksm_stable_node_dups++;
354}
355
356static inline void __stable_node_dup_del(struct stable_node *dup)
357{
358 VM_BUG_ON(!is_stable_node_dup(dup));
359 hlist_del(&dup->hlist_dup);
360 ksm_stable_node_dups--;
361}
362
363static inline void stable_node_dup_del(struct stable_node *dup)
364{
365 VM_BUG_ON(is_stable_node_chain(dup));
366 if (is_stable_node_dup(dup))
367 __stable_node_dup_del(dup);
368 else
369 rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
370#ifdef CONFIG_DEBUG_VM
371 dup->head = NULL;
372#endif
373}
374
375static inline struct rmap_item *alloc_rmap_item(void)
376{
377 struct rmap_item *rmap_item;
378
379 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
380 __GFP_NORETRY | __GFP_NOWARN);
381 if (rmap_item)
382 ksm_rmap_items++;
383 return rmap_item;
384}
385
386static inline void free_rmap_item(struct rmap_item *rmap_item)
387{
388 ksm_rmap_items--;
389 rmap_item->mm = NULL;
390 kmem_cache_free(rmap_item_cache, rmap_item);
391}
392
393static inline struct stable_node *alloc_stable_node(void)
394{
395
396
397
398
399
400 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
401}
402
403static inline void free_stable_node(struct stable_node *stable_node)
404{
405 VM_BUG_ON(stable_node->rmap_hlist_len &&
406 !is_stable_node_chain(stable_node));
407 kmem_cache_free(stable_node_cache, stable_node);
408}
409
410static inline struct mm_slot *alloc_mm_slot(void)
411{
412 if (!mm_slot_cache)
413 return NULL;
414 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
415}
416
417static inline void free_mm_slot(struct mm_slot *mm_slot)
418{
419 kmem_cache_free(mm_slot_cache, mm_slot);
420}
421
422static struct mm_slot *get_mm_slot(struct mm_struct *mm)
423{
424 struct mm_slot *slot;
425
426 hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
427 if (slot->mm == mm)
428 return slot;
429
430 return NULL;
431}
432
433static void insert_to_mm_slots_hash(struct mm_struct *mm,
434 struct mm_slot *mm_slot)
435{
436 mm_slot->mm = mm;
437 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
438}
439
440
441
442
443
444
445
446
447
448static inline bool ksm_test_exit(struct mm_struct *mm)
449{
450 return atomic_read(&mm->mm_users) == 0;
451}
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
469{
470 struct page *page;
471 vm_fault_t ret = 0;
472
473 do {
474 cond_resched();
475 page = follow_page(vma, addr,
476 FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
477 if (IS_ERR_OR_NULL(page))
478 break;
479 if (PageKsm(page))
480 ret = handle_mm_fault(vma, addr,
481 FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
482 NULL);
483 else
484 ret = VM_FAULT_WRITE;
485 put_page(page);
486 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
516}
517
518static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
519 unsigned long addr)
520{
521 struct vm_area_struct *vma;
522 if (ksm_test_exit(mm))
523 return NULL;
524 vma = vma_lookup(mm, addr);
525 if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
526 return NULL;
527 return vma;
528}
529
530static void break_cow(struct rmap_item *rmap_item)
531{
532 struct mm_struct *mm = rmap_item->mm;
533 unsigned long addr = rmap_item->address;
534 struct vm_area_struct *vma;
535
536
537
538
539
540 put_anon_vma(rmap_item->anon_vma);
541
542 mmap_read_lock(mm);
543 vma = find_mergeable_vma(mm, addr);
544 if (vma)
545 break_ksm(vma, addr);
546 mmap_read_unlock(mm);
547}
548
549static struct page *get_mergeable_page(struct rmap_item *rmap_item)
550{
551 struct mm_struct *mm = rmap_item->mm;
552 unsigned long addr = rmap_item->address;
553 struct vm_area_struct *vma;
554 struct page *page;
555
556 mmap_read_lock(mm);
557 vma = find_mergeable_vma(mm, addr);
558 if (!vma)
559 goto out;
560
561 page = follow_page(vma, addr, FOLL_GET);
562 if (IS_ERR_OR_NULL(page))
563 goto out;
564 if (PageAnon(page)) {
565 flush_anon_page(vma, page, addr);
566 flush_dcache_page(page);
567 } else {
568 put_page(page);
569out:
570 page = NULL;
571 }
572 mmap_read_unlock(mm);
573 return page;
574}
575
576
577
578
579
580
581
582static inline int get_kpfn_nid(unsigned long kpfn)
583{
584 return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
585}
586
587static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
588 struct rb_root *root)
589{
590 struct stable_node *chain = alloc_stable_node();
591 VM_BUG_ON(is_stable_node_chain(dup));
592 if (likely(chain)) {
593 INIT_HLIST_HEAD(&chain->hlist);
594 chain->chain_prune_time = jiffies;
595 chain->rmap_hlist_len = STABLE_NODE_CHAIN;
596#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
597 chain->nid = NUMA_NO_NODE;
598#endif
599 ksm_stable_node_chains++;
600
601
602
603
604
605
606 rb_replace_node(&dup->node, &chain->node, root);
607
608
609
610
611
612
613
614
615 stable_node_chain_add_dup(dup, chain);
616 }
617 return chain;
618}
619
620static inline void free_stable_node_chain(struct stable_node *chain,
621 struct rb_root *root)
622{
623 rb_erase(&chain->node, root);
624 free_stable_node(chain);
625 ksm_stable_node_chains--;
626}
627
/*
 * Detach every rmap_item from @stable_node, unlink the node from whichever
 * structure holds it (migrate_nodes list, a chain, or the stable rb-tree)
 * and free it.  Must not be called on a chain head.
 */
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	/* a negative length means a chain head, which has no rmap_items */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		/* every item but the last one on the list counts as "sharing" */
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		/* clears STABLE_FLAG; the item itself is left allocated */
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	/*
	 * Compile-time check that the dup sentinel lies strictly inside the
	 * migrate_nodes object, so it can never compare equal to
	 * &migrate_nodes in the test below.
	 */
#if defined(GCC_VERSION) && GCC_VERSION >= 40903
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}
665
/* Locking mode requested from get_ksm_page(). */
enum get_ksm_page_flags {
	GET_KSM_PAGE_NOLOCK,	/* return the page unlocked */
	GET_KSM_PAGE_LOCK,	/* lock_page() before returning */
	GET_KSM_PAGE_TRYLOCK	/* trylock_page(); ERR_PTR(-EBUSY) if contended */
};
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
/*
 * get_ksm_page - take a reference on the page backing @stable_node
 * @stable_node: node whose ->kpfn names the page
 * @flags: whether to return the page unlocked, locked, or try-locked
 *
 * The page is accepted only while page->mapping still equals @stable_node
 * tagged with PAGE_MAPPING_KSM.  That is checked before taking a
 * reference, after taking it, and once more after locking.
 *
 * Returns the page with a reference held (and locked unless
 * GET_KSM_PAGE_NOLOCK), ERR_PTR(-EBUSY) if GET_KSM_PAGE_TRYLOCK could not
 * lock it, or NULL if the node turned out to be stale.  In the NULL case
 * @stable_node has been removed and freed and must not be used again.
 */
static struct page *get_ksm_page(struct stable_node *stable_node,
				 enum get_ksm_page_flags flags)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * Take a reference only if the page is not already on its way to
	 * being freed.  A zero refcount on a page that is in the swap cache
	 * is treated as transient and retried; otherwise the node is stale.
	 */
	while (!get_page_unless_zero(page)) {
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	/* the mapping may have changed before the reference was obtained */
	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (flags == GET_KSM_PAGE_TRYLOCK) {
		if (!trylock_page(page)) {
			put_page(page);
			return ERR_PTR(-EBUSY);
		}
	} else if (flags == GET_KSM_PAGE_LOCK)
		lock_page(page);

	if (flags != GET_KSM_PAGE_NOLOCK) {
		/* re-validate now that the page lock is held */
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * If ->kpfn changed since it was sampled, the mismatch may simply
	 * mean the node now refers to a different page, so start over.
	 * The read barrier orders the page->mapping reads above before this
	 * re-read; it presumably pairs with a write barrier where kpfn is
	 * updated - TODO confirm, the writer is not visible here.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}
765
766
767
768
769
/*
 * Take @rmap_item out of whichever tree it is in, as indicated by the
 * STABLE_FLAG / UNSTABLE_FLAG bits in ->address, and clear those bits.
 * An item carrying neither flag is left untouched.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
		/*
		 * NULL means the stable node was stale:
		 * remove_node_from_stable_tree() has then already done the
		 * accounting for this item and cleared its flags.
		 */
		if (!page)
			goto out;

		/* unlink from the node's hlist while the page is locked */
		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		/* the last item to leave counts as "shared", others as "sharing" */
		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->head = NULL;
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;

		/*
		 * Only the low byte of the difference survives the cast, so
		 * this compares the sequence number kept in the low byte of
		 * ->address with the current scan seqnr.  Age 0 means the
		 * item is still linked in the unstable tree and must be
		 * erased; for age 1 no rb_erase is done (presumably the tree
		 * it was in has already been discarded - TODO confirm).
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}
816
817static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
818{
819 while (*rmap_list) {
820 struct rmap_item *rmap_item = *rmap_list;
821 *rmap_list = rmap_item->rmap_list;
822 remove_rmap_item_from_tree(rmap_item);
823 free_rmap_item(rmap_item);
824 }
825}
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840static int unmerge_ksm_pages(struct vm_area_struct *vma,
841 unsigned long start, unsigned long end)
842{
843 unsigned long addr;
844 int err = 0;
845
846 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
847 if (ksm_test_exit(vma->vm_mm))
848 break;
849 if (signal_pending(current))
850 err = -ERESTARTSYS;
851 else
852 err = break_ksm(vma, addr);
853 }
854 return err;
855}
856
857static inline struct stable_node *page_stable_node(struct page *page)
858{
859 return PageKsm(page) ? page_rmapping(page) : NULL;
860}
861
862static inline void set_page_stable_node(struct page *page,
863 struct stable_node *stable_node)
864{
865 page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
866}
867
868#ifdef CONFIG_SYSFS
869
870
871
872static int remove_stable_node(struct stable_node *stable_node)
873{
874 struct page *page;
875 int err;
876
877 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
878 if (!page) {
879
880
881
882 return 0;
883 }
884
885
886
887
888
889
890 err = -EBUSY;
891 if (!page_mapped(page)) {
892
893
894
895
896
897
898
899
900 set_page_stable_node(page, NULL);
901 remove_node_from_stable_tree(stable_node);
902 err = 0;
903 }
904
905 unlock_page(page);
906 put_page(page);
907 return err;
908}
909
910static int remove_stable_node_chain(struct stable_node *stable_node,
911 struct rb_root *root)
912{
913 struct stable_node *dup;
914 struct hlist_node *hlist_safe;
915
916 if (!is_stable_node_chain(stable_node)) {
917 VM_BUG_ON(is_stable_node_dup(stable_node));
918 if (remove_stable_node(stable_node))
919 return true;
920 else
921 return false;
922 }
923
924 hlist_for_each_entry_safe(dup, hlist_safe,
925 &stable_node->hlist, hlist_dup) {
926 VM_BUG_ON(!is_stable_node_dup(dup));
927 if (remove_stable_node(dup))
928 return true;
929 }
930 BUG_ON(!hlist_empty(&stable_node->hlist));
931 free_stable_node_chain(stable_node, root);
932 return false;
933}
934
935static int remove_all_stable_nodes(void)
936{
937 struct stable_node *stable_node, *next;
938 int nid;
939 int err = 0;
940
941 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
942 while (root_stable_tree[nid].rb_node) {
943 stable_node = rb_entry(root_stable_tree[nid].rb_node,
944 struct stable_node, node);
945 if (remove_stable_node_chain(stable_node,
946 root_stable_tree + nid)) {
947 err = -EBUSY;
948 break;
949 }
950 cond_resched();
951 }
952 }
953 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
954 if (remove_stable_node(stable_node))
955 err = -EBUSY;
956 cond_resched();
957 }
958 return err;
959}
960
/*
 * Walk every registered mm, break all KSM pages in its mergeable vmas,
 * free all of its rmap_items, and finally drop the stable nodes.
 *
 * Returns 0 on success.  If unmerging a vma fails, the walk is aborted,
 * ksm_scan.mm_slot is reset to the list head and the error is returned.
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	/* start the cursor at the first real slot */
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	/* the cursor is advanced inside the loop body, under the lock */
	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		mmap_read_lock(mm);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(&mm_slot->rmap_list);
		mmap_read_unlock(mm);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			/* the mm is exiting: unregister and free its slot */
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* the result is deliberately ignored: busy nodes are left behind */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	/* reached with mmap_read_lock(mm) still held */
	mmap_read_unlock(mm);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
1018#endif
1019
1020static u32 calc_checksum(struct page *page)
1021{
1022 u32 checksum;
1023 void *addr = kmap_atomic(page);
1024 checksum = xxhash(addr, PAGE_SIZE, 0);
1025 kunmap_atomic(addr);
1026 return checksum;
1027}
1028
/*
 * write_protect_page - make the pte mapping @page in @vma read-only and clean
 * @vma: vma in which @page is mapped
 * @page: the page to protect (must not be part of a compound page)
 * @orig_pte: on success, receives the pte value left in the page table
 *
 * Returns 0 on success.  Returns -EFAULT if the page is not mapped in
 * @vma, if it is mapped by something other than a pte, or if the page has
 * references beyond its mappings, the swap cache and the caller's own.
 */
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
	};
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	/*
	 * Only rewrite the pte if it is writable, dirty, a protnone entry
	 * with the saved-write bit set, or if a TLB flush is pending for
	 * this mm.  Otherwise it is already in the state we want.
	 */
	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
						mm_tlb_flush_pending(mm)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Clear the pte and flush the TLB before looking at the
		 * reference count, so that the check below is made while
		 * the page is unreachable through this mapping.
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Expected references: one per mapping, one held by the
		 * caller, one for the swap cache if present.  Any extra
		 * reference means someone else may be using the page, so
		 * restore the original pte and fail.
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}
		/* carry the dirty bit over to the page before cleaning the pte */
		if (pte_dirty(entry))
			set_page_dirty(page);

		if (pte_protnone(entry))
			entry = pte_mkclean(pte_clear_savedwrite(entry));
		else
			entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	*orig_pte = *pvmw.pte;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
/*
 * replace_page - point the pte that maps @page at @kpage instead
 * @vma: vma in which @page is mapped
 * @page: the page currently mapped
 * @kpage: the page to map in its place (may be the zero page)
 * @orig_pte: the pte value the caller expects to find for @page
 *
 * Returns 0 on success, or -EFAULT if @page is not mapped in @vma, no pmd
 * exists for the address, or the pte no longer equals @orig_pte.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	/* the pte changed since it was write-protected: give up */
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	/*
	 * A regular @kpage gains a reference and an anon rmap for the new
	 * mapping.  The zero page gets neither; it is mapped as a special
	 * pte instead.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, false);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
					       vma->vm_page_prot));
		/*
		 * The anon-pages counter is decremented here because the
		 * address no longer maps an anonymous page.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	/* clear and flush the old pte before installing the new one */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	/* drop the old page's rmap and the reference held by its mapping */
	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
/*
 * try_to_merge_one_page - write-protect @page and, if given, merge it into @kpage
 * @vma: vma in which @page is mapped
 * @page: candidate anonymous page
 * @kpage: KSM page to merge into, or NULL to turn @page itself into a
 *	   KSM page
 *
 * Returns 0 on success (including the trivial case @page == @kpage), and
 * -EFAULT otherwise: @page is not anonymous, could not be locked, could not
 * be split out of a huge page, could not be write-protected, or does not
 * have the same contents as @kpage.  An error from replace_page() is
 * passed through.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* already the same page */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * Only try-lock: if the page is locked by someone else this attempt
	 * is abandoned rather than waited for.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * Contents are compared only after the page has been
	 * write-protected, so they cannot change between the comparison
	 * and the pte replacement.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * No target page: mark @page itself as a KSM page
			 * with no stable_node attached yet.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/* make sure the page is flagged dirty */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	/* in a VM_LOCKED vma, move the mlock from @page over to @kpage */
	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}
1266
1267
1268
1269
1270
1271
1272
1273static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1274 struct page *page, struct page *kpage)
1275{
1276 struct mm_struct *mm = rmap_item->mm;
1277 struct vm_area_struct *vma;
1278 int err = -EFAULT;
1279
1280 mmap_read_lock(mm);
1281 vma = find_mergeable_vma(mm, rmap_item->address);
1282 if (!vma)
1283 goto out;
1284
1285 err = try_to_merge_one_page(vma, page, kpage);
1286 if (err)
1287 goto out;
1288
1289
1290 remove_rmap_item_from_tree(rmap_item);
1291
1292
1293 rmap_item->anon_vma = vma->anon_vma;
1294 get_anon_vma(vma->anon_vma);
1295out:
1296 mmap_read_unlock(mm);
1297 return err;
1298}
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
1311 struct page *page,
1312 struct rmap_item *tree_rmap_item,
1313 struct page *tree_page)
1314{
1315 int err;
1316
1317 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
1318 if (!err) {
1319 err = try_to_merge_with_ksm_page(tree_rmap_item,
1320 tree_page, page);
1321
1322
1323
1324
1325 if (err)
1326 break_cow(rmap_item);
1327 }
1328 return err ? NULL : page;
1329}
1330
1331static __always_inline
1332bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
1333{
1334 VM_BUG_ON(stable_node->rmap_hlist_len < 0);
1335
1336
1337
1338
1339
1340
1341 return stable_node->rmap_hlist_len &&
1342 stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
1343}
1344
1345static __always_inline
1346bool is_page_sharing_candidate(struct stable_node *stable_node)
1347{
1348 return __is_page_sharing_candidate(stable_node, 0);
1349}
1350
/*
 * stable_node_dup - pick a dup from a chain that can take another sharer
 * @_stable_node_dup: out: the chosen dup, or NULL if none qualifies
 * @_stable_node: in: the chain head; out: replaced by the chosen dup if the
 *		  chain was collapsed because that was its only live dup
 * @root: rb-tree the chain head is linked in
 * @prune_stale_stable_nodes: whether a prune pass over the whole chain is
 *			      allowed (it is still rate limited by
 *			      ksm_stable_node_chains_prune_millisecs)
 *
 * Returns the chosen dup's page with a reference held, or NULL.
 */
static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
				    struct stable_node **_stable_node,
				    struct rb_root *root,
				    bool prune_stale_stable_nodes)
{
	struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
	struct hlist_node *hlist_safe;
	struct page *_tree_page, *tree_page = NULL;
	int nr = 0;
	int found_rmap_hlist_len;

	/* prune only if requested and the chain's prune interval has elapsed */
	if (!prune_stale_stable_nodes ||
	    time_before(jiffies, stable_node->chain_prune_time +
			msecs_to_jiffies(
				ksm_stable_node_chains_prune_millisecs)))
		prune_stale_stable_nodes = false;
	else
		stable_node->chain_prune_time = jiffies;

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		cond_resched();
		/*
		 * get_ksm_page() removes and frees a stale dup and returns
		 * NULL, which is why the _safe iterator is used and why
		 * walking the whole chain doubles as pruning.
		 */
		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
		if (!_tree_page)
			continue;
		nr += 1;	/* count of live dups seen */
		if (is_page_sharing_candidate(dup)) {
			/* prefer the candidate with the most sharers */
			if (!found ||
			    dup->rmap_hlist_len > found_rmap_hlist_len) {
				if (found)
					put_page(tree_page);
				found = dup;
				found_rmap_hlist_len = found->rmap_hlist_len;
				tree_page = _tree_page;

				/* skip put_page for the chosen dup */
				if (!prune_stale_stable_nodes)
					break;
				continue;
			}
		}
		put_page(_tree_page);
	}

	if (found) {
		/*
		 * nr is only a complete count when the loop ran to the end,
		 * i.e. when pruning; hence the extra condition.
		 */
		if (prune_stale_stable_nodes && nr == 1) {
			/* the chain must hold exactly this one dup */
			BUG_ON(stable_node->hlist.first->next);

			/*
			 * Collapse the chain: the single dup takes the chain
			 * head's place in the rb-tree and the head is freed.
			 */
			rb_replace_node(&stable_node->node, &found->node,
					root);
			free_stable_node(stable_node);
			ksm_stable_node_chains--;
			ksm_stable_node_dups--;
			/* tell the caller the tree node is now the dup itself */
			*_stable_node = found;
			/* the chain head is freed: poison the local pointer */
			stable_node = NULL;
		} else if (stable_node->hlist.first != &found->hlist_dup &&
			   __is_page_sharing_candidate(found, 1)) {
			/*
			 * Move the chosen dup to the front of the chain, but
			 * only if it would still be a candidate after taking
			 * one more sharer (offset 1).
			 */
			hlist_del(&found->hlist_dup);
			hlist_add_head(&found->hlist_dup,
				       &stable_node->hlist);
		}
	}

	*_stable_node_dup = found;
	return tree_page;
}
1469
1470static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1471 struct rb_root *root)
1472{
1473 if (!is_stable_node_chain(stable_node))
1474 return stable_node;
1475 if (hlist_empty(&stable_node->hlist)) {
1476 free_stable_node_chain(stable_node, root);
1477 return NULL;
1478 }
1479 return hlist_entry(stable_node->hlist.first,
1480 typeof(*stable_node), hlist_dup);
1481}
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
1498 struct stable_node **_stable_node,
1499 struct rb_root *root,
1500 bool prune_stale_stable_nodes)
1501{
1502 struct stable_node *stable_node = *_stable_node;
1503 if (!is_stable_node_chain(stable_node)) {
1504 if (is_page_sharing_candidate(stable_node)) {
1505 *_stable_node_dup = stable_node;
1506 return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
1507 }
1508
1509
1510
1511
1512 *_stable_node_dup = NULL;
1513 return NULL;
1514 }
1515 return stable_node_dup(_stable_node_dup, _stable_node, root,
1516 prune_stale_stable_nodes);
1517}
1518
1519static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
1520 struct stable_node **s_n,
1521 struct rb_root *root)
1522{
1523 return __stable_node_chain(s_n_d, s_n, root, true);
1524}
1525
1526static __always_inline struct page *chain(struct stable_node **s_n_d,
1527 struct stable_node *s_n,
1528 struct rb_root *root)
1529{
1530 struct stable_node *old_stable_node = s_n;
1531 struct page *tree_page;
1532
1533 tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
1534
1535 VM_BUG_ON(s_n != old_stable_node);
1536 return tree_page;
1537}
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548static struct page *stable_tree_search(struct page *page)
1549{
1550 int nid;
1551 struct rb_root *root;
1552 struct rb_node **new;
1553 struct rb_node *parent;
1554 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1555 struct stable_node *page_node;
1556
1557 page_node = page_stable_node(page);
1558 if (page_node && page_node->head != &migrate_nodes) {
1559
1560 get_page(page);
1561 return page;
1562 }
1563
1564 nid = get_kpfn_nid(page_to_pfn(page));
1565 root = root_stable_tree + nid;
1566again:
1567 new = &root->rb_node;
1568 parent = NULL;
1569
1570 while (*new) {
1571 struct page *tree_page;
1572 int ret;
1573
1574 cond_resched();
1575 stable_node = rb_entry(*new, struct stable_node, node);
1576 stable_node_any = NULL;
1577 tree_page = chain_prune(&stable_node_dup, &stable_node, root);
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590 if (!stable_node_dup) {
1591
1592
1593
1594
1595
1596 stable_node_any = stable_node_dup_any(stable_node,
1597 root);
1598 if (!stable_node_any) {
1599
1600 goto again;
1601 }
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611 tree_page = get_ksm_page(stable_node_any,
1612 GET_KSM_PAGE_NOLOCK);
1613 }
1614 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1615 if (!tree_page) {
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625 goto again;
1626 }
1627
1628 ret = memcmp_pages(page, tree_page);
1629 put_page(tree_page);
1630
1631 parent = *new;
1632 if (ret < 0)
1633 new = &parent->rb_left;
1634 else if (ret > 0)
1635 new = &parent->rb_right;
1636 else {
1637 if (page_node) {
1638 VM_BUG_ON(page_node->head != &migrate_nodes);
1639
1640
1641
1642
1643
1644
1645 if (page_mapcount(page) > 1)
1646 goto chain_append;
1647 }
1648
1649 if (!stable_node_dup) {
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662 return NULL;
1663 }
1664
1665
1666
1667
1668
1669
1670
1671
1672 tree_page = get_ksm_page(stable_node_dup,
1673 GET_KSM_PAGE_TRYLOCK);
1674
1675 if (PTR_ERR(tree_page) == -EBUSY)
1676 return ERR_PTR(-EBUSY);
1677
1678 if (unlikely(!tree_page))
1679
1680
1681
1682
1683 goto again;
1684 unlock_page(tree_page);
1685
1686 if (get_kpfn_nid(stable_node_dup->kpfn) !=
1687 NUMA(stable_node_dup->nid)) {
1688 put_page(tree_page);
1689 goto replace;
1690 }
1691 return tree_page;
1692 }
1693 }
1694
1695 if (!page_node)
1696 return NULL;
1697
1698 list_del(&page_node->list);
1699 DO_NUMA(page_node->nid = nid);
1700 rb_link_node(&page_node->node, parent, new);
1701 rb_insert_color(&page_node->node, root);
1702out:
1703 if (is_page_sharing_candidate(page_node)) {
1704 get_page(page);
1705 return page;
1706 } else
1707 return NULL;
1708
1709replace:
1710
1711
1712
1713
1714
1715
1716
1717
1718 if (stable_node_dup == stable_node) {
1719 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1720 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1721
1722 if (page_node) {
1723 VM_BUG_ON(page_node->head != &migrate_nodes);
1724 list_del(&page_node->list);
1725 DO_NUMA(page_node->nid = nid);
1726 rb_replace_node(&stable_node_dup->node,
1727 &page_node->node,
1728 root);
1729 if (is_page_sharing_candidate(page_node))
1730 get_page(page);
1731 else
1732 page = NULL;
1733 } else {
1734 rb_erase(&stable_node_dup->node, root);
1735 page = NULL;
1736 }
1737 } else {
1738 VM_BUG_ON(!is_stable_node_chain(stable_node));
1739 __stable_node_dup_del(stable_node_dup);
1740 if (page_node) {
1741 VM_BUG_ON(page_node->head != &migrate_nodes);
1742 list_del(&page_node->list);
1743 DO_NUMA(page_node->nid = nid);
1744 stable_node_chain_add_dup(page_node, stable_node);
1745 if (is_page_sharing_candidate(page_node))
1746 get_page(page);
1747 else
1748 page = NULL;
1749 } else {
1750 page = NULL;
1751 }
1752 }
1753 stable_node_dup->head = &migrate_nodes;
1754 list_add(&stable_node_dup->list, stable_node_dup->head);
1755 return page;
1756
1757chain_append:
1758
1759 if (!stable_node_dup)
1760 stable_node_dup = stable_node_any;
1761
1762
1763
1764
1765
1766
1767
1768
1769 if (stable_node_dup == stable_node) {
1770 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1771
1772 stable_node = alloc_stable_node_chain(stable_node_dup,
1773 root);
1774 if (!stable_node)
1775 return NULL;
1776 }
1777
1778
1779
1780
1781
1782
1783 VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
1784 VM_BUG_ON(page_node->head != &migrate_nodes);
1785 list_del(&page_node->list);
1786 DO_NUMA(page_node->nid = nid);
1787 stable_node_chain_add_dup(page_node, stable_node);
1788 goto out;
1789}
1790
1791
1792
1793
1794
1795
1796
1797
1798static struct stable_node *stable_tree_insert(struct page *kpage)
1799{
1800 int nid;
1801 unsigned long kpfn;
1802 struct rb_root *root;
1803 struct rb_node **new;
1804 struct rb_node *parent;
1805 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1806 bool need_chain = false;
1807
1808 kpfn = page_to_pfn(kpage);
1809 nid = get_kpfn_nid(kpfn);
1810 root = root_stable_tree + nid;
1811again:
1812 parent = NULL;
1813 new = &root->rb_node;
1814
1815 while (*new) {
1816 struct page *tree_page;
1817 int ret;
1818
1819 cond_resched();
1820 stable_node = rb_entry(*new, struct stable_node, node);
1821 stable_node_any = NULL;
1822 tree_page = chain(&stable_node_dup, stable_node, root);
1823 if (!stable_node_dup) {
1824
1825
1826
1827
1828
1829 stable_node_any = stable_node_dup_any(stable_node,
1830 root);
1831 if (!stable_node_any) {
1832
1833 goto again;
1834 }
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844 tree_page = get_ksm_page(stable_node_any,
1845 GET_KSM_PAGE_NOLOCK);
1846 }
1847 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1848 if (!tree_page) {
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858 goto again;
1859 }
1860
1861 ret = memcmp_pages(kpage, tree_page);
1862 put_page(tree_page);
1863
1864 parent = *new;
1865 if (ret < 0)
1866 new = &parent->rb_left;
1867 else if (ret > 0)
1868 new = &parent->rb_right;
1869 else {
1870 need_chain = true;
1871 break;
1872 }
1873 }
1874
1875 stable_node_dup = alloc_stable_node();
1876 if (!stable_node_dup)
1877 return NULL;
1878
1879 INIT_HLIST_HEAD(&stable_node_dup->hlist);
1880 stable_node_dup->kpfn = kpfn;
1881 set_page_stable_node(kpage, stable_node_dup);
1882 stable_node_dup->rmap_hlist_len = 0;
1883 DO_NUMA(stable_node_dup->nid = nid);
1884 if (!need_chain) {
1885 rb_link_node(&stable_node_dup->node, parent, new);
1886 rb_insert_color(&stable_node_dup->node, root);
1887 } else {
1888 if (!is_stable_node_chain(stable_node)) {
1889 struct stable_node *orig = stable_node;
1890
1891 stable_node = alloc_stable_node_chain(orig, root);
1892 if (!stable_node) {
1893 free_stable_node(stable_node_dup);
1894 return NULL;
1895 }
1896 }
1897 stable_node_chain_add_dup(stable_node_dup, stable_node);
1898 }
1899
1900 return stable_node_dup;
1901}
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917static
1918struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1919 struct page *page,
1920 struct page **tree_pagep)
1921{
1922 struct rb_node **new;
1923 struct rb_root *root;
1924 struct rb_node *parent = NULL;
1925 int nid;
1926
1927 nid = get_kpfn_nid(page_to_pfn(page));
1928 root = root_unstable_tree + nid;
1929 new = &root->rb_node;
1930
1931 while (*new) {
1932 struct rmap_item *tree_rmap_item;
1933 struct page *tree_page;
1934 int ret;
1935
1936 cond_resched();
1937 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1938 tree_page = get_mergeable_page(tree_rmap_item);
1939 if (!tree_page)
1940 return NULL;
1941
1942
1943
1944
1945 if (page == tree_page) {
1946 put_page(tree_page);
1947 return NULL;
1948 }
1949
1950 ret = memcmp_pages(page, tree_page);
1951
1952 parent = *new;
1953 if (ret < 0) {
1954 put_page(tree_page);
1955 new = &parent->rb_left;
1956 } else if (ret > 0) {
1957 put_page(tree_page);
1958 new = &parent->rb_right;
1959 } else if (!ksm_merge_across_nodes &&
1960 page_to_nid(tree_page) != nid) {
1961
1962
1963
1964
1965
1966 put_page(tree_page);
1967 return NULL;
1968 } else {
1969 *tree_pagep = tree_page;
1970 return tree_rmap_item;
1971 }
1972 }
1973
1974 rmap_item->address |= UNSTABLE_FLAG;
1975 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1976 DO_NUMA(rmap_item->nid = nid);
1977 rb_link_node(&rmap_item->node, parent, new);
1978 rb_insert_color(&rmap_item->node, root);
1979
1980 ksm_pages_unshared++;
1981 return NULL;
1982}
1983
1984
1985
1986
1987
1988
1989static void stable_tree_append(struct rmap_item *rmap_item,
1990 struct stable_node *stable_node,
1991 bool max_page_sharing_bypass)
1992{
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003 BUG_ON(stable_node->rmap_hlist_len < 0);
2004
2005 stable_node->rmap_hlist_len++;
2006 if (!max_page_sharing_bypass)
2007
2008 WARN_ON_ONCE(stable_node->rmap_hlist_len >
2009 ksm_max_page_sharing);
2010
2011 rmap_item->head = stable_node;
2012 rmap_item->address |= STABLE_FLAG;
2013 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2014
2015 if (rmap_item->hlist.next)
2016 ksm_pages_sharing++;
2017 else
2018 ksm_pages_shared++;
2019}
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2031{
2032 struct mm_struct *mm = rmap_item->mm;
2033 struct rmap_item *tree_rmap_item;
2034 struct page *tree_page = NULL;
2035 struct stable_node *stable_node;
2036 struct page *kpage;
2037 unsigned int checksum;
2038 int err;
2039 bool max_page_sharing_bypass = false;
2040
2041 stable_node = page_stable_node(page);
2042 if (stable_node) {
2043 if (stable_node->head != &migrate_nodes &&
2044 get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2045 NUMA(stable_node->nid)) {
2046 stable_node_dup_del(stable_node);
2047 stable_node->head = &migrate_nodes;
2048 list_add(&stable_node->list, stable_node->head);
2049 }
2050 if (stable_node->head != &migrate_nodes &&
2051 rmap_item->head == stable_node)
2052 return;
2053
2054
2055
2056
2057 if (!is_page_sharing_candidate(stable_node))
2058 max_page_sharing_bypass = true;
2059 }
2060
2061
2062 kpage = stable_tree_search(page);
2063 if (kpage == page && rmap_item->head == stable_node) {
2064 put_page(kpage);
2065 return;
2066 }
2067
2068 remove_rmap_item_from_tree(rmap_item);
2069
2070 if (kpage) {
2071 if (PTR_ERR(kpage) == -EBUSY)
2072 return;
2073
2074 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2075 if (!err) {
2076
2077
2078
2079
2080 lock_page(kpage);
2081 stable_tree_append(rmap_item, page_stable_node(kpage),
2082 max_page_sharing_bypass);
2083 unlock_page(kpage);
2084 }
2085 put_page(kpage);
2086 return;
2087 }
2088
2089
2090
2091
2092
2093
2094
2095 checksum = calc_checksum(page);
2096 if (rmap_item->oldchecksum != checksum) {
2097 rmap_item->oldchecksum = checksum;
2098 return;
2099 }
2100
2101
2102
2103
2104
2105 if (ksm_use_zero_pages && (checksum == zero_checksum)) {
2106 struct vm_area_struct *vma;
2107
2108 mmap_read_lock(mm);
2109 vma = find_mergeable_vma(mm, rmap_item->address);
2110 if (vma) {
2111 err = try_to_merge_one_page(vma, page,
2112 ZERO_PAGE(rmap_item->address));
2113 } else {
2114
2115
2116
2117
2118 err = 0;
2119 }
2120 mmap_read_unlock(mm);
2121
2122
2123
2124
2125 if (!err)
2126 return;
2127 }
2128 tree_rmap_item =
2129 unstable_tree_search_insert(rmap_item, page, &tree_page);
2130 if (tree_rmap_item) {
2131 bool split;
2132
2133 kpage = try_to_merge_two_pages(rmap_item, page,
2134 tree_rmap_item, tree_page);
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145 split = PageTransCompound(page)
2146 && compound_head(page) == compound_head(tree_page);
2147 put_page(tree_page);
2148 if (kpage) {
2149
2150
2151
2152
2153 lock_page(kpage);
2154 stable_node = stable_tree_insert(kpage);
2155 if (stable_node) {
2156 stable_tree_append(tree_rmap_item, stable_node,
2157 false);
2158 stable_tree_append(rmap_item, stable_node,
2159 false);
2160 }
2161 unlock_page(kpage);
2162
2163
2164
2165
2166
2167
2168
2169 if (!stable_node) {
2170 break_cow(tree_rmap_item);
2171 break_cow(rmap_item);
2172 }
2173 } else if (split) {
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183 if (!trylock_page(page))
2184 return;
2185 split_huge_page(page);
2186 unlock_page(page);
2187 }
2188 }
2189}
2190
2191static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2192 struct rmap_item **rmap_list,
2193 unsigned long addr)
2194{
2195 struct rmap_item *rmap_item;
2196
2197 while (*rmap_list) {
2198 rmap_item = *rmap_list;
2199 if ((rmap_item->address & PAGE_MASK) == addr)
2200 return rmap_item;
2201 if (rmap_item->address > addr)
2202 break;
2203 *rmap_list = rmap_item->rmap_list;
2204 remove_rmap_item_from_tree(rmap_item);
2205 free_rmap_item(rmap_item);
2206 }
2207
2208 rmap_item = alloc_rmap_item();
2209 if (rmap_item) {
2210
2211 rmap_item->mm = mm_slot->mm;
2212 rmap_item->address = addr;
2213 rmap_item->rmap_list = *rmap_list;
2214 *rmap_list = rmap_item;
2215 }
2216 return rmap_item;
2217}
2218
2219static struct rmap_item *scan_get_next_rmap_item(struct page **page)
2220{
2221 struct mm_struct *mm;
2222 struct mm_slot *slot;
2223 struct vm_area_struct *vma;
2224 struct rmap_item *rmap_item;
2225 int nid;
2226
2227 if (list_empty(&ksm_mm_head.mm_list))
2228 return NULL;
2229
2230 slot = ksm_scan.mm_slot;
2231 if (slot == &ksm_mm_head) {
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242 lru_add_drain_all();
2243
2244
2245
2246
2247
2248
2249
2250 if (!ksm_merge_across_nodes) {
2251 struct stable_node *stable_node, *next;
2252 struct page *page;
2253
2254 list_for_each_entry_safe(stable_node, next,
2255 &migrate_nodes, list) {
2256 page = get_ksm_page(stable_node,
2257 GET_KSM_PAGE_NOLOCK);
2258 if (page)
2259 put_page(page);
2260 cond_resched();
2261 }
2262 }
2263
2264 for (nid = 0; nid < ksm_nr_node_ids; nid++)
2265 root_unstable_tree[nid] = RB_ROOT;
2266
2267 spin_lock(&ksm_mmlist_lock);
2268 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
2269 ksm_scan.mm_slot = slot;
2270 spin_unlock(&ksm_mmlist_lock);
2271
2272
2273
2274
2275 if (slot == &ksm_mm_head)
2276 return NULL;
2277next_mm:
2278 ksm_scan.address = 0;
2279 ksm_scan.rmap_list = &slot->rmap_list;
2280 }
2281
2282 mm = slot->mm;
2283 mmap_read_lock(mm);
2284 if (ksm_test_exit(mm))
2285 vma = NULL;
2286 else
2287 vma = find_vma(mm, ksm_scan.address);
2288
2289 for (; vma; vma = vma->vm_next) {
2290 if (!(vma->vm_flags & VM_MERGEABLE))
2291 continue;
2292 if (ksm_scan.address < vma->vm_start)
2293 ksm_scan.address = vma->vm_start;
2294 if (!vma->anon_vma)
2295 ksm_scan.address = vma->vm_end;
2296
2297 while (ksm_scan.address < vma->vm_end) {
2298 if (ksm_test_exit(mm))
2299 break;
2300 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
2301 if (IS_ERR_OR_NULL(*page)) {
2302 ksm_scan.address += PAGE_SIZE;
2303 cond_resched();
2304 continue;
2305 }
2306 if (PageAnon(*page)) {
2307 flush_anon_page(vma, *page, ksm_scan.address);
2308 flush_dcache_page(*page);
2309 rmap_item = get_next_rmap_item(slot,
2310 ksm_scan.rmap_list, ksm_scan.address);
2311 if (rmap_item) {
2312 ksm_scan.rmap_list =
2313 &rmap_item->rmap_list;
2314 ksm_scan.address += PAGE_SIZE;
2315 } else
2316 put_page(*page);
2317 mmap_read_unlock(mm);
2318 return rmap_item;
2319 }
2320 put_page(*page);
2321 ksm_scan.address += PAGE_SIZE;
2322 cond_resched();
2323 }
2324 }
2325
2326 if (ksm_test_exit(mm)) {
2327 ksm_scan.address = 0;
2328 ksm_scan.rmap_list = &slot->rmap_list;
2329 }
2330
2331
2332
2333
2334 remove_trailing_rmap_items(ksm_scan.rmap_list);
2335
2336 spin_lock(&ksm_mmlist_lock);
2337 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
2338 struct mm_slot, mm_list);
2339 if (ksm_scan.address == 0) {
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349 hash_del(&slot->link);
2350 list_del(&slot->mm_list);
2351 spin_unlock(&ksm_mmlist_lock);
2352
2353 free_mm_slot(slot);
2354 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2355 mmap_read_unlock(mm);
2356 mmdrop(mm);
2357 } else {
2358 mmap_read_unlock(mm);
2359
2360
2361
2362
2363
2364
2365
2366 spin_unlock(&ksm_mmlist_lock);
2367 }
2368
2369
2370 slot = ksm_scan.mm_slot;
2371 if (slot != &ksm_mm_head)
2372 goto next_mm;
2373
2374 ksm_scan.seqnr++;
2375 return NULL;
2376}
2377
2378
2379
2380
2381
2382static void ksm_do_scan(unsigned int scan_npages)
2383{
2384 struct rmap_item *rmap_item;
2385 struct page *page;
2386
2387 while (scan_npages-- && likely(!freezing(current))) {
2388 cond_resched();
2389 rmap_item = scan_get_next_rmap_item(&page);
2390 if (!rmap_item)
2391 return;
2392 cmp_and_merge_page(page, rmap_item);
2393 put_page(page);
2394 }
2395}
2396
2397static int ksmd_should_run(void)
2398{
2399 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
2400}
2401
2402static int ksm_scan_thread(void *nothing)
2403{
2404 unsigned int sleep_ms;
2405
2406 set_freezable();
2407 set_user_nice(current, 5);
2408
2409 while (!kthread_should_stop()) {
2410 mutex_lock(&ksm_thread_mutex);
2411 wait_while_offlining();
2412 if (ksmd_should_run())
2413 ksm_do_scan(ksm_thread_pages_to_scan);
2414 mutex_unlock(&ksm_thread_mutex);
2415
2416 try_to_freeze();
2417
2418 if (ksmd_should_run()) {
2419 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2420 wait_event_interruptible_timeout(ksm_iter_wait,
2421 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2422 msecs_to_jiffies(sleep_ms));
2423 } else {
2424 wait_event_freezable(ksm_thread_wait,
2425 ksmd_should_run() || kthread_should_stop());
2426 }
2427 }
2428 return 0;
2429}
2430
2431int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2432 unsigned long end, int advice, unsigned long *vm_flags)
2433{
2434 struct mm_struct *mm = vma->vm_mm;
2435 int err;
2436
2437 switch (advice) {
2438 case MADV_MERGEABLE:
2439
2440
2441
2442 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
2443 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
2444 VM_HUGETLB | VM_MIXEDMAP))
2445 return 0;
2446
2447 if (vma_is_dax(vma))
2448 return 0;
2449
2450#ifdef VM_SAO
2451 if (*vm_flags & VM_SAO)
2452 return 0;
2453#endif
2454#ifdef VM_SPARC_ADI
2455 if (*vm_flags & VM_SPARC_ADI)
2456 return 0;
2457#endif
2458
2459 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2460 err = __ksm_enter(mm);
2461 if (err)
2462 return err;
2463 }
2464
2465 *vm_flags |= VM_MERGEABLE;
2466 break;
2467
2468 case MADV_UNMERGEABLE:
2469 if (!(*vm_flags & VM_MERGEABLE))
2470 return 0;
2471
2472 if (vma->anon_vma) {
2473 err = unmerge_ksm_pages(vma, start, end);
2474 if (err)
2475 return err;
2476 }
2477
2478 *vm_flags &= ~VM_MERGEABLE;
2479 break;
2480 }
2481
2482 return 0;
2483}
2484EXPORT_SYMBOL_GPL(ksm_madvise);
2485
2486int __ksm_enter(struct mm_struct *mm)
2487{
2488 struct mm_slot *mm_slot;
2489 int needs_wakeup;
2490
2491 mm_slot = alloc_mm_slot();
2492 if (!mm_slot)
2493 return -ENOMEM;
2494
2495
2496 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
2497
2498 spin_lock(&ksm_mmlist_lock);
2499 insert_to_mm_slots_hash(mm, mm_slot);
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510 if (ksm_run & KSM_RUN_UNMERGE)
2511 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
2512 else
2513 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
2514 spin_unlock(&ksm_mmlist_lock);
2515
2516 set_bit(MMF_VM_MERGEABLE, &mm->flags);
2517 mmgrab(mm);
2518
2519 if (needs_wakeup)
2520 wake_up_interruptible(&ksm_thread_wait);
2521
2522 return 0;
2523}
2524
2525void __ksm_exit(struct mm_struct *mm)
2526{
2527 struct mm_slot *mm_slot;
2528 int easy_to_free = 0;
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539 spin_lock(&ksm_mmlist_lock);
2540 mm_slot = get_mm_slot(mm);
2541 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
2542 if (!mm_slot->rmap_list) {
2543 hash_del(&mm_slot->link);
2544 list_del(&mm_slot->mm_list);
2545 easy_to_free = 1;
2546 } else {
2547 list_move(&mm_slot->mm_list,
2548 &ksm_scan.mm_slot->mm_list);
2549 }
2550 }
2551 spin_unlock(&ksm_mmlist_lock);
2552
2553 if (easy_to_free) {
2554 free_mm_slot(mm_slot);
2555 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2556 mmdrop(mm);
2557 } else if (mm_slot) {
2558 mmap_write_lock(mm);
2559 mmap_write_unlock(mm);
2560 }
2561}
2562
2563struct page *ksm_might_need_to_copy(struct page *page,
2564 struct vm_area_struct *vma, unsigned long address)
2565{
2566 struct anon_vma *anon_vma = page_anon_vma(page);
2567 struct page *new_page;
2568
2569 if (PageKsm(page)) {
2570 if (page_stable_node(page) &&
2571 !(ksm_run & KSM_RUN_UNMERGE))
2572 return page;
2573 } else if (!anon_vma) {
2574 return page;
2575 } else if (anon_vma->root == vma->anon_vma->root &&
2576 page->index == linear_page_index(vma, address)) {
2577 return page;
2578 }
2579 if (!PageUptodate(page))
2580 return page;
2581
2582 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2583 if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
2584 put_page(new_page);
2585 new_page = NULL;
2586 }
2587 if (new_page) {
2588 copy_user_highpage(new_page, page, address, vma);
2589
2590 SetPageDirty(new_page);
2591 __SetPageUptodate(new_page);
2592 __SetPageLocked(new_page);
2593 }
2594
2595 return new_page;
2596}
2597
2598void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
2599{
2600 struct stable_node *stable_node;
2601 struct rmap_item *rmap_item;
2602 int search_new_forks = 0;
2603
2604 VM_BUG_ON_PAGE(!PageKsm(page), page);
2605
2606
2607
2608
2609
2610 VM_BUG_ON_PAGE(!PageLocked(page), page);
2611
2612 stable_node = page_stable_node(page);
2613 if (!stable_node)
2614 return;
2615again:
2616 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2617 struct anon_vma *anon_vma = rmap_item->anon_vma;
2618 struct anon_vma_chain *vmac;
2619 struct vm_area_struct *vma;
2620
2621 cond_resched();
2622 anon_vma_lock_read(anon_vma);
2623 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2624 0, ULONG_MAX) {
2625 unsigned long addr;
2626
2627 cond_resched();
2628 vma = vmac->vma;
2629
2630
2631 addr = rmap_item->address & PAGE_MASK;
2632
2633 if (addr < vma->vm_start || addr >= vma->vm_end)
2634 continue;
2635
2636
2637
2638
2639
2640
2641 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2642 continue;
2643
2644 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2645 continue;
2646
2647 if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
2648 anon_vma_unlock_read(anon_vma);
2649 return;
2650 }
2651 if (rwc->done && rwc->done(page)) {
2652 anon_vma_unlock_read(anon_vma);
2653 return;
2654 }
2655 }
2656 anon_vma_unlock_read(anon_vma);
2657 }
2658 if (!search_new_forks++)
2659 goto again;
2660}
2661
2662#ifdef CONFIG_MIGRATION
2663void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2664{
2665 struct stable_node *stable_node;
2666
2667 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
2668 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
2669 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
2670
2671 stable_node = page_stable_node(newpage);
2672 if (stable_node) {
2673 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
2674 stable_node->kpfn = page_to_pfn(newpage);
2675
2676
2677
2678
2679
2680
2681 smp_wmb();
2682 set_page_stable_node(oldpage, NULL);
2683 }
2684}
2685#endif
2686
2687#ifdef CONFIG_MEMORY_HOTREMOVE
2688static void wait_while_offlining(void)
2689{
2690 while (ksm_run & KSM_RUN_OFFLINE) {
2691 mutex_unlock(&ksm_thread_mutex);
2692 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2693 TASK_UNINTERRUPTIBLE);
2694 mutex_lock(&ksm_thread_mutex);
2695 }
2696}
2697
2698static bool stable_node_dup_remove_range(struct stable_node *stable_node,
2699 unsigned long start_pfn,
2700 unsigned long end_pfn)
2701{
2702 if (stable_node->kpfn >= start_pfn &&
2703 stable_node->kpfn < end_pfn) {
2704
2705
2706
2707
2708 remove_node_from_stable_tree(stable_node);
2709 return true;
2710 }
2711 return false;
2712}
2713
2714static bool stable_node_chain_remove_range(struct stable_node *stable_node,
2715 unsigned long start_pfn,
2716 unsigned long end_pfn,
2717 struct rb_root *root)
2718{
2719 struct stable_node *dup;
2720 struct hlist_node *hlist_safe;
2721
2722 if (!is_stable_node_chain(stable_node)) {
2723 VM_BUG_ON(is_stable_node_dup(stable_node));
2724 return stable_node_dup_remove_range(stable_node, start_pfn,
2725 end_pfn);
2726 }
2727
2728 hlist_for_each_entry_safe(dup, hlist_safe,
2729 &stable_node->hlist, hlist_dup) {
2730 VM_BUG_ON(!is_stable_node_dup(dup));
2731 stable_node_dup_remove_range(dup, start_pfn, end_pfn);
2732 }
2733 if (hlist_empty(&stable_node->hlist)) {
2734 free_stable_node_chain(stable_node, root);
2735 return true;
2736 } else
2737 return false;
2738}
2739
2740static void ksm_check_stable_tree(unsigned long start_pfn,
2741 unsigned long end_pfn)
2742{
2743 struct stable_node *stable_node, *next;
2744 struct rb_node *node;
2745 int nid;
2746
2747 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2748 node = rb_first(root_stable_tree + nid);
2749 while (node) {
2750 stable_node = rb_entry(node, struct stable_node, node);
2751 if (stable_node_chain_remove_range(stable_node,
2752 start_pfn, end_pfn,
2753 root_stable_tree +
2754 nid))
2755 node = rb_first(root_stable_tree + nid);
2756 else
2757 node = rb_next(node);
2758 cond_resched();
2759 }
2760 }
2761 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
2762 if (stable_node->kpfn >= start_pfn &&
2763 stable_node->kpfn < end_pfn)
2764 remove_node_from_stable_tree(stable_node);
2765 cond_resched();
2766 }
2767}
2768
2769static int ksm_memory_callback(struct notifier_block *self,
2770 unsigned long action, void *arg)
2771{
2772 struct memory_notify *mn = arg;
2773
2774 switch (action) {
2775 case MEM_GOING_OFFLINE:
2776
2777
2778
2779
2780
2781
2782
2783 mutex_lock(&ksm_thread_mutex);
2784 ksm_run |= KSM_RUN_OFFLINE;
2785 mutex_unlock(&ksm_thread_mutex);
2786 break;
2787
2788 case MEM_OFFLINE:
2789
2790
2791
2792
2793
2794
2795
2796 ksm_check_stable_tree(mn->start_pfn,
2797 mn->start_pfn + mn->nr_pages);
2798 fallthrough;
2799 case MEM_CANCEL_OFFLINE:
2800 mutex_lock(&ksm_thread_mutex);
2801 ksm_run &= ~KSM_RUN_OFFLINE;
2802 mutex_unlock(&ksm_thread_mutex);
2803
2804 smp_mb();
2805 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
2806 break;
2807 }
2808 return NOTIFY_OK;
2809}
2810#else
/* Without CONFIG_MEMORY_HOTREMOVE there is never anything to wait for. */
static void wait_while_offlining(void)
{
	/* Intentionally empty. */
}
2814#endif
2815
2816#ifdef CONFIG_SYSFS
2817
2818
2819
2820
/* Define the read-only kobj_attribute <_name>_attr via __ATTR_RO(). */
#define KSM_ATTR_RO(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/*
 * Define the mode-0644 kobj_attribute <_name>_attr, wired to
 * <_name>_show() and <_name>_store().
 */
#define KSM_ATTR(_name)							\
	static struct kobj_attribute _name##_attr =			\
		__ATTR(_name, 0644, _name##_show, _name##_store)
2826
2827static ssize_t sleep_millisecs_show(struct kobject *kobj,
2828 struct kobj_attribute *attr, char *buf)
2829{
2830 return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
2831}
2832
2833static ssize_t sleep_millisecs_store(struct kobject *kobj,
2834 struct kobj_attribute *attr,
2835 const char *buf, size_t count)
2836{
2837 unsigned int msecs;
2838 int err;
2839
2840 err = kstrtouint(buf, 10, &msecs);
2841 if (err)
2842 return -EINVAL;
2843
2844 ksm_thread_sleep_millisecs = msecs;
2845 wake_up_interruptible(&ksm_iter_wait);
2846
2847 return count;
2848}
2849KSM_ATTR(sleep_millisecs);
2850
2851static ssize_t pages_to_scan_show(struct kobject *kobj,
2852 struct kobj_attribute *attr, char *buf)
2853{
2854 return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
2855}
2856
2857static ssize_t pages_to_scan_store(struct kobject *kobj,
2858 struct kobj_attribute *attr,
2859 const char *buf, size_t count)
2860{
2861 unsigned int nr_pages;
2862 int err;
2863
2864 err = kstrtouint(buf, 10, &nr_pages);
2865 if (err)
2866 return -EINVAL;
2867
2868 ksm_thread_pages_to_scan = nr_pages;
2869
2870 return count;
2871}
2872KSM_ATTR(pages_to_scan);
2873
2874static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
2875 char *buf)
2876{
2877 return sysfs_emit(buf, "%lu\n", ksm_run);
2878}
2879
2880static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
2881 const char *buf, size_t count)
2882{
2883 unsigned int flags;
2884 int err;
2885
2886 err = kstrtouint(buf, 10, &flags);
2887 if (err)
2888 return -EINVAL;
2889 if (flags > KSM_RUN_UNMERGE)
2890 return -EINVAL;
2891
2892
2893
2894
2895
2896
2897
2898
2899 mutex_lock(&ksm_thread_mutex);
2900 wait_while_offlining();
2901 if (ksm_run != flags) {
2902 ksm_run = flags;
2903 if (flags & KSM_RUN_UNMERGE) {
2904 set_current_oom_origin();
2905 err = unmerge_and_remove_all_rmap_items();
2906 clear_current_oom_origin();
2907 if (err) {
2908 ksm_run = KSM_RUN_STOP;
2909 count = err;
2910 }
2911 }
2912 }
2913 mutex_unlock(&ksm_thread_mutex);
2914
2915 if (flags & KSM_RUN_MERGE)
2916 wake_up_interruptible(&ksm_thread_wait);
2917
2918 return count;
2919}
2920KSM_ATTR(run);
2921
2922#ifdef CONFIG_NUMA
2923static ssize_t merge_across_nodes_show(struct kobject *kobj,
2924 struct kobj_attribute *attr, char *buf)
2925{
2926 return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
2927}
2928
2929static ssize_t merge_across_nodes_store(struct kobject *kobj,
2930 struct kobj_attribute *attr,
2931 const char *buf, size_t count)
2932{
2933 int err;
2934 unsigned long knob;
2935
2936 err = kstrtoul(buf, 10, &knob);
2937 if (err)
2938 return err;
2939 if (knob > 1)
2940 return -EINVAL;
2941
2942 mutex_lock(&ksm_thread_mutex);
2943 wait_while_offlining();
2944 if (ksm_merge_across_nodes != knob) {
2945 if (ksm_pages_shared || remove_all_stable_nodes())
2946 err = -EBUSY;
2947 else if (root_stable_tree == one_stable_tree) {
2948 struct rb_root *buf;
2949
2950
2951
2952
2953
2954
2955
2956 buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
2957 GFP_KERNEL);
2958
2959 if (!buf)
2960 err = -ENOMEM;
2961 else {
2962 root_stable_tree = buf;
2963 root_unstable_tree = buf + nr_node_ids;
2964
2965 root_unstable_tree[0] = one_unstable_tree[0];
2966 }
2967 }
2968 if (!err) {
2969 ksm_merge_across_nodes = knob;
2970 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2971 }
2972 }
2973 mutex_unlock(&ksm_thread_mutex);
2974
2975 return err ? err : count;
2976}
2977KSM_ATTR(merge_across_nodes);
2978#endif
2979
2980static ssize_t use_zero_pages_show(struct kobject *kobj,
2981 struct kobj_attribute *attr, char *buf)
2982{
2983 return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
2984}
2985static ssize_t use_zero_pages_store(struct kobject *kobj,
2986 struct kobj_attribute *attr,
2987 const char *buf, size_t count)
2988{
2989 int err;
2990 bool value;
2991
2992 err = kstrtobool(buf, &value);
2993 if (err)
2994 return -EINVAL;
2995
2996 ksm_use_zero_pages = value;
2997
2998 return count;
2999}
3000KSM_ATTR(use_zero_pages);
3001
3002static ssize_t max_page_sharing_show(struct kobject *kobj,
3003 struct kobj_attribute *attr, char *buf)
3004{
3005 return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
3006}
3007
3008static ssize_t max_page_sharing_store(struct kobject *kobj,
3009 struct kobj_attribute *attr,
3010 const char *buf, size_t count)
3011{
3012 int err;
3013 int knob;
3014
3015 err = kstrtoint(buf, 10, &knob);
3016 if (err)
3017 return err;
3018
3019
3020
3021
3022
3023 if (knob < 2)
3024 return -EINVAL;
3025
3026 if (READ_ONCE(ksm_max_page_sharing) == knob)
3027 return count;
3028
3029 mutex_lock(&ksm_thread_mutex);
3030 wait_while_offlining();
3031 if (ksm_max_page_sharing != knob) {
3032 if (ksm_pages_shared || remove_all_stable_nodes())
3033 err = -EBUSY;
3034 else
3035 ksm_max_page_sharing = knob;
3036 }
3037 mutex_unlock(&ksm_thread_mutex);
3038
3039 return err ? err : count;
3040}
3041KSM_ATTR(max_page_sharing);
3042
3043static ssize_t pages_shared_show(struct kobject *kobj,
3044 struct kobj_attribute *attr, char *buf)
3045{
3046 return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
3047}
3048KSM_ATTR_RO(pages_shared);
3049
3050static ssize_t pages_sharing_show(struct kobject *kobj,
3051 struct kobj_attribute *attr, char *buf)
3052{
3053 return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
3054}
3055KSM_ATTR_RO(pages_sharing);
3056
3057static ssize_t pages_unshared_show(struct kobject *kobj,
3058 struct kobj_attribute *attr, char *buf)
3059{
3060 return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
3061}
3062KSM_ATTR_RO(pages_unshared);
3063
3064static ssize_t pages_volatile_show(struct kobject *kobj,
3065 struct kobj_attribute *attr, char *buf)
3066{
3067 long ksm_pages_volatile;
3068
3069 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
3070 - ksm_pages_sharing - ksm_pages_unshared;
3071
3072
3073
3074
3075 if (ksm_pages_volatile < 0)
3076 ksm_pages_volatile = 0;
3077 return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
3078}
3079KSM_ATTR_RO(pages_volatile);
3080
3081static ssize_t stable_node_dups_show(struct kobject *kobj,
3082 struct kobj_attribute *attr, char *buf)
3083{
3084 return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
3085}
3086KSM_ATTR_RO(stable_node_dups);
3087
3088static ssize_t stable_node_chains_show(struct kobject *kobj,
3089 struct kobj_attribute *attr, char *buf)
3090{
3091 return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
3092}
3093KSM_ATTR_RO(stable_node_chains);
3094
3095static ssize_t
3096stable_node_chains_prune_millisecs_show(struct kobject *kobj,
3097 struct kobj_attribute *attr,
3098 char *buf)
3099{
3100 return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
3101}
3102
3103static ssize_t
3104stable_node_chains_prune_millisecs_store(struct kobject *kobj,
3105 struct kobj_attribute *attr,
3106 const char *buf, size_t count)
3107{
3108 unsigned long msecs;
3109 int err;
3110
3111 err = kstrtoul(buf, 10, &msecs);
3112 if (err || msecs > UINT_MAX)
3113 return -EINVAL;
3114
3115 ksm_stable_node_chains_prune_millisecs = msecs;
3116
3117 return count;
3118}
3119KSM_ATTR(stable_node_chains_prune_millisecs);
3120
3121static ssize_t full_scans_show(struct kobject *kobj,
3122 struct kobj_attribute *attr, char *buf)
3123{
3124 return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
3125}
3126KSM_ATTR_RO(full_scans);
3127
3128static struct attribute *ksm_attrs[] = {
3129 &sleep_millisecs_attr.attr,
3130 &pages_to_scan_attr.attr,
3131 &run_attr.attr,
3132 &pages_shared_attr.attr,
3133 &pages_sharing_attr.attr,
3134 &pages_unshared_attr.attr,
3135 &pages_volatile_attr.attr,
3136 &full_scans_attr.attr,
3137#ifdef CONFIG_NUMA
3138 &merge_across_nodes_attr.attr,
3139#endif
3140 &max_page_sharing_attr.attr,
3141 &stable_node_chains_attr.attr,
3142 &stable_node_dups_attr.attr,
3143 &stable_node_chains_prune_millisecs_attr.attr,
3144 &use_zero_pages_attr.attr,
3145 NULL,
3146};
3147
3148static const struct attribute_group ksm_attr_group = {
3149 .attrs = ksm_attrs,
3150 .name = "ksm",
3151};
3152#endif
3153
3154static int __init ksm_init(void)
3155{
3156 struct task_struct *ksm_thread;
3157 int err;
3158
3159
3160 zero_checksum = calc_checksum(ZERO_PAGE(0));
3161
3162 ksm_use_zero_pages = false;
3163
3164 err = ksm_slab_init();
3165 if (err)
3166 goto out;
3167
3168 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
3169 if (IS_ERR(ksm_thread)) {
3170 pr_err("ksm: creating kthread failed\n");
3171 err = PTR_ERR(ksm_thread);
3172 goto out_free;
3173 }
3174
3175#ifdef CONFIG_SYSFS
3176 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
3177 if (err) {
3178 pr_err("ksm: register sysfs failed\n");
3179 kthread_stop(ksm_thread);
3180 goto out_free;
3181 }
3182#else
3183 ksm_run = KSM_RUN_MERGE;
3184
3185#endif
3186
3187#ifdef CONFIG_MEMORY_HOTREMOVE
3188
3189 hotplug_memory_notifier(ksm_memory_callback, 100);
3190#endif
3191 return 0;
3192
3193out_free:
3194 ksm_slab_free();
3195out:
3196 return err;
3197}
3198subsys_initcall(ksm_init);
3199