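// SPDX-License-Identifier: GPL-2.0-only
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 */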
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif
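/*
 * KSM keeps two red-black trees of candidate pages:
 *
 *   - The "stable" tree holds pages that have already been merged: each
 *     node represents one write-protected KSM page (or, above
 *     max_page_sharing, a chain of duplicate nodes with identical content).
 *   - The "unstable" tree holds not-yet-merged pages indexed by content;
 *     because that content may change at any time, this tree is rebuilt
 *     from scratch on every scan cycle.
 *
 * With merge_across_nodes unset there is one pair of trees per NUMA node.
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */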
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};
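/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap_item of this mm_slot's rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 */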
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};
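/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying node) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist of a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist, or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */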
struct stable_node {
	union {
		struct rb_node node;
		struct {
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
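	/*
	 * rmap_hlist_len is set to STABLE_NODE_CHAIN (a negative magic
	 * value) when this node is the head of a chain of duplicate
	 * stable_nodes, rather than a node holding rmap_items itself.
	 */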
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};
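/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */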
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;
#ifdef CONFIG_NUMA
		int nid;
#endif
	};
	struct mm_struct *mm;
	unsigned long address;
	unsigned int oldchecksum;
	union {
		struct rb_node node;
		struct {
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff
#define UNSTABLE_FLAG	0x100
#define STABLE_FLAG	0x200
#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
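/*
 * The stable and unstable tree heads: one pair of trees per NUMA node
 * when merge_across_nodes is disabled, otherwise only entry 0 is used.
 */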
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
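/*
 * Statistics and tunables (exposed via /sys/kernel/mm/ksm/):
 *
 * ksm_pages_shared:	how many KSM pages are being shared (stable nodes)
 * ksm_pages_sharing:	how many more sites (ptes) are sharing those pages
 * ksm_pages_unshared:	how many pages are unique but repeatedly checked
 * ksm_rmap_items:	how many rmap_items are currently allocated
 * ksm_stable_node_chains / ksm_stable_node_dups: stable tree chain bookkeeping
 * ksm_max_page_sharing: maximum sharing per stable node before chaining
 * ksm_thread_pages_to_scan / ksm_thread_sleep_millisecs: scanner pacing
 * ksm_use_zero_pages:	merge empty pages with the kernel zero page instead
 */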
static unsigned long ksm_pages_shared;
static unsigned long ksm_pages_sharing;
static unsigned long ksm_pages_unshared;
static unsigned long ksm_rmap_items;
static unsigned long ksm_stable_node_chains;
static unsigned long ksm_stable_node_dups;
static int ksm_stable_node_chains_prune_millisecs = 2000;
static int ksm_max_page_sharing = 256;
static unsigned int ksm_thread_pages_to_scan = 100;
static unsigned int ksm_thread_sleep_millisecs = 20;
static unsigned int zero_checksum __read_mostly;
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}
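/*
 * A regular stable tree node holds the rmap_items of mms sharing one KSM
 * page.  Once rmap_hlist_len would exceed ksm_max_page_sharing, the node
 * is converted into a "chain": the chain head stays in the rbtree and its
 * hlist collects "dup" nodes, each of which is a separate KSM page with
 * the same content.  The helpers below distinguish and maintain them.
 */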
static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

static inline void stable_node_chain_add_dup(struct stable_node *dup,
					     struct stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}
static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}
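/*
 * ksm_test_exit() is run under mmap_lock or under ksm_mmlist_lock: it
 * reports when mm_users has gone to zero, so that the scanner can give
 * up on an mm which is on its way out (__ksm_exit / exit_mmap).
 */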
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}
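/*
 * break_ksm() - break COW on a KSM page at @addr in @vma.
 *
 * A write fault on a KSM page replaces it with a fresh anonymous page
 * for this mm alone.  We loop on follow_page() + handle_mm_fault()
 * until the address no longer maps a KSM page (or the fault fails).
 */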
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	vm_fault_t ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
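	/*
	 * An OOM during the fault is the only failure we report; the other
	 * exit conditions (SIGBUS, SIGSEGV, or no page at all) simply mean
	 * the page has gone away or the task is exiting, so return success.
	 */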
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = vma_lookup(mm, addr);
	if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}
static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
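	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo a merge, we also need to drop the reference to the
	 * anon_vma taken when the rmap_item joined the stable tree.
	 */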
	put_anon_vma(rmap_item->anon_vma);

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	mmap_read_unlock(mm);
}
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	mmap_read_unlock(mm);
	return page;
}
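/*
 * get_kpfn_nid: which stable tree does a KSM page with this pfn belong to?
 * With merge_across_nodes everything lives in tree 0; otherwise the pfn's
 * NUMA node selects one of the per-node stable trees.
 */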
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
						   struct rb_root *root)
{
	struct stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = NUMA_NO_NODE;
#endif
		ksm_stable_node_chains++;

		rb_replace_node(&dup->node, &chain->node, root);

		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}

static inline void free_stable_node_chain(struct stable_node *chain,
					  struct rb_root *root)
{
	rb_erase(&chain->node, root);
	free_stable_node(chain);
	ksm_stable_node_chains--;
}
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

#if GCC_VERSION >= 40903
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}
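/*
 * get_ksm_page: checks if the page indicated by the stable node is still
 * its ksm page, despite our not holding a reference to it beforehand.
 * If so, return the page with its refcount raised (and locked if @lock_it);
 * if the page has meanwhile been zapped or reused, remove the now-stale
 * node from the stable tree and return NULL.
 */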
static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	while (!get_page_unless_zero(page)) {
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (lock_it) {
		lock_page(page);
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}
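/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */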
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, true);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->head = NULL;
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
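		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */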
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}
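/*
 * Unmerging is done pte by pte rather than by pulling rmap_items off the
 * stable tree: the locking does not allow the latter, since an rmap_item
 * is only attached to the stable tree after the merge, under mmap_lock.
 * Any rmap_items left behind are cleaned up lazily by the next scan pass.
 */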
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
	return PageKsm(page) ? page_rmapping(page) : NULL;
}

static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}
#ifdef CONFIG_SYSFS
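/*
 * Only called through the sysfs control interface:
 */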
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, true);
	if (!page) {
		return 0;
	}

	err = -EBUSY;
	if (!page_mapped(page)) {
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}
static int remove_stable_node_chain(struct stable_node *stable_node,
				    struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}
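/*
 * unmerge_and_remove_all_rmap_items: used when "2" is written to
 * /sys/kernel/mm/ksm/run - walk every registered mm, break COW on all
 * merged pages in its mergeable VMAs, free its rmap_items, and finally
 * empty the stable tree.
 */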
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		mmap_read_lock(mm);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
		mmap_read_unlock(mm);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	mmap_read_unlock(mm);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr);
	return checksum;
}
1016
1017static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1018 pte_t *orig_pte)
1019{
1020 struct mm_struct *mm = vma->vm_mm;
1021 struct page_vma_mapped_walk pvmw = {
1022 .page = page,
1023 .vma = vma,
1024 };
1025 int swapped;
1026 int err = -EFAULT;
1027 struct mmu_notifier_range range;
1028
1029 pvmw.address = page_address_in_vma(page, vma);
1030 if (pvmw.address == -EFAULT)
1031 goto out;
1032
1033 BUG_ON(PageTransCompound(page));
1034
1035 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
1036 pvmw.address,
1037 pvmw.address + PAGE_SIZE);
1038 mmu_notifier_invalidate_range_start(&range);
1039
1040 if (!page_vma_mapped_walk(&pvmw))
1041 goto out_mn;
1042 if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
1043 goto out_unlock;
1044
1045 if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
1046 (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
1047 mm_tlb_flush_pending(mm)) {
1048 pte_t entry;
1049
1050 swapped = PageSwapCache(page);
1051 flush_cache_page(vma, pvmw.address, page_to_pfn(page));
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066 entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1067
1068
1069
1070
1071 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
1072 set_pte_at(mm, pvmw.address, pvmw.pte, entry);
1073 goto out_unlock;
1074 }
1075 if (pte_dirty(entry))
1076 set_page_dirty(page);
1077
1078 if (pte_protnone(entry))
1079 entry = pte_mkclean(pte_clear_savedwrite(entry));
1080 else
1081 entry = pte_mkclean(pte_wrprotect(entry));
1082 set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
1083 }
1084 *orig_pte = *pvmw.pte;
1085 err = 0;
1086
1087out_unlock:
1088 page_vma_mapped_walk_done(&pvmw);
1089out_mn:
1090 mmu_notifier_invalidate_range_end(&range);
1091out:
1092 return err;
1093}
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104static int replace_page(struct vm_area_struct *vma, struct page *page,
1105 struct page *kpage, pte_t orig_pte)
1106{
1107 struct mm_struct *mm = vma->vm_mm;
1108 pmd_t *pmd;
1109 pte_t *ptep;
1110 pte_t newpte;
1111 spinlock_t *ptl;
1112 unsigned long addr;
1113 int err = -EFAULT;
1114 struct mmu_notifier_range range;
1115
1116 addr = page_address_in_vma(page, vma);
1117 if (addr == -EFAULT)
1118 goto out;
1119
1120 pmd = mm_find_pmd(mm, addr);
1121 if (!pmd)
1122 goto out;
1123
1124 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
1125 addr + PAGE_SIZE);
1126 mmu_notifier_invalidate_range_start(&range);
1127
1128 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1129 if (!pte_same(*ptep, orig_pte)) {
1130 pte_unmap_unlock(ptep, ptl);
1131 goto out_mn;
1132 }
1133
1134
1135
1136
1137
1138 if (!is_zero_pfn(page_to_pfn(kpage))) {
1139 get_page(kpage);
1140 page_add_anon_rmap(kpage, vma, addr, false);
1141 newpte = mk_pte(kpage, vma->vm_page_prot);
1142 } else {
1143 newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
1144 vma->vm_page_prot));
1145
1146
1147
1148
1149
1150
1151 dec_mm_counter(mm, MM_ANONPAGES);
1152 }
1153
1154 flush_cache_page(vma, addr, pte_pfn(*ptep));
1155
1156
1157
1158
1159
1160
1161 ptep_clear_flush(vma, addr, ptep);
1162 set_pte_at_notify(mm, addr, ptep, newpte);
1163
1164 page_remove_rmap(page, false);
1165 if (!page_mapped(page))
1166 try_to_free_swap(page);
1167 put_page(page);
1168
1169 pte_unmap_unlock(ptep, ptl);
1170 err = 0;
1171out_mn:
1172 mmu_notifier_invalidate_range_end(&range);
1173out:
1174 return err;
1175}
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186static int try_to_merge_one_page(struct vm_area_struct *vma,
1187 struct page *page, struct page *kpage)
1188{
1189 pte_t orig_pte = __pte(0);
1190 int err = -EFAULT;
1191
1192 if (page == kpage)
1193 return 0;
1194
1195 if (!PageAnon(page))
1196 goto out;
1197
1198
1199
1200
1201
1202
1203
1204
1205 if (!trylock_page(page))
1206 goto out;
1207
1208 if (PageTransCompound(page)) {
1209 if (split_huge_page(page))
1210 goto out_unlock;
1211 }
1212
1213
1214
1215
1216
1217
1218
1219 if (write_protect_page(vma, page, &orig_pte) == 0) {
1220 if (!kpage) {
1221
1222
1223
1224
1225
1226 set_page_stable_node(page, NULL);
1227 mark_page_accessed(page);
1228
1229
1230
1231
1232 if (!PageDirty(page))
1233 SetPageDirty(page);
1234 err = 0;
1235 } else if (pages_identical(page, kpage))
1236 err = replace_page(vma, page, kpage, orig_pte);
1237 }
1238
1239 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
1240 munlock_vma_page(page);
1241 if (!PageMlocked(kpage)) {
1242 unlock_page(page);
1243 lock_page(kpage);
1244 mlock_vma_page(kpage);
1245 page = kpage;
1246 }
1247 }
1248
1249out_unlock:
1250 unlock_page(page);
1251out:
1252 return err;
1253}
1254
1255
1256
1257
1258
1259
1260
1261static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1262 struct page *page, struct page *kpage)
1263{
1264 struct mm_struct *mm = rmap_item->mm;
1265 struct vm_area_struct *vma;
1266 int err = -EFAULT;
1267
1268 mmap_read_lock(mm);
1269 vma = find_mergeable_vma(mm, rmap_item->address);
1270 if (!vma)
1271 goto out;
1272
1273 err = try_to_merge_one_page(vma, page, kpage);
1274 if (err)
1275 goto out;
1276
1277
1278 remove_rmap_item_from_tree(rmap_item);
1279
1280
1281 rmap_item->anon_vma = vma->anon_vma;
1282 get_anon_vma(vma->anon_vma);
1283out:
1284 mmap_read_unlock(mm);
1285 return err;
1286}
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
1299 struct page *page,
1300 struct rmap_item *tree_rmap_item,
1301 struct page *tree_page)
1302{
1303 int err;
1304
1305 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
1306 if (!err) {
1307 err = try_to_merge_with_ksm_page(tree_rmap_item,
1308 tree_page, page);
1309
1310
1311
1312
1313 if (err)
1314 break_cow(rmap_item);
1315 }
1316 return err ? NULL : page;
1317}
1318
1319static __always_inline
1320bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
1321{
1322 VM_BUG_ON(stable_node->rmap_hlist_len < 0);
1323
1324
1325
1326
1327
1328
1329 return stable_node->rmap_hlist_len &&
1330 stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
1331}
1332
1333static __always_inline
1334bool is_page_sharing_candidate(struct stable_node *stable_node)
1335{
1336 return __is_page_sharing_candidate(stable_node, 0);
1337}
1338
1339static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1340 struct stable_node **_stable_node,
1341 struct rb_root *root,
1342 bool prune_stale_stable_nodes)
1343{
1344 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1345 struct hlist_node *hlist_safe;
1346 struct page *_tree_page, *tree_page = NULL;
1347 int nr = 0;
1348 int found_rmap_hlist_len;
1349
1350 if (!prune_stale_stable_nodes ||
1351 time_before(jiffies, stable_node->chain_prune_time +
1352 msecs_to_jiffies(
1353 ksm_stable_node_chains_prune_millisecs)))
1354 prune_stale_stable_nodes = false;
1355 else
1356 stable_node->chain_prune_time = jiffies;
1357
1358 hlist_for_each_entry_safe(dup, hlist_safe,
1359 &stable_node->hlist, hlist_dup) {
1360 cond_resched();
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371 _tree_page = get_ksm_page(dup, false);
1372 if (!_tree_page)
1373 continue;
1374 nr += 1;
1375 if (is_page_sharing_candidate(dup)) {
1376 if (!found ||
1377 dup->rmap_hlist_len > found_rmap_hlist_len) {
1378 if (found)
1379 put_page(tree_page);
1380 found = dup;
1381 found_rmap_hlist_len = found->rmap_hlist_len;
1382 tree_page = _tree_page;
1383
1384
1385 if (!prune_stale_stable_nodes)
1386 break;
1387 continue;
1388 }
1389 }
1390 put_page(_tree_page);
1391 }
1392
1393 if (found) {
1394
1395
1396
1397
1398
1399
1400 if (prune_stale_stable_nodes && nr == 1) {
1401
1402
1403
1404
1405
1406
1407 BUG_ON(stable_node->hlist.first->next);
1408
1409
1410
1411
1412
1413 rb_replace_node(&stable_node->node, &found->node,
1414 root);
1415 free_stable_node(stable_node);
1416 ksm_stable_node_chains--;
1417 ksm_stable_node_dups--;
1418
1419
1420
1421
1422
1423 *_stable_node = found;
1424
1425
1426
1427
1428
1429
1430 stable_node = NULL;
1431 } else if (stable_node->hlist.first != &found->hlist_dup &&
1432 __is_page_sharing_candidate(found, 1)) {
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448 hlist_del(&found->hlist_dup);
1449 hlist_add_head(&found->hlist_dup,
1450 &stable_node->hlist);
1451 }
1452 }
1453
1454 *_stable_node_dup = found;
1455 return tree_page;
1456}
1457
1458static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1459 struct rb_root *root)
1460{
1461 if (!is_stable_node_chain(stable_node))
1462 return stable_node;
1463 if (hlist_empty(&stable_node->hlist)) {
1464 free_stable_node_chain(stable_node, root);
1465 return NULL;
1466 }
1467 return hlist_entry(stable_node->hlist.first,
1468 typeof(*stable_node), hlist_dup);
1469}
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
1486 struct stable_node **_stable_node,
1487 struct rb_root *root,
1488 bool prune_stale_stable_nodes)
1489{
1490 struct stable_node *stable_node = *_stable_node;
1491 if (!is_stable_node_chain(stable_node)) {
1492 if (is_page_sharing_candidate(stable_node)) {
1493 *_stable_node_dup = stable_node;
1494 return get_ksm_page(stable_node, false);
1495 }
1496
1497
1498
1499
1500 *_stable_node_dup = NULL;
1501 return NULL;
1502 }
1503 return stable_node_dup(_stable_node_dup, _stable_node, root,
1504 prune_stale_stable_nodes);
1505}
1506
1507static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
1508 struct stable_node **s_n,
1509 struct rb_root *root)
1510{
1511 return __stable_node_chain(s_n_d, s_n, root, true);
1512}
1513
1514static __always_inline struct page *chain(struct stable_node **s_n_d,
1515 struct stable_node *s_n,
1516 struct rb_root *root)
1517{
1518 struct stable_node *old_stable_node = s_n;
1519 struct page *tree_page;
1520
1521 tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
1522
1523 VM_BUG_ON(s_n != old_stable_node);
1524 return tree_page;
1525}
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536static struct page *stable_tree_search(struct page *page)
1537{
1538 int nid;
1539 struct rb_root *root;
1540 struct rb_node **new;
1541 struct rb_node *parent;
1542 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1543 struct stable_node *page_node;
1544
1545 page_node = page_stable_node(page);
1546 if (page_node && page_node->head != &migrate_nodes) {
1547
1548 get_page(page);
1549 return page;
1550 }
1551
1552 nid = get_kpfn_nid(page_to_pfn(page));
1553 root = root_stable_tree + nid;
1554again:
1555 new = &root->rb_node;
1556 parent = NULL;
1557
1558 while (*new) {
1559 struct page *tree_page;
1560 int ret;
1561
1562 cond_resched();
1563 stable_node = rb_entry(*new, struct stable_node, node);
1564 stable_node_any = NULL;
1565 tree_page = chain_prune(&stable_node_dup, &stable_node, root);
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578 if (!stable_node_dup) {
1579
1580
1581
1582
1583
1584 stable_node_any = stable_node_dup_any(stable_node,
1585 root);
1586 if (!stable_node_any) {
1587
1588 goto again;
1589 }
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599 tree_page = get_ksm_page(stable_node_any, false);
1600 }
1601 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1602 if (!tree_page) {
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612 goto again;
1613 }
1614
1615 ret = memcmp_pages(page, tree_page);
1616 put_page(tree_page);
1617
1618 parent = *new;
1619 if (ret < 0)
1620 new = &parent->rb_left;
1621 else if (ret > 0)
1622 new = &parent->rb_right;
1623 else {
1624 if (page_node) {
1625 VM_BUG_ON(page_node->head != &migrate_nodes);
1626
1627
1628
1629
1630
1631
1632 if (page_mapcount(page) > 1)
1633 goto chain_append;
1634 }
1635
1636 if (!stable_node_dup) {
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649 return NULL;
1650 }
1651
1652
1653
1654
1655
1656
1657
1658
1659 tree_page = get_ksm_page(stable_node_dup, true);
1660 if (unlikely(!tree_page))
1661
1662
1663
1664
1665 goto again;
1666 unlock_page(tree_page);
1667
1668 if (get_kpfn_nid(stable_node_dup->kpfn) !=
1669 NUMA(stable_node_dup->nid)) {
1670 put_page(tree_page);
1671 goto replace;
1672 }
1673 return tree_page;
1674 }
1675 }
1676
1677 if (!page_node)
1678 return NULL;
1679
1680 list_del(&page_node->list);
1681 DO_NUMA(page_node->nid = nid);
1682 rb_link_node(&page_node->node, parent, new);
1683 rb_insert_color(&page_node->node, root);
1684out:
1685 if (is_page_sharing_candidate(page_node)) {
1686 get_page(page);
1687 return page;
1688 } else
1689 return NULL;
1690
1691replace:
1692
1693
1694
1695
1696
1697
1698
1699
1700 if (stable_node_dup == stable_node) {
1701 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1702 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1703
1704 if (page_node) {
1705 VM_BUG_ON(page_node->head != &migrate_nodes);
1706 list_del(&page_node->list);
1707 DO_NUMA(page_node->nid = nid);
1708 rb_replace_node(&stable_node_dup->node,
1709 &page_node->node,
1710 root);
1711 if (is_page_sharing_candidate(page_node))
1712 get_page(page);
1713 else
1714 page = NULL;
1715 } else {
1716 rb_erase(&stable_node_dup->node, root);
1717 page = NULL;
1718 }
1719 } else {
1720 VM_BUG_ON(!is_stable_node_chain(stable_node));
1721 __stable_node_dup_del(stable_node_dup);
1722 if (page_node) {
1723 VM_BUG_ON(page_node->head != &migrate_nodes);
1724 list_del(&page_node->list);
1725 DO_NUMA(page_node->nid = nid);
1726 stable_node_chain_add_dup(page_node, stable_node);
1727 if (is_page_sharing_candidate(page_node))
1728 get_page(page);
1729 else
1730 page = NULL;
1731 } else {
1732 page = NULL;
1733 }
1734 }
1735 stable_node_dup->head = &migrate_nodes;
1736 list_add(&stable_node_dup->list, stable_node_dup->head);
1737 return page;
1738
1739chain_append:
1740
1741 if (!stable_node_dup)
1742 stable_node_dup = stable_node_any;
1743
1744
1745
1746
1747
1748
1749
1750
1751 if (stable_node_dup == stable_node) {
1752 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1753 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1754
1755 stable_node = alloc_stable_node_chain(stable_node_dup,
1756 root);
1757 if (!stable_node)
1758 return NULL;
1759 }
1760
1761
1762
1763
1764
1765
1766 VM_BUG_ON(!is_stable_node_chain(stable_node));
1767 VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
1768 VM_BUG_ON(page_node->head != &migrate_nodes);
1769 list_del(&page_node->list);
1770 DO_NUMA(page_node->nid = nid);
1771 stable_node_chain_add_dup(page_node, stable_node);
1772 goto out;
1773}
1774
1775
1776
1777
1778
1779
1780
1781
1782static struct stable_node *stable_tree_insert(struct page *kpage)
1783{
1784 int nid;
1785 unsigned long kpfn;
1786 struct rb_root *root;
1787 struct rb_node **new;
1788 struct rb_node *parent;
1789 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1790 bool need_chain = false;
1791
1792 kpfn = page_to_pfn(kpage);
1793 nid = get_kpfn_nid(kpfn);
1794 root = root_stable_tree + nid;
1795again:
1796 parent = NULL;
1797 new = &root->rb_node;
1798
1799 while (*new) {
1800 struct page *tree_page;
1801 int ret;
1802
1803 cond_resched();
1804 stable_node = rb_entry(*new, struct stable_node, node);
1805 stable_node_any = NULL;
1806 tree_page = chain(&stable_node_dup, stable_node, root);
1807 if (!stable_node_dup) {
1808
1809
1810
1811
1812
1813 stable_node_any = stable_node_dup_any(stable_node,
1814 root);
1815 if (!stable_node_any) {
1816
1817 goto again;
1818 }
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828 tree_page = get_ksm_page(stable_node_any, false);
1829 }
1830 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1831 if (!tree_page) {
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841 goto again;
1842 }
1843
1844 ret = memcmp_pages(kpage, tree_page);
1845 put_page(tree_page);
1846
1847 parent = *new;
1848 if (ret < 0)
1849 new = &parent->rb_left;
1850 else if (ret > 0)
1851 new = &parent->rb_right;
1852 else {
1853 need_chain = true;
1854 break;
1855 }
1856 }
1857
1858 stable_node_dup = alloc_stable_node();
1859 if (!stable_node_dup)
1860 return NULL;
1861
1862 INIT_HLIST_HEAD(&stable_node_dup->hlist);
1863 stable_node_dup->kpfn = kpfn;
1864 set_page_stable_node(kpage, stable_node_dup);
1865 stable_node_dup->rmap_hlist_len = 0;
1866 DO_NUMA(stable_node_dup->nid = nid);
1867 if (!need_chain) {
1868 rb_link_node(&stable_node_dup->node, parent, new);
1869 rb_insert_color(&stable_node_dup->node, root);
1870 } else {
1871 if (!is_stable_node_chain(stable_node)) {
1872 struct stable_node *orig = stable_node;
1873
1874 stable_node = alloc_stable_node_chain(orig, root);
1875 if (!stable_node) {
1876 free_stable_node(stable_node_dup);
1877 return NULL;
1878 }
1879 }
1880 stable_node_chain_add_dup(stable_node_dup, stable_node);
1881 }
1882
1883 return stable_node_dup;
1884}
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900static
1901struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1902 struct page *page,
1903 struct page **tree_pagep)
1904{
1905 struct rb_node **new;
1906 struct rb_root *root;
1907 struct rb_node *parent = NULL;
1908 int nid;
1909
1910 nid = get_kpfn_nid(page_to_pfn(page));
1911 root = root_unstable_tree + nid;
1912 new = &root->rb_node;
1913
1914 while (*new) {
1915 struct rmap_item *tree_rmap_item;
1916 struct page *tree_page;
1917 int ret;
1918
1919 cond_resched();
1920 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1921 tree_page = get_mergeable_page(tree_rmap_item);
1922 if (!tree_page)
1923 return NULL;
1924
1925
1926
1927
1928 if (page == tree_page) {
1929 put_page(tree_page);
1930 return NULL;
1931 }
1932
1933 ret = memcmp_pages(page, tree_page);
1934
1935 parent = *new;
1936 if (ret < 0) {
1937 put_page(tree_page);
1938 new = &parent->rb_left;
1939 } else if (ret > 0) {
1940 put_page(tree_page);
1941 new = &parent->rb_right;
1942 } else if (!ksm_merge_across_nodes &&
1943 page_to_nid(tree_page) != nid) {
1944
1945
1946
1947
1948
1949 put_page(tree_page);
1950 return NULL;
1951 } else {
1952 *tree_pagep = tree_page;
1953 return tree_rmap_item;
1954 }
1955 }
1956
1957 rmap_item->address |= UNSTABLE_FLAG;
1958 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1959 DO_NUMA(rmap_item->nid = nid);
1960 rb_link_node(&rmap_item->node, parent, new);
1961 rb_insert_color(&rmap_item->node, root);
1962
1963 ksm_pages_unshared++;
1964 return NULL;
1965}
1966
1967
1968
1969
1970
1971
1972static void stable_tree_append(struct rmap_item *rmap_item,
1973 struct stable_node *stable_node,
1974 bool max_page_sharing_bypass)
1975{
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986 BUG_ON(stable_node->rmap_hlist_len < 0);
1987
1988 stable_node->rmap_hlist_len++;
1989 if (!max_page_sharing_bypass)
1990
1991 WARN_ON_ONCE(stable_node->rmap_hlist_len >
1992 ksm_max_page_sharing);
1993
1994 rmap_item->head = stable_node;
1995 rmap_item->address |= STABLE_FLAG;
1996 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
1997
1998 if (rmap_item->hlist.next)
1999 ksm_pages_sharing++;
2000 else
2001 ksm_pages_shared++;
2002}
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2014{
2015 struct mm_struct *mm = rmap_item->mm;
2016 struct rmap_item *tree_rmap_item;
2017 struct page *tree_page = NULL;
2018 struct stable_node *stable_node;
2019 struct page *kpage;
2020 unsigned int checksum;
2021 int err;
2022 bool max_page_sharing_bypass = false;
2023
2024 stable_node = page_stable_node(page);
2025 if (stable_node) {
2026 if (stable_node->head != &migrate_nodes &&
2027 get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2028 NUMA(stable_node->nid)) {
2029 stable_node_dup_del(stable_node);
2030 stable_node->head = &migrate_nodes;
2031 list_add(&stable_node->list, stable_node->head);
2032 }
2033 if (stable_node->head != &migrate_nodes &&
2034 rmap_item->head == stable_node)
2035 return;
2036
2037
2038
2039
2040 if (!is_page_sharing_candidate(stable_node))
2041 max_page_sharing_bypass = true;
2042 }
2043
2044
2045 kpage = stable_tree_search(page);
2046 if (kpage == page && rmap_item->head == stable_node) {
2047 put_page(kpage);
2048 return;
2049 }
2050
2051 remove_rmap_item_from_tree(rmap_item);
2052
2053 if (kpage) {
2054 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2055 if (!err) {
2056
2057
2058
2059
2060 lock_page(kpage);
2061 stable_tree_append(rmap_item, page_stable_node(kpage),
2062 max_page_sharing_bypass);
2063 unlock_page(kpage);
2064 }
2065 put_page(kpage);
2066 return;
2067 }
2068
2069
2070
2071
2072
2073
2074
2075 checksum = calc_checksum(page);
2076 if (rmap_item->oldchecksum != checksum) {
2077 rmap_item->oldchecksum = checksum;
2078 return;
2079 }
2080
2081
2082
2083
2084
2085 if (ksm_use_zero_pages && (checksum == zero_checksum)) {
2086 struct vm_area_struct *vma;
2087
2088 mmap_read_lock(mm);
2089 vma = find_mergeable_vma(mm, rmap_item->address);
2090 if (vma) {
2091 err = try_to_merge_one_page(vma, page,
2092 ZERO_PAGE(rmap_item->address));
2093 } else {
2094
2095
2096
2097
2098 err = 0;
2099 }
2100 mmap_read_unlock(mm);
2101
2102
2103
2104
2105 if (!err)
2106 return;
2107 }
2108 tree_rmap_item =
2109 unstable_tree_search_insert(rmap_item, page, &tree_page);
2110 if (tree_rmap_item) {
2111 bool split;
2112
2113 kpage = try_to_merge_two_pages(rmap_item, page,
2114 tree_rmap_item, tree_page);
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125 split = PageTransCompound(page)
2126 && compound_head(page) == compound_head(tree_page);
2127 put_page(tree_page);
2128 if (kpage) {
2129
2130
2131
2132
2133 lock_page(kpage);
2134 stable_node = stable_tree_insert(kpage);
2135 if (stable_node) {
2136 stable_tree_append(tree_rmap_item, stable_node,
2137 false);
2138 stable_tree_append(rmap_item, stable_node,
2139 false);
2140 }
2141 unlock_page(kpage);
2142
2143
2144
2145
2146
2147
2148
2149 if (!stable_node) {
2150 break_cow(tree_rmap_item);
2151 break_cow(rmap_item);
2152 }
2153 } else if (split) {
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163 if (!trylock_page(page))
2164 return;
2165 split_huge_page(page);
2166 unlock_page(page);
2167 }
2168 }
2169}
2170
2171static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2172 struct rmap_item **rmap_list,
2173 unsigned long addr)
2174{
2175 struct rmap_item *rmap_item;
2176
2177 while (*rmap_list) {
2178 rmap_item = *rmap_list;
2179 if ((rmap_item->address & PAGE_MASK) == addr)
2180 return rmap_item;
2181 if (rmap_item->address > addr)
2182 break;
2183 *rmap_list = rmap_item->rmap_list;
2184 remove_rmap_item_from_tree(rmap_item);
2185 free_rmap_item(rmap_item);
2186 }
2187
2188 rmap_item = alloc_rmap_item();
2189 if (rmap_item) {
2190
2191 rmap_item->mm = mm_slot->mm;
2192 rmap_item->address = addr;
2193 rmap_item->rmap_list = *rmap_list;
2194 *rmap_list = rmap_item;
2195 }
2196 return rmap_item;
2197}
2198
2199static struct rmap_item *scan_get_next_rmap_item(struct page **page)
2200{
2201 struct mm_struct *mm;
2202 struct mm_slot *slot;
2203 struct vm_area_struct *vma;
2204 struct rmap_item *rmap_item;
2205 int nid;
2206
2207 if (list_empty(&ksm_mm_head.mm_list))
2208 return NULL;
2209
2210 slot = ksm_scan.mm_slot;
2211 if (slot == &ksm_mm_head) {
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222 lru_add_drain_all();
2223
2224
2225
2226
2227
2228
2229
2230 if (!ksm_merge_across_nodes) {
2231 struct stable_node *stable_node, *next;
2232 struct page *page;
2233
2234 list_for_each_entry_safe(stable_node, next,
2235 &migrate_nodes, list) {
2236 page = get_ksm_page(stable_node, false);
2237 if (page)
2238 put_page(page);
2239 cond_resched();
2240 }
2241 }
2242
2243 for (nid = 0; nid < ksm_nr_node_ids; nid++)
2244 root_unstable_tree[nid] = RB_ROOT;
2245
2246 spin_lock(&ksm_mmlist_lock);
2247 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
2248 ksm_scan.mm_slot = slot;
2249 spin_unlock(&ksm_mmlist_lock);
2250
2251
2252
2253
2254 if (slot == &ksm_mm_head)
2255 return NULL;
2256next_mm:
2257 ksm_scan.address = 0;
2258 ksm_scan.rmap_list = &slot->rmap_list;
2259 }
2260
2261 mm = slot->mm;
2262 mmap_read_lock(mm);
2263 if (ksm_test_exit(mm))
2264 vma = NULL;
2265 else
2266 vma = find_vma(mm, ksm_scan.address);
2267
2268 for (; vma; vma = vma->vm_next) {
2269 if (!(vma->vm_flags & VM_MERGEABLE))
2270 continue;
2271 if (ksm_scan.address < vma->vm_start)
2272 ksm_scan.address = vma->vm_start;
2273 if (!vma->anon_vma)
2274 ksm_scan.address = vma->vm_end;
2275
2276 while (ksm_scan.address < vma->vm_end) {
2277 if (ksm_test_exit(mm))
2278 break;
2279 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
2280 if (IS_ERR_OR_NULL(*page)) {
2281 ksm_scan.address += PAGE_SIZE;
2282 cond_resched();
2283 continue;
2284 }
2285 if (PageAnon(*page)) {
2286 flush_anon_page(vma, *page, ksm_scan.address);
2287 flush_dcache_page(*page);
2288 rmap_item = get_next_rmap_item(slot,
2289 ksm_scan.rmap_list, ksm_scan.address);
2290 if (rmap_item) {
2291 ksm_scan.rmap_list =
2292 &rmap_item->rmap_list;
2293 ksm_scan.address += PAGE_SIZE;
2294 } else
2295 put_page(*page);
2296 mmap_read_unlock(mm);
2297 return rmap_item;
2298 }
2299 put_page(*page);
2300 ksm_scan.address += PAGE_SIZE;
2301 cond_resched();
2302 }
2303 }
2304
2305 if (ksm_test_exit(mm)) {
2306 ksm_scan.address = 0;
2307 ksm_scan.rmap_list = &slot->rmap_list;
2308 }
2309
2310
2311
2312
2313 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
2314
2315 spin_lock(&ksm_mmlist_lock);
2316 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
2317 struct mm_slot, mm_list);
2318 if (ksm_scan.address == 0) {
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328 hash_del(&slot->link);
2329 list_del(&slot->mm_list);
2330 spin_unlock(&ksm_mmlist_lock);
2331
2332 free_mm_slot(slot);
2333 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2334 mmap_read_unlock(mm);
2335 mmdrop(mm);
2336 } else {
2337 mmap_read_unlock(mm);
2338
2339
2340
2341
2342
2343
2344
2345 spin_unlock(&ksm_mmlist_lock);
2346 }
2347
2348
2349 slot = ksm_scan.mm_slot;
2350 if (slot != &ksm_mm_head)
2351 goto next_mm;
2352
2353 ksm_scan.seqnr++;
2354 return NULL;
2355}
2356
2357
2358
2359
2360
2361static void ksm_do_scan(unsigned int scan_npages)
2362{
2363 struct rmap_item *rmap_item;
2364 struct page *uninitialized_var(page);
2365
2366 while (scan_npages-- && likely(!freezing(current))) {
2367 cond_resched();
2368 rmap_item = scan_get_next_rmap_item(&page);
2369 if (!rmap_item)
2370 return;
2371 cmp_and_merge_page(page, rmap_item);
2372 put_page(page);
2373 }
2374}
2375
2376static int ksmd_should_run(void)
2377{
2378 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
2379}
2380
2381static int ksm_scan_thread(void *nothing)
2382{
2383 set_freezable();
2384 set_user_nice(current, 5);
2385
2386 while (!kthread_should_stop()) {
2387 mutex_lock(&ksm_thread_mutex);
2388 wait_while_offlining();
2389 if (ksmd_should_run())
2390 ksm_do_scan(ksm_thread_pages_to_scan);
2391 mutex_unlock(&ksm_thread_mutex);
2392
2393 try_to_freeze();
2394
2395 if (ksmd_should_run()) {
2396 schedule_timeout_interruptible(
2397 msecs_to_jiffies(ksm_thread_sleep_millisecs));
2398 } else {
2399 wait_event_freezable(ksm_thread_wait,
2400 ksmd_should_run() || kthread_should_stop());
2401 }
2402 }
2403 return 0;
2404}
2405
2406int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2407 unsigned long end, int advice, unsigned long *vm_flags)
2408{
2409 struct mm_struct *mm = vma->vm_mm;
2410 int err;
2411
2412 switch (advice) {
2413 case MADV_MERGEABLE:
2414
2415
2416
2417 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
2418 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
2419 VM_HUGETLB | VM_MIXEDMAP))
2420 return 0;
2421
2422 if (vma_is_dax(vma))
2423 return 0;
2424
2425#ifdef VM_SAO
2426 if (*vm_flags & VM_SAO)
2427 return 0;
2428#endif
2429#ifdef VM_SPARC_ADI
2430 if (*vm_flags & VM_SPARC_ADI)
2431 return 0;
2432#endif
2433
2434 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2435 err = __ksm_enter(mm);
2436 if (err)
2437 return err;
2438 }
2439
2440 *vm_flags |= VM_MERGEABLE;
2441 break;
2442
2443 case MADV_UNMERGEABLE:
2444 if (!(*vm_flags & VM_MERGEABLE))
2445 return 0;
2446
2447 if (vma->anon_vma) {
2448 err = unmerge_ksm_pages(vma, start, end);
2449 if (err)
2450 return err;
2451 }
2452
2453 *vm_flags &= ~VM_MERGEABLE;
2454 break;
2455 }
2456
2457 return 0;
2458}
2459EXPORT_SYMBOL_GPL(ksm_madvise);
2460
2461int __ksm_enter(struct mm_struct *mm)
2462{
2463 struct mm_slot *mm_slot;
2464 int needs_wakeup;
2465
2466 mm_slot = alloc_mm_slot();
2467 if (!mm_slot)
2468 return -ENOMEM;
2469
2470
2471 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
2472
2473 spin_lock(&ksm_mmlist_lock);
2474 insert_to_mm_slots_hash(mm, mm_slot);
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485 if (ksm_run & KSM_RUN_UNMERGE)
2486 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
2487 else
2488 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
2489 spin_unlock(&ksm_mmlist_lock);
2490
2491 set_bit(MMF_VM_MERGEABLE, &mm->flags);
2492 mmgrab(mm);
2493
2494 if (needs_wakeup)
2495 wake_up_interruptible(&ksm_thread_wait);
2496
2497 return 0;
2498}
2499
2500void __ksm_exit(struct mm_struct *mm)
2501{
2502 struct mm_slot *mm_slot;
2503 int easy_to_free = 0;
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514 spin_lock(&ksm_mmlist_lock);
2515 mm_slot = get_mm_slot(mm);
2516 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
2517 if (!mm_slot->rmap_list) {
2518 hash_del(&mm_slot->link);
2519 list_del(&mm_slot->mm_list);
2520 easy_to_free = 1;
2521 } else {
2522 list_move(&mm_slot->mm_list,
2523 &ksm_scan.mm_slot->mm_list);
2524 }
2525 }
2526 spin_unlock(&ksm_mmlist_lock);
2527
2528 if (easy_to_free) {
2529 free_mm_slot(mm_slot);
2530 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2531 mmdrop(mm);
2532 } else if (mm_slot) {
2533 mmap_write_lock(mm);
2534 mmap_write_unlock(mm);
2535 }
2536}
2537
2538struct page *ksm_might_need_to_copy(struct page *page,
2539 struct vm_area_struct *vma, unsigned long address)
2540{
2541 struct anon_vma *anon_vma = page_anon_vma(page);
2542 struct page *new_page;
2543
2544 if (PageKsm(page)) {
2545 if (page_stable_node(page) &&
2546 !(ksm_run & KSM_RUN_UNMERGE))
2547 return page;
2548 } else if (!anon_vma) {
2549 return page;
2550 } else if (anon_vma->root == vma->anon_vma->root &&
2551 page->index == linear_page_index(vma, address)) {
2552 return page;
2553 }
2554 if (!PageUptodate(page))
2555 return page;
2556
2557 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2558 if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
2559 put_page(new_page);
2560 new_page = NULL;
2561 }
2562 if (new_page) {
2563 copy_user_highpage(new_page, page, address, vma);
2564
2565 SetPageDirty(new_page);
2566 __SetPageUptodate(new_page);
2567 __SetPageLocked(new_page);
2568 }
2569
2570 return new_page;
2571}
2572
2573void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
2574{
2575 struct stable_node *stable_node;
2576 struct rmap_item *rmap_item;
2577 int search_new_forks = 0;
2578
2579 VM_BUG_ON_PAGE(!PageKsm(page), page);
2580
2581
2582
2583
2584
2585 VM_BUG_ON_PAGE(!PageLocked(page), page);
2586
2587 stable_node = page_stable_node(page);
2588 if (!stable_node)
2589 return;
2590again:
2591 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2592 struct anon_vma *anon_vma = rmap_item->anon_vma;
2593 struct anon_vma_chain *vmac;
2594 struct vm_area_struct *vma;
2595
2596 cond_resched();
2597 anon_vma_lock_read(anon_vma);
2598 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2599 0, ULONG_MAX) {
2600 unsigned long addr;
2601
2602 cond_resched();
2603 vma = vmac->vma;
2604
2605
2606 addr = rmap_item->address & ~KSM_FLAG_MASK;
2607
2608 if (addr < vma->vm_start || addr >= vma->vm_end)
2609 continue;
2610
2611
2612
2613
2614
2615
2616 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2617 continue;
2618
2619 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2620 continue;
2621
2622 if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
2623 anon_vma_unlock_read(anon_vma);
2624 return;
2625 }
2626 if (rwc->done && rwc->done(page)) {
2627 anon_vma_unlock_read(anon_vma);
2628 return;
2629 }
2630 }
2631 anon_vma_unlock_read(anon_vma);
2632 }
2633 if (!search_new_forks++)
2634 goto again;
2635}
2636
2637bool reuse_ksm_page(struct page *page,
2638 struct vm_area_struct *vma,
2639 unsigned long address)
2640{
2641#ifdef CONFIG_DEBUG_VM
2642 if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
2643 WARN_ON(!page_mapped(page)) ||
2644 WARN_ON(!PageLocked(page))) {
2645 dump_page(page, "reuse_ksm_page");
2646 return false;
2647 }
2648#endif
2649
2650 if (PageSwapCache(page) || !page_stable_node(page))
2651 return false;
2652
2653 if (!page_ref_freeze(page, 1))
2654 return false;
2655
2656 page_move_anon_rmap(page, vma);
2657 page->index = linear_page_index(vma, address);
2658 page_ref_unfreeze(page, 1);
2659
2660 return true;
2661}
2662#ifdef CONFIG_MIGRATION
2663void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2664{
2665 struct stable_node *stable_node;
2666
2667 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
2668 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
2669 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
2670
2671 stable_node = page_stable_node(newpage);
2672 if (stable_node) {
2673 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
2674 stable_node->kpfn = page_to_pfn(newpage);
2675
2676
2677
2678
2679
2680
2681 smp_wmb();
2682 set_page_stable_node(oldpage, NULL);
2683 }
2684}
2685#endif
2686
2687#ifdef CONFIG_MEMORY_HOTREMOVE
2688static void wait_while_offlining(void)
2689{
2690 while (ksm_run & KSM_RUN_OFFLINE) {
2691 mutex_unlock(&ksm_thread_mutex);
2692 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2693 TASK_UNINTERRUPTIBLE);
2694 mutex_lock(&ksm_thread_mutex);
2695 }
2696}
2697
2698static bool stable_node_dup_remove_range(struct stable_node *stable_node,
2699 unsigned long start_pfn,
2700 unsigned long end_pfn)
2701{
2702 if (stable_node->kpfn >= start_pfn &&
2703 stable_node->kpfn < end_pfn) {
2704
2705
2706
2707
2708 remove_node_from_stable_tree(stable_node);
2709 return true;
2710 }
2711 return false;
2712}
2713
2714static bool stable_node_chain_remove_range(struct stable_node *stable_node,
2715 unsigned long start_pfn,
2716 unsigned long end_pfn,
2717 struct rb_root *root)
2718{
2719 struct stable_node *dup;
2720 struct hlist_node *hlist_safe;
2721
2722 if (!is_stable_node_chain(stable_node)) {
2723 VM_BUG_ON(is_stable_node_dup(stable_node));
2724 return stable_node_dup_remove_range(stable_node, start_pfn,
2725 end_pfn);
2726 }
2727
2728 hlist_for_each_entry_safe(dup, hlist_safe,
2729 &stable_node->hlist, hlist_dup) {
2730 VM_BUG_ON(!is_stable_node_dup(dup));
2731 stable_node_dup_remove_range(dup, start_pfn, end_pfn);
2732 }
2733 if (hlist_empty(&stable_node->hlist)) {
2734 free_stable_node_chain(stable_node, root);
2735 return true;
2736 } else
2737 return false;
2738}
2739
2740static void ksm_check_stable_tree(unsigned long start_pfn,
2741 unsigned long end_pfn)
2742{
2743 struct stable_node *stable_node, *next;
2744 struct rb_node *node;
2745 int nid;
2746
2747 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2748 node = rb_first(root_stable_tree + nid);
2749 while (node) {
2750 stable_node = rb_entry(node, struct stable_node, node);
2751 if (stable_node_chain_remove_range(stable_node,
2752 start_pfn, end_pfn,
2753 root_stable_tree +
2754 nid))
2755 node = rb_first(root_stable_tree + nid);
2756 else
2757 node = rb_next(node);
2758 cond_resched();
2759 }
2760 }
2761 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
2762 if (stable_node->kpfn >= start_pfn &&
2763 stable_node->kpfn < end_pfn)
2764 remove_node_from_stable_tree(stable_node);
2765 cond_resched();
2766 }
2767}
2768
2769static int ksm_memory_callback(struct notifier_block *self,
2770 unsigned long action, void *arg)
2771{
2772 struct memory_notify *mn = arg;
2773
2774 switch (action) {
2775 case MEM_GOING_OFFLINE:
2776
2777
2778
2779
2780
2781
2782
2783 mutex_lock(&ksm_thread_mutex);
2784 ksm_run |= KSM_RUN_OFFLINE;
2785 mutex_unlock(&ksm_thread_mutex);
2786 break;
2787
2788 case MEM_OFFLINE:
2789
2790
2791
2792
2793
2794
2795
2796 ksm_check_stable_tree(mn->start_pfn,
2797 mn->start_pfn + mn->nr_pages);
2798
2799
2800 case MEM_CANCEL_OFFLINE:
2801 mutex_lock(&ksm_thread_mutex);
2802 ksm_run &= ~KSM_RUN_OFFLINE;
2803 mutex_unlock(&ksm_thread_mutex);
2804
2805 smp_mb();
2806 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
2807 break;
2808 }
2809 return NOTIFY_OK;
2810}
2811#else
2812static void wait_while_offlining(void)
2813{
2814}
2815#endif
2816
2817#ifdef CONFIG_SYSFS
2818
2819
2820
2821
2822#define KSM_ATTR_RO(_name) \
2823 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2824#define KSM_ATTR(_name) \
2825 static struct kobj_attribute _name##_attr = \
2826 __ATTR(_name, 0644, _name##_show, _name##_store)
2827
2828static ssize_t sleep_millisecs_show(struct kobject *kobj,
2829 struct kobj_attribute *attr, char *buf)
2830{
2831 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
2832}
2833
2834static ssize_t sleep_millisecs_store(struct kobject *kobj,
2835 struct kobj_attribute *attr,
2836 const char *buf, size_t count)
2837{
2838 unsigned long msecs;
2839 int err;
2840
2841 err = kstrtoul(buf, 10, &msecs);
2842 if (err || msecs > UINT_MAX)
2843 return -EINVAL;
2844
2845 ksm_thread_sleep_millisecs = msecs;
2846
2847 return count;
2848}
2849KSM_ATTR(sleep_millisecs);
2850
static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = kstrtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);
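
/*
 * Usage sketch for the knobs above (values are illustrative, not
 * recommendations).  ksm_init() registers the attribute group on mm_kobj,
 * so they appear under /sys/kernel/mm/ksm/:
 *
 *	echo 100 > /sys/kernel/mm/ksm/pages_to_scan	 # pages per ksmd batch
 *	echo 20  > /sys/kernel/mm/ksm/sleep_millisecs	 # delay between batches
 *	echo 1   > /sys/kernel/mm/ksm/run		 # KSM_RUN_MERGE: start ksmd
 *	echo 2   > /sys/kernel/mm/ksm/run		 # KSM_RUN_UNMERGE: stop and
 *							 # break COW on merged pages
 *	echo 0   > /sys/kernel/mm/ksm/run		 # KSM_RUN_STOP
 */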

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together:
			 * MAXSMP NODES_SHIFT 10 will use 16kB.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* a zeroed rb_root from kcalloc() is a valid RB_ROOT */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
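
/*
 * Note on merge_across_nodes: the store above refuses to flip the knob
 * while any pages are merged (ksm_pages_shared non-zero, or stable nodes
 * that cannot all be removed) and returns -EBUSY.  The usual sequence is
 * to write 2 to "run" (unmerge everything) before changing this setting.
 */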
#endif

static ssize_t use_zero_pages_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_use_zero_pages);
}

static ssize_t use_zero_pages_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return -EINVAL;

	ksm_use_zero_pages = value;

	return count;
}
KSM_ATTR(use_zero_pages);

static ssize_t max_page_sharing_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_max_page_sharing);
}

static ssize_t max_page_sharing_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	int knob;

	err = kstrtoint(buf, 10, &knob);
	if (err)
		return err;
	/*
	 * When a KSM page is created it is shared by 2 mappings. This
	 * being a signed comparison, it implicitly verifies it's not
	 * negative.
	 */
	if (knob < 2)
		return -EINVAL;

	if (READ_ONCE(ksm_max_page_sharing) == knob)
		return count;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_max_page_sharing != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else
			ksm_max_page_sharing = knob;
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(max_page_sharing);

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative: conceal that.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t stable_node_dups_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);

static ssize_t stable_node_chains_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);

static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
					struct kobj_attribute *attr,
					char *buf)
{
	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}

static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_stable_node_chains_prune_millisecs = msecs;

	return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	NULL,
};

static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
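
/*
 * With .name = "ksm", sysfs_create_group(mm_kobj, ...) in ksm_init()
 * exposes these attributes as /sys/kernel/mm/ksm/<attribute>.  As a
 * rough, illustrative effectiveness check: pages_sharing / pages_shared
 * approximates how many mappings share each KSM page, and pages_sharing
 * itself approximates the number of duplicate pages avoided.
 */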
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);
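
/*
 * For reference: userspace opts anonymous memory into KSM with
 * madvise(MADV_MERGEABLE); ksmd then scans those areas whenever "run" is
 * set to KSM_RUN_MERGE.  A minimal userspace sketch (ordinary application
 * code, shown here only as a comment):
 *
 *	#include <sys/mman.h>
 *	#include <stddef.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 64 << 20;
 *		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (buf == MAP_FAILED)
 *			return 1;
 *		// mark the range as a merge candidate for ksmd
 *		return madvise(buf, len, MADV_MERGEABLE) ? 1 : 0;
 *	}
 */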