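/*
 * Memory merging support (KSM): the ksmd kernel thread scans the anonymous
 * pages of address ranges registered with madvise(MADV_MERGEABLE), keeps one
 * write-protected "KSM page" per group of identical pages in a stable rbtree,
 * and remaps the duplicate pages to it copy-on-write.
 */
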
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

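/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, which is rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */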
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};
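
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap_item of this mm_slot's rmap_list
 * @seqnr: count of completed full scans (and flag in rmap_item->address)
 */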
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};
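
/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist, or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */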
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};
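
/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */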
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */
#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
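
/* The stable and unstable tree heads */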
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

static inline void stable_node_chain_add_dup(struct stable_node *dup,
					     struct stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	/*
	 * __GFP_HIGH lets this allocation dip into memory reserves, so it is
	 * less likely to stall or fail when memory is under pressure.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after that mm has passed through ksm_exit(): ksm_exit()
 * takes mmap_sem briefly, if necessary, to serialize against them, and
 * they back out as soon as mm_users has gone to zero.  ksm_test_exit()
 * makes that check: in some places for correctness, in others merely to
 * avoid wasted work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * break_ksm() breaks COW on a ksm page at mm,addr: essentially a stripped
 * down write-faulting get_user_pages(), taking great care only to touch a
 * ksm page, in a VM_MERGEABLE vma, in case the application has unmapped
 * and remapped the range meanwhile.  FAULT_FLAG_REMOTE/FOLL_REMOTE are
 * used because this runs outside the context of the process that owns
 * the vma.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	vm_fault_t ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * VM_FAULT_WRITE means break-COW succeeded (or the page was never a
	 * ksm page here); VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV mean the area
	 * was unmapped under us, which is also fine since the ksm page is
	 * then no longer mapped at this address.  Only VM_FAULT_OOM is
	 * reported back, as -ENOMEM.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * Whenever we want to break COW to undo a merge, we must also drop
	 * the rmap_item's reference to the anon_vma it was merged into.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}
578
579
580
581
582
583
584
585static inline int get_kpfn_nid(unsigned long kpfn)
586{
587 return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
588}
589
590static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
591 struct rb_root *root)
592{
593 struct stable_node *chain = alloc_stable_node();
594 VM_BUG_ON(is_stable_node_chain(dup));
595 if (likely(chain)) {
596 INIT_HLIST_HEAD(&chain->hlist);
597 chain->chain_prune_time = jiffies;
598 chain->rmap_hlist_len = STABLE_NODE_CHAIN;
599#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
600 chain->nid = NUMA_NO_NODE;
601#endif
602 ksm_stable_node_chains++;
603
604
605
606
607
608
609 rb_replace_node(&dup->node, &chain->node, root);
610
611
612
613
614
615
616
617
618 stable_node_chain_add_dup(dup, chain);
619 }
620 return chain;
621}
622
623static inline void free_stable_node_chain(struct stable_node *chain,
624 struct rb_root *root)
625{
626 rb_erase(&chain->node, root);
627 free_stable_node(chain);
628 ksm_stable_node_chains--;
629}
630
631static void remove_node_from_stable_tree(struct stable_node *stable_node)
632{
633 struct rmap_item *rmap_item;
634
635
636 BUG_ON(stable_node->rmap_hlist_len < 0);
637
638 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
639 if (rmap_item->hlist.next)
640 ksm_pages_sharing--;
641 else
642 ksm_pages_shared--;
643 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
644 stable_node->rmap_hlist_len--;
645 put_anon_vma(rmap_item->anon_vma);
646 rmap_item->address &= PAGE_MASK;
647 cond_resched();
648 }
649
650
651
652
653
654
655
656
657#if defined(GCC_VERSION) && GCC_VERSION >= 40903
658 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
659 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
660#endif
661
662 if (stable_node->head == &migrate_nodes)
663 list_del(&stable_node->list);
664 else
665 stable_node_dup_del(stable_node);
666 free_stable_node(stable_node);
667}
668
669enum get_ksm_page_flags {
670 GET_KSM_PAGE_NOLOCK,
671 GET_KSM_PAGE_LOCK,
672 GET_KSM_PAGE_TRYLOCK
673};
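
/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * The stable node never takes a reference on its page: holding one would
 * keep the page pinned until ksmd next pruned the node, which could take
 * a long time.  Instead the node is only trusted while page->mapping still
 * points back at it, and a reference is taken with get_page_unless_zero()
 * before any use.  GET_KSM_PAGE_LOCK and GET_KSM_PAGE_TRYLOCK also return
 * the page locked; TRYLOCK may return ERR_PTR(-EBUSY) instead of sleeping
 * on the page lock.
 */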
694static struct page *get_ksm_page(struct stable_node *stable_node,
695 enum get_ksm_page_flags flags)
696{
697 struct page *page;
698 void *expected_mapping;
699 unsigned long kpfn;
700
701 expected_mapping = (void *)((unsigned long)stable_node |
702 PAGE_MAPPING_KSM);
703again:
704 kpfn = READ_ONCE(stable_node->kpfn);
705 page = pfn_to_page(kpfn);
706 if (READ_ONCE(page->mapping) != expected_mapping)
707 goto stale;
708
709
710
711
712
713
714
715
716
717
718
719 while (!get_page_unless_zero(page)) {
720
721
722
723
724
725
726
727
728 if (!PageSwapCache(page))
729 goto stale;
730 cpu_relax();
731 }
732
733 if (READ_ONCE(page->mapping) != expected_mapping) {
734 put_page(page);
735 goto stale;
736 }
737
738 if (flags == GET_KSM_PAGE_TRYLOCK) {
739 if (!trylock_page(page)) {
740 put_page(page);
741 return ERR_PTR(-EBUSY);
742 }
743 } else if (flags == GET_KSM_PAGE_LOCK)
744 lock_page(page);
745
746 if (flags != GET_KSM_PAGE_NOLOCK) {
747 if (READ_ONCE(page->mapping) != expected_mapping) {
748 unlock_page(page);
749 put_page(page);
750 goto stale;
751 }
752 }
753 return page;
754
755stale:
756
757
758
759
760
761
762 smp_rmb();
763 if (READ_ONCE(stable_node->kpfn) != kpfn)
764 goto again;
765 remove_node_from_stable_tree(stable_node);
766 return NULL;
767}
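
/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */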
773static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
774{
775 if (rmap_item->address & STABLE_FLAG) {
776 struct stable_node *stable_node;
777 struct page *page;
778
779 stable_node = rmap_item->head;
780 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
781 if (!page)
782 goto out;
783
784 hlist_del(&rmap_item->hlist);
785 unlock_page(page);
786 put_page(page);
787
788 if (!hlist_empty(&stable_node->hlist))
789 ksm_pages_sharing--;
790 else
791 ksm_pages_shared--;
792 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
793 stable_node->rmap_hlist_len--;
794
795 put_anon_vma(rmap_item->anon_vma);
796 rmap_item->address &= PAGE_MASK;
797
798 } else if (rmap_item->address & UNSTABLE_FLAG) {
799 unsigned char age;
800
801
802
803
804
805
806
807 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
808 BUG_ON(age > 1);
809 if (!age)
810 rb_erase(&rmap_item->node,
811 root_unstable_tree + NUMA(rmap_item->nid));
812 ksm_pages_unshared--;
813 rmap_item->address &= PAGE_MASK;
814 }
815out:
816 cond_resched();
817}
818
819static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
820 struct rmap_item **rmap_list)
821{
822 while (*rmap_list) {
823 struct rmap_item *rmap_item = *rmap_list;
824 *rmap_list = rmap_item->rmap_list;
825 remove_rmap_item_from_tree(rmap_item);
826 free_rmap_item(rmap_item);
827 }
828}
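
/*
 * unmerge_ksm_pages - break COW on every ksm page in the range, so each
 * virtual address goes back to its own private copy.  Simpler and safer,
 * locking-wise, than trying to unlink rmap_items from the stable tree
 * directly.  Called with mmap_sem held; may return -ERESTARTSYS if a
 * signal is pending, or -ENOMEM from break_ksm().
 */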
843static int unmerge_ksm_pages(struct vm_area_struct *vma,
844 unsigned long start, unsigned long end)
845{
846 unsigned long addr;
847 int err = 0;
848
849 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
850 if (ksm_test_exit(vma->vm_mm))
851 break;
852 if (signal_pending(current))
853 err = -ERESTARTSYS;
854 else
855 err = break_ksm(vma, addr);
856 }
857 return err;
858}
859
860static inline struct stable_node *page_stable_node(struct page *page)
861{
862 return PageKsm(page) ? page_rmapping(page) : NULL;
863}
864
865static inline void set_page_stable_node(struct page *page,
866 struct stable_node *stable_node)
867{
868 page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
869}
870
871#ifdef CONFIG_SYSFS
872
873
874
875static int remove_stable_node(struct stable_node *stable_node)
876{
877 struct page *page;
878 int err;
879
880 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
881 if (!page) {
882
883
884
885 return 0;
886 }
887
888
889
890
891
892
893 err = -EBUSY;
894 if (!page_mapped(page)) {
895
896
897
898
899
900
901
902
903 set_page_stable_node(page, NULL);
904 remove_node_from_stable_tree(stable_node);
905 err = 0;
906 }
907
908 unlock_page(page);
909 put_page(page);
910 return err;
911}
912
913static int remove_stable_node_chain(struct stable_node *stable_node,
914 struct rb_root *root)
915{
916 struct stable_node *dup;
917 struct hlist_node *hlist_safe;
918
919 if (!is_stable_node_chain(stable_node)) {
920 VM_BUG_ON(is_stable_node_dup(stable_node));
921 if (remove_stable_node(stable_node))
922 return true;
923 else
924 return false;
925 }
926
927 hlist_for_each_entry_safe(dup, hlist_safe,
928 &stable_node->hlist, hlist_dup) {
929 VM_BUG_ON(!is_stable_node_dup(dup));
930 if (remove_stable_node(dup))
931 return true;
932 }
933 BUG_ON(!hlist_empty(&stable_node->hlist));
934 free_stable_node_chain(stable_node, root);
935 return false;
936}
937
938static int remove_all_stable_nodes(void)
939{
940 struct stable_node *stable_node, *next;
941 int nid;
942 int err = 0;
943
944 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
945 while (root_stable_tree[nid].rb_node) {
946 stable_node = rb_entry(root_stable_tree[nid].rb_node,
947 struct stable_node, node);
948 if (remove_stable_node_chain(stable_node,
949 root_stable_tree + nid)) {
950 err = -EBUSY;
951 break;
952 }
953 cond_resched();
954 }
955 }
956 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
957 if (remove_stable_node(stable_node))
958 err = -EBUSY;
959 cond_resched();
960 }
961 return err;
962}
963
964static int unmerge_and_remove_all_rmap_items(void)
965{
966 struct mm_slot *mm_slot;
967 struct mm_struct *mm;
968 struct vm_area_struct *vma;
969 int err = 0;
970
971 spin_lock(&ksm_mmlist_lock);
972 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
973 struct mm_slot, mm_list);
974 spin_unlock(&ksm_mmlist_lock);
975
976 for (mm_slot = ksm_scan.mm_slot;
977 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
978 mm = mm_slot->mm;
979 down_read(&mm->mmap_sem);
980 for (vma = mm->mmap; vma; vma = vma->vm_next) {
981 if (ksm_test_exit(mm))
982 break;
983 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
984 continue;
985 err = unmerge_ksm_pages(vma,
986 vma->vm_start, vma->vm_end);
987 if (err)
988 goto error;
989 }
990
991 remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
992 up_read(&mm->mmap_sem);
993
994 spin_lock(&ksm_mmlist_lock);
995 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
996 struct mm_slot, mm_list);
997 if (ksm_test_exit(mm)) {
998 hash_del(&mm_slot->link);
999 list_del(&mm_slot->mm_list);
1000 spin_unlock(&ksm_mmlist_lock);
1001
1002 free_mm_slot(mm_slot);
1003 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1004 mmdrop(mm);
1005 } else
1006 spin_unlock(&ksm_mmlist_lock);
1007 }
1008
1009
1010 remove_all_stable_nodes();
1011 ksm_scan.seqnr = 0;
1012 return 0;
1013
1014error:
1015 up_read(&mm->mmap_sem);
1016 spin_lock(&ksm_mmlist_lock);
1017 ksm_scan.mm_slot = &ksm_mm_head;
1018 spin_unlock(&ksm_mmlist_lock);
1019 return err;
1020}
1021#endif
1022
1023static u32 calc_checksum(struct page *page)
1024{
1025 u32 checksum;
1026 void *addr = kmap_atomic(page);
1027 checksum = xxhash(addr, PAGE_SIZE, 0);
1028 kunmap_atomic(addr);
1029 return checksum;
1030}
1031
1032static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1033 pte_t *orig_pte)
1034{
1035 struct mm_struct *mm = vma->vm_mm;
1036 struct page_vma_mapped_walk pvmw = {
1037 .page = page,
1038 .vma = vma,
1039 };
1040 int swapped;
1041 int err = -EFAULT;
1042 struct mmu_notifier_range range;
1043
1044 pvmw.address = page_address_in_vma(page, vma);
1045 if (pvmw.address == -EFAULT)
1046 goto out;
1047
1048 BUG_ON(PageTransCompound(page));
1049
1050 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
1051 pvmw.address,
1052 pvmw.address + PAGE_SIZE);
1053 mmu_notifier_invalidate_range_start(&range);
1054
1055 if (!page_vma_mapped_walk(&pvmw))
1056 goto out_mn;
1057 if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
1058 goto out_unlock;
1059
1060 if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
1061 (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
1062 mm_tlb_flush_pending(mm)) {
1063 pte_t entry;
1064
1065 swapped = PageSwapCache(page);
1066 flush_cache_page(vma, pvmw.address, page_to_pfn(page));
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081 entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
1082
1083
1084
1085
1086 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
1087 set_pte_at(mm, pvmw.address, pvmw.pte, entry);
1088 goto out_unlock;
1089 }
1090 if (pte_dirty(entry))
1091 set_page_dirty(page);
1092
1093 if (pte_protnone(entry))
1094 entry = pte_mkclean(pte_clear_savedwrite(entry));
1095 else
1096 entry = pte_mkclean(pte_wrprotect(entry));
1097 set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
1098 }
1099 *orig_pte = *pvmw.pte;
1100 err = 0;
1101
1102out_unlock:
1103 page_vma_mapped_walk_done(&pvmw);
1104out_mn:
1105 mmu_notifier_invalidate_range_end(&range);
1106out:
1107 return err;
1108}
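
/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */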
1119static int replace_page(struct vm_area_struct *vma, struct page *page,
1120 struct page *kpage, pte_t orig_pte)
1121{
1122 struct mm_struct *mm = vma->vm_mm;
1123 pmd_t *pmd;
1124 pte_t *ptep;
1125 pte_t newpte;
1126 spinlock_t *ptl;
1127 unsigned long addr;
1128 int err = -EFAULT;
1129 struct mmu_notifier_range range;
1130
1131 addr = page_address_in_vma(page, vma);
1132 if (addr == -EFAULT)
1133 goto out;
1134
1135 pmd = mm_find_pmd(mm, addr);
1136 if (!pmd)
1137 goto out;
1138
1139 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
1140 addr + PAGE_SIZE);
1141 mmu_notifier_invalidate_range_start(&range);
1142
1143 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1144 if (!pte_same(*ptep, orig_pte)) {
1145 pte_unmap_unlock(ptep, ptl);
1146 goto out_mn;
1147 }
1148
1149
1150
1151
1152
1153 if (!is_zero_pfn(page_to_pfn(kpage))) {
1154 get_page(kpage);
1155 page_add_anon_rmap(kpage, vma, addr, false);
1156 newpte = mk_pte(kpage, vma->vm_page_prot);
1157 } else {
1158 newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
1159 vma->vm_page_prot));
1160
1161
1162
1163
1164
1165
1166 dec_mm_counter(mm, MM_ANONPAGES);
1167 }
1168
1169 flush_cache_page(vma, addr, pte_pfn(*ptep));
1170
1171
1172
1173
1174
1175
1176 ptep_clear_flush(vma, addr, ptep);
1177 set_pte_at_notify(mm, addr, ptep, newpte);
1178
1179 page_remove_rmap(page, false);
1180 if (!page_mapped(page))
1181 try_to_free_swap(page);
1182 put_page(page);
1183
1184 pte_unmap_unlock(ptep, ptl);
1185 err = 0;
1186out_mn:
1187 mmu_notifier_invalidate_range_end(&range);
1188out:
1189 return err;
1190}
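
/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */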
1201static int try_to_merge_one_page(struct vm_area_struct *vma,
1202 struct page *page, struct page *kpage)
1203{
1204 pte_t orig_pte = __pte(0);
1205 int err = -EFAULT;
1206
1207 if (page == kpage)
1208 return 0;
1209
1210 if (!PageAnon(page))
1211 goto out;
1212
1213
1214
1215
1216
1217
1218
1219
1220 if (!trylock_page(page))
1221 goto out;
1222
1223 if (PageTransCompound(page)) {
1224 if (split_huge_page(page))
1225 goto out_unlock;
1226 }
1227
1228
1229
1230
1231
1232
1233
1234 if (write_protect_page(vma, page, &orig_pte) == 0) {
1235 if (!kpage) {
1236
1237
1238
1239
1240
1241 set_page_stable_node(page, NULL);
1242 mark_page_accessed(page);
1243
1244
1245
1246
1247 if (!PageDirty(page))
1248 SetPageDirty(page);
1249 err = 0;
1250 } else if (pages_identical(page, kpage))
1251 err = replace_page(vma, page, kpage, orig_pte);
1252 }
1253
1254 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
1255 munlock_vma_page(page);
1256 if (!PageMlocked(kpage)) {
1257 unlock_page(page);
1258 lock_page(kpage);
1259 mlock_vma_page(kpage);
1260 page = kpage;
1261 }
1262 }
1263
1264out_unlock:
1265 unlock_page(page);
1266out:
1267 return err;
1268}
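
/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */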
1276static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1277 struct page *page, struct page *kpage)
1278{
1279 struct mm_struct *mm = rmap_item->mm;
1280 struct vm_area_struct *vma;
1281 int err = -EFAULT;
1282
1283 down_read(&mm->mmap_sem);
1284 vma = find_mergeable_vma(mm, rmap_item->address);
1285 if (!vma)
1286 goto out;
1287
1288 err = try_to_merge_one_page(vma, page, kpage);
1289 if (err)
1290 goto out;
1291
1292
1293 remove_rmap_item_from_tree(rmap_item);
1294
1295
1296 rmap_item->anon_vma = vma->anon_vma;
1297 get_anon_vma(vma->anon_vma);
1298out:
1299 up_read(&mm->mmap_sem);
1300 return err;
1301}
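
/*
 * try_to_merge_two_pages - take two identical pages and prepare
 * them to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two
 * identical pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */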
1313static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
1314 struct page *page,
1315 struct rmap_item *tree_rmap_item,
1316 struct page *tree_page)
1317{
1318 int err;
1319
1320 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
1321 if (!err) {
1322 err = try_to_merge_with_ksm_page(tree_rmap_item,
1323 tree_page, page);
1324
1325
1326
1327
1328 if (err)
1329 break_cow(rmap_item);
1330 }
1331 return err ? NULL : page;
1332}
1333
1334static __always_inline
1335bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
1336{
1337 VM_BUG_ON(stable_node->rmap_hlist_len < 0);
1338
1339
1340
1341
1342
1343
1344 return stable_node->rmap_hlist_len &&
1345 stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
1346}
1347
1348static __always_inline
1349bool is_page_sharing_candidate(struct stable_node *stable_node)
1350{
1351 return __is_page_sharing_candidate(stable_node, 0);
1352}
1353
1354static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1355 struct stable_node **_stable_node,
1356 struct rb_root *root,
1357 bool prune_stale_stable_nodes)
1358{
1359 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1360 struct hlist_node *hlist_safe;
1361 struct page *_tree_page, *tree_page = NULL;
1362 int nr = 0;
1363 int found_rmap_hlist_len;
1364
1365 if (!prune_stale_stable_nodes ||
1366 time_before(jiffies, stable_node->chain_prune_time +
1367 msecs_to_jiffies(
1368 ksm_stable_node_chains_prune_millisecs)))
1369 prune_stale_stable_nodes = false;
1370 else
1371 stable_node->chain_prune_time = jiffies;
1372
1373 hlist_for_each_entry_safe(dup, hlist_safe,
1374 &stable_node->hlist, hlist_dup) {
1375 cond_resched();
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386 _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
1387 if (!_tree_page)
1388 continue;
1389 nr += 1;
1390 if (is_page_sharing_candidate(dup)) {
1391 if (!found ||
1392 dup->rmap_hlist_len > found_rmap_hlist_len) {
1393 if (found)
1394 put_page(tree_page);
1395 found = dup;
1396 found_rmap_hlist_len = found->rmap_hlist_len;
1397 tree_page = _tree_page;
1398
1399
1400 if (!prune_stale_stable_nodes)
1401 break;
1402 continue;
1403 }
1404 }
1405 put_page(_tree_page);
1406 }
1407
1408 if (found) {
1409
1410
1411
1412
1413
1414
1415 if (prune_stale_stable_nodes && nr == 1) {
1416
1417
1418
1419
1420
1421
1422 BUG_ON(stable_node->hlist.first->next);
1423
1424
1425
1426
1427
1428 rb_replace_node(&stable_node->node, &found->node,
1429 root);
1430 free_stable_node(stable_node);
1431 ksm_stable_node_chains--;
1432 ksm_stable_node_dups--;
1433
1434
1435
1436
1437
1438 *_stable_node = found;
1439
1440
1441
1442
1443
1444
1445 stable_node = NULL;
1446 } else if (stable_node->hlist.first != &found->hlist_dup &&
1447 __is_page_sharing_candidate(found, 1)) {
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463 hlist_del(&found->hlist_dup);
1464 hlist_add_head(&found->hlist_dup,
1465 &stable_node->hlist);
1466 }
1467 }
1468
1469 *_stable_node_dup = found;
1470 return tree_page;
1471}
1472
1473static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1474 struct rb_root *root)
1475{
1476 if (!is_stable_node_chain(stable_node))
1477 return stable_node;
1478 if (hlist_empty(&stable_node->hlist)) {
1479 free_stable_node_chain(stable_node, root);
1480 return NULL;
1481 }
1482 return hlist_entry(stable_node->hlist.first,
1483 typeof(*stable_node), hlist_dup);
1484}
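
/*
 * __stable_node_chain - pick the stable_node dup (or the plain stable_node
 * itself, if it is not a chain head) whose ksm page should be used for
 * merging.  On success *_stable_node_dup points at the chosen dup and its
 * ksm page is returned with a reference held; NULL is returned if there is
 * no dup whose page can currently be used (none is a sharing candidate, or
 * its page has been zapped).  With prune_stale_stable_nodes, stale dups may
 * be garbage collected and a chain left with a single dup may be collapsed
 * back to a plain stable_node, updating *_stable_node.
 */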
1500static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
1501 struct stable_node **_stable_node,
1502 struct rb_root *root,
1503 bool prune_stale_stable_nodes)
1504{
1505 struct stable_node *stable_node = *_stable_node;
1506 if (!is_stable_node_chain(stable_node)) {
1507 if (is_page_sharing_candidate(stable_node)) {
1508 *_stable_node_dup = stable_node;
1509 return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
1510 }
1511
1512
1513
1514
1515 *_stable_node_dup = NULL;
1516 return NULL;
1517 }
1518 return stable_node_dup(_stable_node_dup, _stable_node, root,
1519 prune_stale_stable_nodes);
1520}
1521
1522static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
1523 struct stable_node **s_n,
1524 struct rb_root *root)
1525{
1526 return __stable_node_chain(s_n_d, s_n, root, true);
1527}
1528
1529static __always_inline struct page *chain(struct stable_node **s_n_d,
1530 struct stable_node *s_n,
1531 struct rb_root *root)
1532{
1533 struct stable_node *old_stable_node = s_n;
1534 struct page *tree_page;
1535
1536 tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
1537
1538 VM_BUG_ON(s_n != old_stable_node);
1539 return tree_page;
1540}
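
/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * It returns a reference to the ksm page of identical content if found
 * (or ERR_PTR(-EBUSY) if that page's lock could not be taken), NULL
 * otherwise.
 */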
1551static struct page *stable_tree_search(struct page *page)
1552{
1553 int nid;
1554 struct rb_root *root;
1555 struct rb_node **new;
1556 struct rb_node *parent;
1557 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1558 struct stable_node *page_node;
1559
1560 page_node = page_stable_node(page);
1561 if (page_node && page_node->head != &migrate_nodes) {
1562
1563 get_page(page);
1564 return page;
1565 }
1566
1567 nid = get_kpfn_nid(page_to_pfn(page));
1568 root = root_stable_tree + nid;
1569again:
1570 new = &root->rb_node;
1571 parent = NULL;
1572
1573 while (*new) {
1574 struct page *tree_page;
1575 int ret;
1576
1577 cond_resched();
1578 stable_node = rb_entry(*new, struct stable_node, node);
1579 stable_node_any = NULL;
1580 tree_page = chain_prune(&stable_node_dup, &stable_node, root);
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593 if (!stable_node_dup) {
1594
1595
1596
1597
1598
1599 stable_node_any = stable_node_dup_any(stable_node,
1600 root);
1601 if (!stable_node_any) {
1602
1603 goto again;
1604 }
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614 tree_page = get_ksm_page(stable_node_any,
1615 GET_KSM_PAGE_NOLOCK);
1616 }
1617 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1618 if (!tree_page) {
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628 goto again;
1629 }
1630
1631 ret = memcmp_pages(page, tree_page);
1632 put_page(tree_page);
1633
1634 parent = *new;
1635 if (ret < 0)
1636 new = &parent->rb_left;
1637 else if (ret > 0)
1638 new = &parent->rb_right;
1639 else {
1640 if (page_node) {
1641 VM_BUG_ON(page_node->head != &migrate_nodes);
1642
1643
1644
1645
1646
1647
1648 if (page_mapcount(page) > 1)
1649 goto chain_append;
1650 }
1651
1652 if (!stable_node_dup) {
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665 return NULL;
1666 }
1667
1668
1669
1670
1671
1672
1673
1674
1675 tree_page = get_ksm_page(stable_node_dup,
1676 GET_KSM_PAGE_TRYLOCK);
1677
1678 if (PTR_ERR(tree_page) == -EBUSY)
1679 return ERR_PTR(-EBUSY);
1680
1681 if (unlikely(!tree_page))
1682
1683
1684
1685
1686 goto again;
1687 unlock_page(tree_page);
1688
1689 if (get_kpfn_nid(stable_node_dup->kpfn) !=
1690 NUMA(stable_node_dup->nid)) {
1691 put_page(tree_page);
1692 goto replace;
1693 }
1694 return tree_page;
1695 }
1696 }
1697
1698 if (!page_node)
1699 return NULL;
1700
1701 list_del(&page_node->list);
1702 DO_NUMA(page_node->nid = nid);
1703 rb_link_node(&page_node->node, parent, new);
1704 rb_insert_color(&page_node->node, root);
1705out:
1706 if (is_page_sharing_candidate(page_node)) {
1707 get_page(page);
1708 return page;
1709 } else
1710 return NULL;
1711
1712replace:
1713
1714
1715
1716
1717
1718
1719
1720
1721 if (stable_node_dup == stable_node) {
1722 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1723 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1724
1725 if (page_node) {
1726 VM_BUG_ON(page_node->head != &migrate_nodes);
1727 list_del(&page_node->list);
1728 DO_NUMA(page_node->nid = nid);
1729 rb_replace_node(&stable_node_dup->node,
1730 &page_node->node,
1731 root);
1732 if (is_page_sharing_candidate(page_node))
1733 get_page(page);
1734 else
1735 page = NULL;
1736 } else {
1737 rb_erase(&stable_node_dup->node, root);
1738 page = NULL;
1739 }
1740 } else {
1741 VM_BUG_ON(!is_stable_node_chain(stable_node));
1742 __stable_node_dup_del(stable_node_dup);
1743 if (page_node) {
1744 VM_BUG_ON(page_node->head != &migrate_nodes);
1745 list_del(&page_node->list);
1746 DO_NUMA(page_node->nid = nid);
1747 stable_node_chain_add_dup(page_node, stable_node);
1748 if (is_page_sharing_candidate(page_node))
1749 get_page(page);
1750 else
1751 page = NULL;
1752 } else {
1753 page = NULL;
1754 }
1755 }
1756 stable_node_dup->head = &migrate_nodes;
1757 list_add(&stable_node_dup->list, stable_node_dup->head);
1758 return page;
1759
1760chain_append:
1761
1762 if (!stable_node_dup)
1763 stable_node_dup = stable_node_any;
1764
1765
1766
1767
1768
1769
1770
1771
1772 if (stable_node_dup == stable_node) {
1773 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1774 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1775
1776 stable_node = alloc_stable_node_chain(stable_node_dup,
1777 root);
1778 if (!stable_node)
1779 return NULL;
1780 }
1781
1782
1783
1784
1785
1786
1787 VM_BUG_ON(!is_stable_node_chain(stable_node));
1788 VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
1789 VM_BUG_ON(page_node->head != &migrate_nodes);
1790 list_del(&page_node->list);
1791 DO_NUMA(page_node->nid = nid);
1792 stable_node_chain_add_dup(page_node, stable_node);
1793 goto out;
1794}
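
/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */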
1803static struct stable_node *stable_tree_insert(struct page *kpage)
1804{
1805 int nid;
1806 unsigned long kpfn;
1807 struct rb_root *root;
1808 struct rb_node **new;
1809 struct rb_node *parent;
1810 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1811 bool need_chain = false;
1812
1813 kpfn = page_to_pfn(kpage);
1814 nid = get_kpfn_nid(kpfn);
1815 root = root_stable_tree + nid;
1816again:
1817 parent = NULL;
1818 new = &root->rb_node;
1819
1820 while (*new) {
1821 struct page *tree_page;
1822 int ret;
1823
1824 cond_resched();
1825 stable_node = rb_entry(*new, struct stable_node, node);
1826 stable_node_any = NULL;
1827 tree_page = chain(&stable_node_dup, stable_node, root);
1828 if (!stable_node_dup) {
1829
1830
1831
1832
1833
1834 stable_node_any = stable_node_dup_any(stable_node,
1835 root);
1836 if (!stable_node_any) {
1837
1838 goto again;
1839 }
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849 tree_page = get_ksm_page(stable_node_any,
1850 GET_KSM_PAGE_NOLOCK);
1851 }
1852 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1853 if (!tree_page) {
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863 goto again;
1864 }
1865
1866 ret = memcmp_pages(kpage, tree_page);
1867 put_page(tree_page);
1868
1869 parent = *new;
1870 if (ret < 0)
1871 new = &parent->rb_left;
1872 else if (ret > 0)
1873 new = &parent->rb_right;
1874 else {
1875 need_chain = true;
1876 break;
1877 }
1878 }
1879
1880 stable_node_dup = alloc_stable_node();
1881 if (!stable_node_dup)
1882 return NULL;
1883
1884 INIT_HLIST_HEAD(&stable_node_dup->hlist);
1885 stable_node_dup->kpfn = kpfn;
1886 set_page_stable_node(kpage, stable_node_dup);
1887 stable_node_dup->rmap_hlist_len = 0;
1888 DO_NUMA(stable_node_dup->nid = nid);
1889 if (!need_chain) {
1890 rb_link_node(&stable_node_dup->node, parent, new);
1891 rb_insert_color(&stable_node_dup->node, root);
1892 } else {
1893 if (!is_stable_node_chain(stable_node)) {
1894 struct stable_node *orig = stable_node;
1895
1896 stable_node = alloc_stable_node_chain(orig, root);
1897 if (!stable_node) {
1898 free_stable_node(stable_node_dup);
1899 return NULL;
1900 }
1901 }
1902 stable_node_chain_add_dup(stable_node_dup, stable_node);
1903 }
1904
1905 return stable_node_dup;
1906}
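
/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */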
1922static
1923struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1924 struct page *page,
1925 struct page **tree_pagep)
1926{
1927 struct rb_node **new;
1928 struct rb_root *root;
1929 struct rb_node *parent = NULL;
1930 int nid;
1931
1932 nid = get_kpfn_nid(page_to_pfn(page));
1933 root = root_unstable_tree + nid;
1934 new = &root->rb_node;
1935
1936 while (*new) {
1937 struct rmap_item *tree_rmap_item;
1938 struct page *tree_page;
1939 int ret;
1940
1941 cond_resched();
1942 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1943 tree_page = get_mergeable_page(tree_rmap_item);
1944 if (!tree_page)
1945 return NULL;
1946
1947
1948
1949
1950 if (page == tree_page) {
1951 put_page(tree_page);
1952 return NULL;
1953 }
1954
1955 ret = memcmp_pages(page, tree_page);
1956
1957 parent = *new;
1958 if (ret < 0) {
1959 put_page(tree_page);
1960 new = &parent->rb_left;
1961 } else if (ret > 0) {
1962 put_page(tree_page);
1963 new = &parent->rb_right;
1964 } else if (!ksm_merge_across_nodes &&
1965 page_to_nid(tree_page) != nid) {
1966
1967
1968
1969
1970
1971 put_page(tree_page);
1972 return NULL;
1973 } else {
1974 *tree_pagep = tree_page;
1975 return tree_rmap_item;
1976 }
1977 }
1978
1979 rmap_item->address |= UNSTABLE_FLAG;
1980 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1981 DO_NUMA(rmap_item->nid = nid);
1982 rb_link_node(&rmap_item->node, parent, new);
1983 rb_insert_color(&rmap_item->node, root);
1984
1985 ksm_pages_unshared++;
1986 return NULL;
1987}
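
/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, and
 * instrument these rmap_items so that they can be located later.
 */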
1994static void stable_tree_append(struct rmap_item *rmap_item,
1995 struct stable_node *stable_node,
1996 bool max_page_sharing_bypass)
1997{
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008 BUG_ON(stable_node->rmap_hlist_len < 0);
2009
2010 stable_node->rmap_hlist_len++;
2011 if (!max_page_sharing_bypass)
2012
2013 WARN_ON_ONCE(stable_node->rmap_hlist_len >
2014 ksm_max_page_sharing);
2015
2016 rmap_item->head = stable_node;
2017 rmap_item->address |= STABLE_FLAG;
2018 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2019
2020 if (rmap_item->hlist.next)
2021 ksm_pages_sharing++;
2022 else
2023 ksm_pages_shared++;
2024}
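
/*
 * cmp_and_merge_page - first see if page can be merged into the stable
 * tree; if not, compare checksum to previous and if it's the same, see if
 * page can be merged with the page in the unstable tree, or inserted there
 * instead.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */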
2035static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2036{
2037 struct mm_struct *mm = rmap_item->mm;
2038 struct rmap_item *tree_rmap_item;
2039 struct page *tree_page = NULL;
2040 struct stable_node *stable_node;
2041 struct page *kpage;
2042 unsigned int checksum;
2043 int err;
2044 bool max_page_sharing_bypass = false;
2045
2046 stable_node = page_stable_node(page);
2047 if (stable_node) {
2048 if (stable_node->head != &migrate_nodes &&
2049 get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2050 NUMA(stable_node->nid)) {
2051 stable_node_dup_del(stable_node);
2052 stable_node->head = &migrate_nodes;
2053 list_add(&stable_node->list, stable_node->head);
2054 }
2055 if (stable_node->head != &migrate_nodes &&
2056 rmap_item->head == stable_node)
2057 return;
2058
2059
2060
2061
2062 if (!is_page_sharing_candidate(stable_node))
2063 max_page_sharing_bypass = true;
2064 }
2065
2066
2067 kpage = stable_tree_search(page);
2068 if (kpage == page && rmap_item->head == stable_node) {
2069 put_page(kpage);
2070 return;
2071 }
2072
2073 remove_rmap_item_from_tree(rmap_item);
2074
2075 if (kpage) {
2076 if (PTR_ERR(kpage) == -EBUSY)
2077 return;
2078
2079 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2080 if (!err) {
2081
2082
2083
2084
2085 lock_page(kpage);
2086 stable_tree_append(rmap_item, page_stable_node(kpage),
2087 max_page_sharing_bypass);
2088 unlock_page(kpage);
2089 }
2090 put_page(kpage);
2091 return;
2092 }
2093
2094
2095
2096
2097
2098
2099
2100 checksum = calc_checksum(page);
2101 if (rmap_item->oldchecksum != checksum) {
2102 rmap_item->oldchecksum = checksum;
2103 return;
2104 }
2105
2106
2107
2108
2109
2110 if (ksm_use_zero_pages && (checksum == zero_checksum)) {
2111 struct vm_area_struct *vma;
2112
2113 down_read(&mm->mmap_sem);
2114 vma = find_mergeable_vma(mm, rmap_item->address);
2115 err = try_to_merge_one_page(vma, page,
2116 ZERO_PAGE(rmap_item->address));
2117 up_read(&mm->mmap_sem);
2118
2119
2120
2121
2122 if (!err)
2123 return;
2124 }
2125 tree_rmap_item =
2126 unstable_tree_search_insert(rmap_item, page, &tree_page);
2127 if (tree_rmap_item) {
2128 bool split;
2129
2130 kpage = try_to_merge_two_pages(rmap_item, page,
2131 tree_rmap_item, tree_page);
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142 split = PageTransCompound(page)
2143 && compound_head(page) == compound_head(tree_page);
2144 put_page(tree_page);
2145 if (kpage) {
2146
2147
2148
2149
2150 lock_page(kpage);
2151 stable_node = stable_tree_insert(kpage);
2152 if (stable_node) {
2153 stable_tree_append(tree_rmap_item, stable_node,
2154 false);
2155 stable_tree_append(rmap_item, stable_node,
2156 false);
2157 }
2158 unlock_page(kpage);
2159
2160
2161
2162
2163
2164
2165
2166 if (!stable_node) {
2167 break_cow(tree_rmap_item);
2168 break_cow(rmap_item);
2169 }
2170 } else if (split) {
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180 if (!trylock_page(page))
2181 return;
2182 split_huge_page(page);
2183 unlock_page(page);
2184 }
2185 }
2186}
2187
2188static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2189 struct rmap_item **rmap_list,
2190 unsigned long addr)
2191{
2192 struct rmap_item *rmap_item;
2193
2194 while (*rmap_list) {
2195 rmap_item = *rmap_list;
2196 if ((rmap_item->address & PAGE_MASK) == addr)
2197 return rmap_item;
2198 if (rmap_item->address > addr)
2199 break;
2200 *rmap_list = rmap_item->rmap_list;
2201 remove_rmap_item_from_tree(rmap_item);
2202 free_rmap_item(rmap_item);
2203 }
2204
2205 rmap_item = alloc_rmap_item();
2206 if (rmap_item) {
2207
2208 rmap_item->mm = mm_slot->mm;
2209 rmap_item->address = addr;
2210 rmap_item->rmap_list = *rmap_list;
2211 *rmap_list = rmap_item;
2212 }
2213 return rmap_item;
2214}
2215
2216static struct rmap_item *scan_get_next_rmap_item(struct page **page)
2217{
2218 struct mm_struct *mm;
2219 struct mm_slot *slot;
2220 struct vm_area_struct *vma;
2221 struct rmap_item *rmap_item;
2222 int nid;
2223
2224 if (list_empty(&ksm_mm_head.mm_list))
2225 return NULL;
2226
2227 slot = ksm_scan.mm_slot;
2228 if (slot == &ksm_mm_head) {
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239 lru_add_drain_all();
2240
2241
2242
2243
2244
2245
2246
2247 if (!ksm_merge_across_nodes) {
2248 struct stable_node *stable_node, *next;
2249 struct page *page;
2250
2251 list_for_each_entry_safe(stable_node, next,
2252 &migrate_nodes, list) {
2253 page = get_ksm_page(stable_node,
2254 GET_KSM_PAGE_NOLOCK);
2255 if (page)
2256 put_page(page);
2257 cond_resched();
2258 }
2259 }
2260
2261 for (nid = 0; nid < ksm_nr_node_ids; nid++)
2262 root_unstable_tree[nid] = RB_ROOT;
2263
2264 spin_lock(&ksm_mmlist_lock);
2265 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
2266 ksm_scan.mm_slot = slot;
2267 spin_unlock(&ksm_mmlist_lock);
2268
2269
2270
2271
2272 if (slot == &ksm_mm_head)
2273 return NULL;
2274next_mm:
2275 ksm_scan.address = 0;
2276 ksm_scan.rmap_list = &slot->rmap_list;
2277 }
2278
2279 mm = slot->mm;
2280 down_read(&mm->mmap_sem);
2281 if (ksm_test_exit(mm))
2282 vma = NULL;
2283 else
2284 vma = find_vma(mm, ksm_scan.address);
2285
2286 for (; vma; vma = vma->vm_next) {
2287 if (!(vma->vm_flags & VM_MERGEABLE))
2288 continue;
2289 if (ksm_scan.address < vma->vm_start)
2290 ksm_scan.address = vma->vm_start;
2291 if (!vma->anon_vma)
2292 ksm_scan.address = vma->vm_end;
2293
2294 while (ksm_scan.address < vma->vm_end) {
2295 if (ksm_test_exit(mm))
2296 break;
2297 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
2298 if (IS_ERR_OR_NULL(*page)) {
2299 ksm_scan.address += PAGE_SIZE;
2300 cond_resched();
2301 continue;
2302 }
2303 if (PageAnon(*page)) {
2304 flush_anon_page(vma, *page, ksm_scan.address);
2305 flush_dcache_page(*page);
2306 rmap_item = get_next_rmap_item(slot,
2307 ksm_scan.rmap_list, ksm_scan.address);
2308 if (rmap_item) {
2309 ksm_scan.rmap_list =
2310 &rmap_item->rmap_list;
2311 ksm_scan.address += PAGE_SIZE;
2312 } else
2313 put_page(*page);
2314 up_read(&mm->mmap_sem);
2315 return rmap_item;
2316 }
2317 put_page(*page);
2318 ksm_scan.address += PAGE_SIZE;
2319 cond_resched();
2320 }
2321 }
2322
2323 if (ksm_test_exit(mm)) {
2324 ksm_scan.address = 0;
2325 ksm_scan.rmap_list = &slot->rmap_list;
2326 }
2327
2328
2329
2330
2331 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
2332
2333 spin_lock(&ksm_mmlist_lock);
2334 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
2335 struct mm_slot, mm_list);
2336 if (ksm_scan.address == 0) {
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346 hash_del(&slot->link);
2347 list_del(&slot->mm_list);
2348 spin_unlock(&ksm_mmlist_lock);
2349
2350 free_mm_slot(slot);
2351 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2352 up_read(&mm->mmap_sem);
2353 mmdrop(mm);
2354 } else {
2355 up_read(&mm->mmap_sem);
2356
2357
2358
2359
2360
2361
2362
2363 spin_unlock(&ksm_mmlist_lock);
2364 }
2365
2366
2367 slot = ksm_scan.mm_slot;
2368 if (slot != &ksm_mm_head)
2369 goto next_mm;
2370
2371 ksm_scan.seqnr++;
2372 return NULL;
2373}
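
/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 */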
2379static void ksm_do_scan(unsigned int scan_npages)
2380{
2381 struct rmap_item *rmap_item;
2382 struct page *uninitialized_var(page);
2383
2384 while (scan_npages-- && likely(!freezing(current))) {
2385 cond_resched();
2386 rmap_item = scan_get_next_rmap_item(&page);
2387 if (!rmap_item)
2388 return;
2389 cmp_and_merge_page(page, rmap_item);
2390 put_page(page);
2391 }
2392}
2393
2394static int ksmd_should_run(void)
2395{
2396 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
2397}
2398
2399static int ksm_scan_thread(void *nothing)
2400{
2401 unsigned int sleep_ms;
2402
2403 set_freezable();
2404 set_user_nice(current, 5);
2405
2406 while (!kthread_should_stop()) {
2407 mutex_lock(&ksm_thread_mutex);
2408 wait_while_offlining();
2409 if (ksmd_should_run())
2410 ksm_do_scan(ksm_thread_pages_to_scan);
2411 mutex_unlock(&ksm_thread_mutex);
2412
2413 try_to_freeze();
2414
2415 if (ksmd_should_run()) {
2416 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2417 wait_event_interruptible_timeout(ksm_iter_wait,
2418 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2419 msecs_to_jiffies(sleep_ms));
2420 } else {
2421 wait_event_freezable(ksm_thread_wait,
2422 ksmd_should_run() || kthread_should_stop());
2423 }
2424 }
2425 return 0;
2426}
2427
2428int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2429 unsigned long end, int advice, unsigned long *vm_flags)
2430{
2431 struct mm_struct *mm = vma->vm_mm;
2432 int err;
2433
2434 switch (advice) {
2435 case MADV_MERGEABLE:
2436
2437
2438
2439 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
2440 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
2441 VM_HUGETLB | VM_MIXEDMAP))
2442 return 0;
2443
2444 if (vma_is_dax(vma))
2445 return 0;
2446
2447#ifdef VM_SAO
2448 if (*vm_flags & VM_SAO)
2449 return 0;
2450#endif
2451#ifdef VM_SPARC_ADI
2452 if (*vm_flags & VM_SPARC_ADI)
2453 return 0;
2454#endif
2455
2456 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2457 err = __ksm_enter(mm);
2458 if (err)
2459 return err;
2460 }
2461
2462 *vm_flags |= VM_MERGEABLE;
2463 break;
2464
2465 case MADV_UNMERGEABLE:
2466 if (!(*vm_flags & VM_MERGEABLE))
2467 return 0;
2468
2469 if (vma->anon_vma) {
2470 err = unmerge_ksm_pages(vma, start, end);
2471 if (err)
2472 return err;
2473 }
2474
2475 *vm_flags &= ~VM_MERGEABLE;
2476 break;
2477 }
2478
2479 return 0;
2480}
2481EXPORT_SYMBOL_GPL(ksm_madvise);
2482
2483int __ksm_enter(struct mm_struct *mm)
2484{
2485 struct mm_slot *mm_slot;
2486 int needs_wakeup;
2487
2488 mm_slot = alloc_mm_slot();
2489 if (!mm_slot)
2490 return -ENOMEM;
2491
2492
2493 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
2494
2495 spin_lock(&ksm_mmlist_lock);
2496 insert_to_mm_slots_hash(mm, mm_slot);
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507 if (ksm_run & KSM_RUN_UNMERGE)
2508 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
2509 else
2510 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
2511 spin_unlock(&ksm_mmlist_lock);
2512
2513 set_bit(MMF_VM_MERGEABLE, &mm->flags);
2514 mmgrab(mm);
2515
2516 if (needs_wakeup)
2517 wake_up_interruptible(&ksm_thread_wait);
2518
2519 return 0;
2520}
2521
2522void __ksm_exit(struct mm_struct *mm)
2523{
2524 struct mm_slot *mm_slot;
2525 int easy_to_free = 0;
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536 spin_lock(&ksm_mmlist_lock);
2537 mm_slot = get_mm_slot(mm);
2538 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
2539 if (!mm_slot->rmap_list) {
2540 hash_del(&mm_slot->link);
2541 list_del(&mm_slot->mm_list);
2542 easy_to_free = 1;
2543 } else {
2544 list_move(&mm_slot->mm_list,
2545 &ksm_scan.mm_slot->mm_list);
2546 }
2547 }
2548 spin_unlock(&ksm_mmlist_lock);
2549
2550 if (easy_to_free) {
2551 free_mm_slot(mm_slot);
2552 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2553 mmdrop(mm);
2554 } else if (mm_slot) {
2555 down_write(&mm->mmap_sem);
2556 up_write(&mm->mmap_sem);
2557 }
2558}
2559
2560struct page *ksm_might_need_to_copy(struct page *page,
2561 struct vm_area_struct *vma, unsigned long address)
2562{
2563 struct anon_vma *anon_vma = page_anon_vma(page);
2564 struct page *new_page;
2565
2566 if (PageKsm(page)) {
2567 if (page_stable_node(page) &&
2568 !(ksm_run & KSM_RUN_UNMERGE))
2569 return page;
2570 } else if (!anon_vma) {
2571 return page;
2572 } else if (anon_vma->root == vma->anon_vma->root &&
2573 page->index == linear_page_index(vma, address)) {
2574 return page;
2575 }
2576 if (!PageUptodate(page))
2577 return page;
2578
2579 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2580 if (new_page) {
2581 copy_user_highpage(new_page, page, address, vma);
2582
2583 SetPageDirty(new_page);
2584 __SetPageUptodate(new_page);
2585 __SetPageLocked(new_page);
2586 }
2587
2588 return new_page;
2589}
2590
2591void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
2592{
2593 struct stable_node *stable_node;
2594 struct rmap_item *rmap_item;
2595 int search_new_forks = 0;
2596
2597 VM_BUG_ON_PAGE(!PageKsm(page), page);
2598
2599
2600
2601
2602
2603 VM_BUG_ON_PAGE(!PageLocked(page), page);
2604
2605 stable_node = page_stable_node(page);
2606 if (!stable_node)
2607 return;
2608again:
2609 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2610 struct anon_vma *anon_vma = rmap_item->anon_vma;
2611 struct anon_vma_chain *vmac;
2612 struct vm_area_struct *vma;
2613
2614 cond_resched();
2615 anon_vma_lock_read(anon_vma);
2616 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2617 0, ULONG_MAX) {
2618 unsigned long addr;
2619
2620 cond_resched();
2621 vma = vmac->vma;
2622
2623
2624 addr = rmap_item->address & ~KSM_FLAG_MASK;
2625
2626 if (addr < vma->vm_start || addr >= vma->vm_end)
2627 continue;
2628
2629
2630
2631
2632
2633
2634 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2635 continue;
2636
2637 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2638 continue;
2639
2640 if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
2641 anon_vma_unlock_read(anon_vma);
2642 return;
2643 }
2644 if (rwc->done && rwc->done(page)) {
2645 anon_vma_unlock_read(anon_vma);
2646 return;
2647 }
2648 }
2649 anon_vma_unlock_read(anon_vma);
2650 }
2651 if (!search_new_forks++)
2652 goto again;
2653}
2654
2655bool reuse_ksm_page(struct page *page,
2656 struct vm_area_struct *vma,
2657 unsigned long address)
2658{
2659#ifdef CONFIG_DEBUG_VM
2660 if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
2661 WARN_ON(!page_mapped(page)) ||
2662 WARN_ON(!PageLocked(page))) {
2663 dump_page(page, "reuse_ksm_page");
2664 return false;
2665 }
2666#endif
2667
2668 if (PageSwapCache(page) || !page_stable_node(page))
2669 return false;
2670
2671 if (!page_ref_freeze(page, 1))
2672 return false;
2673
2674 page_move_anon_rmap(page, vma);
2675 page->index = linear_page_index(vma, address);
2676 page_ref_unfreeze(page, 1);
2677
2678 return true;
2679}
2680#ifdef CONFIG_MIGRATION
2681void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2682{
2683 struct stable_node *stable_node;
2684
2685 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
2686 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
2687 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
2688
2689 stable_node = page_stable_node(newpage);
2690 if (stable_node) {
2691 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
2692 stable_node->kpfn = page_to_pfn(newpage);
2693
2694
2695
2696
2697
2698
2699 smp_wmb();
2700 set_page_stable_node(oldpage, NULL);
2701 }
2702}
2703#endif
2704
2705#ifdef CONFIG_MEMORY_HOTREMOVE
2706static void wait_while_offlining(void)
2707{
2708 while (ksm_run & KSM_RUN_OFFLINE) {
2709 mutex_unlock(&ksm_thread_mutex);
2710 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2711 TASK_UNINTERRUPTIBLE);
2712 mutex_lock(&ksm_thread_mutex);
2713 }
2714}
2715
2716static bool stable_node_dup_remove_range(struct stable_node *stable_node,
2717 unsigned long start_pfn,
2718 unsigned long end_pfn)
2719{
2720 if (stable_node->kpfn >= start_pfn &&
2721 stable_node->kpfn < end_pfn) {
2722
2723
2724
2725
2726 remove_node_from_stable_tree(stable_node);
2727 return true;
2728 }
2729 return false;
2730}
2731
2732static bool stable_node_chain_remove_range(struct stable_node *stable_node,
2733 unsigned long start_pfn,
2734 unsigned long end_pfn,
2735 struct rb_root *root)
2736{
2737 struct stable_node *dup;
2738 struct hlist_node *hlist_safe;
2739
2740 if (!is_stable_node_chain(stable_node)) {
2741 VM_BUG_ON(is_stable_node_dup(stable_node));
2742 return stable_node_dup_remove_range(stable_node, start_pfn,
2743 end_pfn);
2744 }
2745
2746 hlist_for_each_entry_safe(dup, hlist_safe,
2747 &stable_node->hlist, hlist_dup) {
2748 VM_BUG_ON(!is_stable_node_dup(dup));
2749 stable_node_dup_remove_range(dup, start_pfn, end_pfn);
2750 }
2751 if (hlist_empty(&stable_node->hlist)) {
2752 free_stable_node_chain(stable_node, root);
2753 return true;
2754 } else
2755 return false;
2756}
2757
2758static void ksm_check_stable_tree(unsigned long start_pfn,
2759 unsigned long end_pfn)
2760{
2761 struct stable_node *stable_node, *next;
2762 struct rb_node *node;
2763 int nid;
2764
2765 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2766 node = rb_first(root_stable_tree + nid);
2767 while (node) {
2768 stable_node = rb_entry(node, struct stable_node, node);
2769 if (stable_node_chain_remove_range(stable_node,
2770 start_pfn, end_pfn,
2771 root_stable_tree +
2772 nid))
2773 node = rb_first(root_stable_tree + nid);
2774 else
2775 node = rb_next(node);
2776 cond_resched();
2777 }
2778 }
2779 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
2780 if (stable_node->kpfn >= start_pfn &&
2781 stable_node->kpfn < end_pfn)
2782 remove_node_from_stable_tree(stable_node);
2783 cond_resched();
2784 }
2785}
2786
2787static int ksm_memory_callback(struct notifier_block *self,
2788 unsigned long action, void *arg)
2789{
2790 struct memory_notify *mn = arg;
2791
2792 switch (action) {
2793 case MEM_GOING_OFFLINE:
2794
2795
2796
2797
2798
2799
2800
2801 mutex_lock(&ksm_thread_mutex);
2802 ksm_run |= KSM_RUN_OFFLINE;
2803 mutex_unlock(&ksm_thread_mutex);
2804 break;
2805
2806 case MEM_OFFLINE:
2807
2808
2809
2810
2811
2812
2813
2814 ksm_check_stable_tree(mn->start_pfn,
2815 mn->start_pfn + mn->nr_pages);
2816
2817
2818 case MEM_CANCEL_OFFLINE:
2819 mutex_lock(&ksm_thread_mutex);
2820 ksm_run &= ~KSM_RUN_OFFLINE;
2821 mutex_unlock(&ksm_thread_mutex);
2822
2823 smp_mb();
2824 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
2825 break;
2826 }
2827 return NOTIFY_OK;
2828}
2829#else
2830static void wait_while_offlining(void)
2831{
2832}
2833#endif
2834
2835#ifdef CONFIG_SYSFS
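/* sysfs interface: tunables and statistics under /sys/kernel/mm/ksm/ */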
2840#define KSM_ATTR_RO(_name) \
2841 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2842#define KSM_ATTR(_name) \
2843 static struct kobj_attribute _name##_attr = \
2844 __ATTR(_name, 0644, _name##_show, _name##_store)
2845
static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;
	wake_up_interruptible(&ksm_iter_wait);

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = kstrtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list, for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many tree roots as may be
			 * needed.  Stable and unstable trees are allocated
			 * together, one array of each per NUMA node.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* a zeroed rb_root is taken to be a valid empty tree */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif

static ssize_t use_zero_pages_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_use_zero_pages);
}

static ssize_t use_zero_pages_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return -EINVAL;

	ksm_use_zero_pages = value;

	return count;
}
KSM_ATTR(use_zero_pages);

static ssize_t max_page_sharing_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_max_page_sharing);
}

static ssize_t max_page_sharing_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	int knob;

	err = kstrtoint(buf, 10, &knob);
	if (err)
		return err;
	/*
	 * When a KSM page is created it is shared by 2 mappings, so 2 is
	 * the minimum; this being a signed comparison, it also rejects
	 * negative values.
	 */
	if (knob < 2)
		return -EINVAL;

	if (READ_ONCE(ksm_max_page_sharing) == knob)
		return count;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_max_page_sharing != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else
			ksm_max_page_sharing = knob;
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(max_page_sharing);

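/*
 * Read-only statistics: pages_shared counts the KSM pages currently in
 * use, pages_sharing the additional mapped sites sharing them, and
 * pages_unshared the unique pages checksummed but not (yet) merged.
 */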
static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * The counters are updated without locking, so the difference can
	 * occasionally come out negative: clamp it to zero for display.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t stable_node_dups_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);

static ssize_t stable_node_chains_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);

static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
					struct kobj_attribute *attr,
					char *buf)
{
	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}

static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_stable_node_chains_prune_millisecs = msecs;

	return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	NULL,
};

static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

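/*
 * Initialisation: compute the zero-page checksum, set up the slab caches,
 * start the ksmd kernel thread, then expose the sysfs interface and (where
 * supported) register for memory hot-remove notifications.
 */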
static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);