/*
 * Memory merging support (KSM).
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 */
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

/*
 * A few notes about the KSM scanning process, to make it easier to understand
 * the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The unstable tree holds pointers to pages which have been found to be
 * "unchanged for a period of time".  These pages are not write-protected, so
 * the tree cannot be fully trusted: it is flushed and rebuilt from scratch
 * after every full scan, and a page is only inserted into it when its
 * checksum has not changed since the previous scan.  When a new page is
 * scanned it is first compared against the stable tree, and only then
 * against the unstable tree.
 *
 * If the merge_across_nodes tunable is unset, KSM maintains multiple
 * stable trees and multiple unstable trees: one of each per NUMA node.
 */
/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap_item of this mm_slot's rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying @node) &migrate_nodes if queued for migration,
 *        STABLE_NODE_DUP_HEAD if this is a dup hanging off a chain
 * @hlist_dup: linked into the chain's hlist, if this is a dup
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection (chains only)
 * @rmap_hlist_len: number of rmap_item entries in hlist, or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (if not in stable tree)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree scan seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */
#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
						/* to mask all the flags */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
305
306static int __init ksm_slab_init(void)
307{
308 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
309 if (!rmap_item_cache)
310 goto out;
311
312 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
313 if (!stable_node_cache)
314 goto out_free1;
315
316 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
317 if (!mm_slot_cache)
318 goto out_free2;
319
320 return 0;
321
322out_free2:
323 kmem_cache_destroy(stable_node_cache);
324out_free1:
325 kmem_cache_destroy(rmap_item_cache);
326out:
327 return -ENOMEM;
328}
329
330static void __init ksm_slab_free(void)
331{
332 kmem_cache_destroy(mm_slot_cache);
333 kmem_cache_destroy(stable_node_cache);
334 kmem_cache_destroy(rmap_item_cache);
335 mm_slot_cache = NULL;
336}
337
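/*
 * stable_node "chains" group together stable_node "dups" which all point
 * to write-protected pages with identical content: a chain head is marked
 * by rmap_hlist_len == STABLE_NODE_CHAIN, and a dup is recognized by its
 * ->head being the STABLE_NODE_DUP_HEAD marker.  The helpers below test
 * and maintain that encoding.
 */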
338static __always_inline bool is_stable_node_chain(struct stable_node *chain)
339{
340 return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
341}
342
343static __always_inline bool is_stable_node_dup(struct stable_node *dup)
344{
345 return dup->head == STABLE_NODE_DUP_HEAD;
346}
347
348static inline void stable_node_chain_add_dup(struct stable_node *dup,
349 struct stable_node *chain)
350{
351 VM_BUG_ON(is_stable_node_dup(dup));
352 dup->head = STABLE_NODE_DUP_HEAD;
353 VM_BUG_ON(!is_stable_node_chain(chain));
354 hlist_add_head(&dup->hlist_dup, &chain->hlist);
355 ksm_stable_node_dups++;
356}
357
358static inline void __stable_node_dup_del(struct stable_node *dup)
359{
360 VM_BUG_ON(!is_stable_node_dup(dup));
361 hlist_del(&dup->hlist_dup);
362 ksm_stable_node_dups--;
363}
364
365static inline void stable_node_dup_del(struct stable_node *dup)
366{
367 VM_BUG_ON(is_stable_node_chain(dup));
368 if (is_stable_node_dup(dup))
369 __stable_node_dup_del(dup);
370 else
371 rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
372#ifdef CONFIG_DEBUG_VM
373 dup->head = NULL;
374#endif
375}
376
377static inline struct rmap_item *alloc_rmap_item(void)
378{
379 struct rmap_item *rmap_item;
380
381 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
382 __GFP_NORETRY | __GFP_NOWARN);
383 if (rmap_item)
384 ksm_rmap_items++;
385 return rmap_item;
386}
387
388static inline void free_rmap_item(struct rmap_item *rmap_item)
389{
390 ksm_rmap_items--;
391 rmap_item->mm = NULL;
392 kmem_cache_free(rmap_item_cache, rmap_item);
393}
394
395static inline struct stable_node *alloc_stable_node(void)
396{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is
	 * under pressure, which may lead to hung task warnings.  Adding
	 * __GFP_HIGH grants access to memory reserves, helping a VM callback
	 * to succeed without hung task warnings.
	 */
402 return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
403}
404
405static inline void free_stable_node(struct stable_node *stable_node)
406{
407 VM_BUG_ON(stable_node->rmap_hlist_len &&
408 !is_stable_node_chain(stable_node));
409 kmem_cache_free(stable_node_cache, stable_node);
410}
411
412static inline struct mm_slot *alloc_mm_slot(void)
413{
414 if (!mm_slot_cache)
415 return NULL;
416 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
417}
418
419static inline void free_mm_slot(struct mm_slot *mm_slot)
420{
421 kmem_cache_free(mm_slot_cache, mm_slot);
422}
423
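/* Look up the mm_slot that ksm is tracking for this mm, if any */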
424static struct mm_slot *get_mm_slot(struct mm_struct *mm)
425{
426 struct mm_slot *slot;
427
428 hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
429 if (slot->mm == mm)
430 return slot;
431
432 return NULL;
433}
434
435static void insert_to_mm_slots_hash(struct mm_struct *mm,
436 struct mm_slot *mm_slot)
437{
438 mm_slot->mm = mm;
439 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
440}
441
/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_test_exit() is
 * called with mmap_sem held to check whether the mm is still live: once
 * mm_users has dropped to zero, exit_mmap() is about to tear it down, so
 * no new KSM work must be started on it.
 */
450static inline bool ksm_test_exit(struct mm_struct *mm)
451{
452 return atomic_read(&mm->mm_users) == 0;
453}
454
/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem, where we would not want to touch it.
 */
470static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
471{
472 struct page *page;
473 vm_fault_t ret = 0;
474
475 do {
476 cond_resched();
477 page = follow_page(vma, addr,
478 FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
479 if (IS_ERR_OR_NULL(page))
480 break;
481 if (PageKsm(page))
482 ret = handle_mm_fault(vma, addr,
483 FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
484 else
485 ret = VM_FAULT_WRITE;
486 put_page(page);
487 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop until we no longer find a ksm page because
	 * handle_mm_fault() may back out if there's any difficulty e.g. if
	 * the pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the original page is still mapped
	 * elsewhere.  VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV can occur if the
	 * area was unmapped or remapped under us: treat those quietly as
	 * "not a ksm page" and return 0 rather than failing the caller.
	 * Only VM_FAULT_OOM is propagated, as -ENOMEM, so that callers such
	 * as unmerge_ksm_pages() can give up cleanly.
	 */
516 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
517}
518
519static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
520 unsigned long addr)
521{
522 struct vm_area_struct *vma;
523 if (ksm_test_exit(mm))
524 return NULL;
525 vma = find_vma(mm, addr);
526 if (!vma || vma->vm_start > addr)
527 return NULL;
528 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
529 return NULL;
530 return vma;
531}
532
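/*
 * break_cow - break COW on the page at rmap_item's address, so that the
 * process gets its own writable copy and stops sharing the ksm page.
 */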
533static void break_cow(struct rmap_item *rmap_item)
534{
535 struct mm_struct *mm = rmap_item->mm;
536 unsigned long addr = rmap_item->address;
537 struct vm_area_struct *vma;
538
	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
543 put_anon_vma(rmap_item->anon_vma);
544
545 down_read(&mm->mmap_sem);
546 vma = find_mergeable_vma(mm, addr);
547 if (vma)
548 break_ksm(vma, addr);
549 up_read(&mm->mmap_sem);
550}
551
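/*
 * get_mergeable_page - get and return the anonymous page at rmap_item's
 * address, or NULL if the area is no longer mergeable or the page there
 * is not anonymous.
 */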
552static struct page *get_mergeable_page(struct rmap_item *rmap_item)
553{
554 struct mm_struct *mm = rmap_item->mm;
555 unsigned long addr = rmap_item->address;
556 struct vm_area_struct *vma;
557 struct page *page;
558
559 down_read(&mm->mmap_sem);
560 vma = find_mergeable_vma(mm, addr);
561 if (!vma)
562 goto out;
563
564 page = follow_page(vma, addr, FOLL_GET);
565 if (IS_ERR_OR_NULL(page))
566 goto out;
567 if (PageAnon(page)) {
568 flush_anon_page(vma, page, addr);
569 flush_dcache_page(page);
570 } else {
571 put_page(page);
572out:
573 page = NULL;
574 }
575 up_read(&mm->mmap_sem);
576 return page;
577}
578
/*
 * get_kpfn_nid - return the NUMA node of the stable tree that a ksm page
 * with the given pfn belongs in.  When merging across nodes is allowed
 * there is only one stable tree, so this is always node 0.
 */
585static inline int get_kpfn_nid(unsigned long kpfn)
586{
587 return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
588}
589
590static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
591 struct rb_root *root)
592{
593 struct stable_node *chain = alloc_stable_node();
594 VM_BUG_ON(is_stable_node_chain(dup));
595 if (likely(chain)) {
596 INIT_HLIST_HEAD(&chain->hlist);
597 chain->chain_prune_time = jiffies;
598 chain->rmap_hlist_len = STABLE_NODE_CHAIN;
599#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
600 chain->nid = NUMA_NO_NODE;
601#endif
602 ksm_stable_node_chains++;
		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree, at the exact rbtree position of the
		 * dup it replaces.
		 */
609 rb_replace_node(&dup->node, &chain->node, root);
		/*
		 * Move the old stable node to the second dimension,
		 * queued on the chain's hlist via hlist_dup.  The
		 * invariant is that all dup stable_nodes in the
		 * chain->hlist point to pages that are write protected
		 * and have the exact same content.
		 */
618 stable_node_chain_add_dup(dup, chain);
619 }
620 return chain;
621}
622
623static inline void free_stable_node_chain(struct stable_node *chain,
624 struct rb_root *root)
625{
626 rb_erase(&chain->node, root);
627 free_stable_node(chain);
628 ksm_stable_node_chains--;
629}
630
631static void remove_node_from_stable_tree(struct stable_node *stable_node)
632{
633 struct rmap_item *rmap_item;
634
635
636 BUG_ON(stable_node->rmap_hlist_len < 0);
637
638 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
639 if (rmap_item->hlist.next)
640 ksm_pages_sharing--;
641 else
642 ksm_pages_shared--;
643 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
644 stable_node->rmap_hlist_len--;
645 put_anon_vma(rmap_item->anon_vma);
646 rmap_item->address &= PAGE_MASK;
647 cond_resched();
648 }
	/*
	 * The dup marker STABLE_NODE_DUP_HEAD points into the middle of
	 * the migrate_nodes object, so it can never be equal to
	 * &migrate_nodes itself: the two checks on stable_node->head
	 * (migrate list vs. chain dup) are therefore unambiguous.  The
	 * BUILD_BUG_ONs below verify that; they need gcc >= 4.9.3 to
	 * evaluate the addresses at compile time.
	 */
657#if defined(GCC_VERSION) && GCC_VERSION >= 40903
658 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
659 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
660#endif
661
662 if (stable_node->head == &migrate_nodes)
663 list_del(&stable_node->list);
664 else
665 stable_node_dup_del(stable_node);
666 free_stable_node(stable_node);
667}
668
669enum get_ksm_page_flags {
670 GET_KSM_PAGE_NOLOCK,
671 GET_KSM_PAGE_LOCK,
672 GET_KSM_PAGE_TRYLOCK
673};
674
/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive.  So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing to the stable node.  This relies on freeing a PageAnon page to
 * reset its page->mapping to NULL, and on no other use of a page putting
 * something that might look like our key in page->mapping.
 */
694static struct page *get_ksm_page(struct stable_node *stable_node,
695 enum get_ksm_page_flags flags)
696{
697 struct page *page;
698 void *expected_mapping;
699 unsigned long kpfn;
700
701 expected_mapping = (void *)((unsigned long)stable_node |
702 PAGE_MAPPING_KSM);
703again:
704 kpfn = READ_ONCE(stable_node->kpfn);
705 page = pfn_to_page(kpfn);
706 if (READ_ONCE(page->mapping) != expected_mapping)
707 goto stale;
	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_ref_freeze().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * but if the page is in swapcache under migration, it might still
	 * be our page, in which case it's essential to keep the node.
	 */
719 while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping
		 * would work here too.  We have chosen the !PageSwapCache
		 * test to optimize the common case, when the page is or is
		 * about to be freed: PageSwapCache is cleared (under
		 * spin_lock_irq) while the refcount is zeroed.
		 */
728 if (!PageSwapCache(page))
729 goto stale;
730 cpu_relax();
731 }
732
733 if (READ_ONCE(page->mapping) != expected_mapping) {
734 put_page(page);
735 goto stale;
736 }
737
738 if (flags == GET_KSM_PAGE_TRYLOCK) {
739 if (!trylock_page(page)) {
740 put_page(page);
741 return ERR_PTR(-EBUSY);
742 }
743 } else if (flags == GET_KSM_PAGE_LOCK)
744 lock_page(page);
745
746 if (flags != GET_KSM_PAGE_NOLOCK) {
747 if (READ_ONCE(page->mapping) != expected_mapping) {
748 unlock_page(page);
749 put_page(page);
750 goto stale;
751 }
752 }
753 return page;
754
755stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
762 smp_rmb();
763 if (READ_ONCE(stable_node->kpfn) != kpfn)
764 goto again;
765 remove_node_from_stable_tree(stable_node);
766 return NULL;
767}
768
/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
773static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
774{
775 if (rmap_item->address & STABLE_FLAG) {
776 struct stable_node *stable_node;
777 struct page *page;
778
779 stable_node = rmap_item->head;
780 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
781 if (!page)
782 goto out;
783
784 hlist_del(&rmap_item->hlist);
785 unlock_page(page);
786 put_page(page);
787
788 if (!hlist_empty(&stable_node->hlist))
789 ksm_pages_sharing--;
790 else
791 ksm_pages_shared--;
792 VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
793 stable_node->rmap_hlist_len--;
794
795 put_anon_vma(rmap_item->anon_vma);
796 rmap_item->address &= PAGE_MASK;
797
798 } else if (rmap_item->address & UNSTABLE_FLAG) {
799 unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
807 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
808 BUG_ON(age > 1);
809 if (!age)
810 rb_erase(&rmap_item->node,
811 root_unstable_tree + NUMA(rmap_item->nid));
812 ksm_pages_unshared--;
813 rmap_item->address &= PAGE_MASK;
814 }
815out:
816 cond_resched();
817}
818
819static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
820 struct rmap_item **rmap_list)
821{
822 while (*rmap_list) {
823 struct rmap_item *rmap_item = *rmap_list;
824 *rmap_list = rmap_item->rmap_list;
825 remove_rmap_item_from_tree(rmap_item);
826 free_rmap_item(rmap_item);
827 }
828}
829
/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
843static int unmerge_ksm_pages(struct vm_area_struct *vma,
844 unsigned long start, unsigned long end)
845{
846 unsigned long addr;
847 int err = 0;
848
849 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
850 if (ksm_test_exit(vma->vm_mm))
851 break;
852 if (signal_pending(current))
853 err = -ERESTARTSYS;
854 else
855 err = break_ksm(vma, addr);
856 }
857 return err;
858}
859
860static inline struct stable_node *page_stable_node(struct page *page)
861{
862 return PageKsm(page) ? page_rmapping(page) : NULL;
863}
864
865static inline void set_page_stable_node(struct page *page,
866 struct stable_node *stable_node)
867{
868 page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
869}
870
871#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
875static int remove_stable_node(struct stable_node *stable_node)
876{
877 struct page *page;
878 int err;
879
880 page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
881 if (!page) {
882
883
884
885 return 0;
886 }
887
888
889
890
891
892
893 err = -EBUSY;
894 if (!page_mapped(page)) {
895
896
897
898
899
900
901
902
903 set_page_stable_node(page, NULL);
904 remove_node_from_stable_tree(stable_node);
905 err = 0;
906 }
907
908 unlock_page(page);
909 put_page(page);
910 return err;
911}
912
913static int remove_stable_node_chain(struct stable_node *stable_node,
914 struct rb_root *root)
915{
916 struct stable_node *dup;
917 struct hlist_node *hlist_safe;
918
919 if (!is_stable_node_chain(stable_node)) {
920 VM_BUG_ON(is_stable_node_dup(stable_node));
921 if (remove_stable_node(stable_node))
922 return true;
923 else
924 return false;
925 }
926
927 hlist_for_each_entry_safe(dup, hlist_safe,
928 &stable_node->hlist, hlist_dup) {
929 VM_BUG_ON(!is_stable_node_dup(dup));
930 if (remove_stable_node(dup))
931 return true;
932 }
933 BUG_ON(!hlist_empty(&stable_node->hlist));
934 free_stable_node_chain(stable_node, root);
935 return false;
936}
937
938static int remove_all_stable_nodes(void)
939{
940 struct stable_node *stable_node, *next;
941 int nid;
942 int err = 0;
943
944 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
945 while (root_stable_tree[nid].rb_node) {
946 stable_node = rb_entry(root_stable_tree[nid].rb_node,
947 struct stable_node, node);
948 if (remove_stable_node_chain(stable_node,
949 root_stable_tree + nid)) {
950 err = -EBUSY;
951 break;
952 }
953 cond_resched();
954 }
955 }
956 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
957 if (remove_stable_node(stable_node))
958 err = -EBUSY;
959 cond_resched();
960 }
961 return err;
962}
963
964static int unmerge_and_remove_all_rmap_items(void)
965{
966 struct mm_slot *mm_slot;
967 struct mm_struct *mm;
968 struct vm_area_struct *vma;
969 int err = 0;
970
971 spin_lock(&ksm_mmlist_lock);
972 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
973 struct mm_slot, mm_list);
974 spin_unlock(&ksm_mmlist_lock);
975
976 for (mm_slot = ksm_scan.mm_slot;
977 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
978 mm = mm_slot->mm;
979 down_read(&mm->mmap_sem);
980 for (vma = mm->mmap; vma; vma = vma->vm_next) {
981 if (ksm_test_exit(mm))
982 break;
983 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
984 continue;
985 err = unmerge_ksm_pages(vma,
986 vma->vm_start, vma->vm_end);
987 if (err)
988 goto error;
989 }
990
991 remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
992 up_read(&mm->mmap_sem);
993
994 spin_lock(&ksm_mmlist_lock);
995 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
996 struct mm_slot, mm_list);
997 if (ksm_test_exit(mm)) {
998 hash_del(&mm_slot->link);
999 list_del(&mm_slot->mm_list);
1000 spin_unlock(&ksm_mmlist_lock);
1001
1002 free_mm_slot(mm_slot);
1003 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1004 mmdrop(mm);
1005 } else
1006 spin_unlock(&ksm_mmlist_lock);
1007 }
	/* Clean up stable nodes, but don't worry if some are still busy */
1010 remove_all_stable_nodes();
1011 ksm_scan.seqnr = 0;
1012 return 0;
1013
1014error:
1015 up_read(&mm->mmap_sem);
1016 spin_lock(&ksm_mmlist_lock);
1017 ksm_scan.mm_slot = &ksm_mm_head;
1018 spin_unlock(&ksm_mmlist_lock);
1019 return err;
1020}
1021#endif
1022
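/*
 * Fast, non-cryptographic checksum of a page's contents (xxhash), used to
 * detect pages that have remained unchanged between full scans.
 */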
1023static u32 calc_checksum(struct page *page)
1024{
1025 u32 checksum;
1026 void *addr = kmap_atomic(page);
1027 checksum = xxhash(addr, PAGE_SIZE, 0);
1028 kunmap_atomic(addr);
1029 return checksum;
1030}
1031
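/*
 * write_protect_page - make the pte mapping this anonymous page read-only
 * (and clean), returning the original pte in *orig_pte; fails with -EFAULT
 * if the page is no longer mapped here or is under concurrent I/O.
 */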
1032static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1033 pte_t *orig_pte)
1034{
1035 struct mm_struct *mm = vma->vm_mm;
1036 struct page_vma_mapped_walk pvmw = {
1037 .page = page,
1038 .vma = vma,
1039 };
1040 int swapped;
1041 int err = -EFAULT;
1042 struct mmu_notifier_range range;
1043
1044 pvmw.address = page_address_in_vma(page, vma);
1045 if (pvmw.address == -EFAULT)
1046 goto out;
1047
1048 BUG_ON(PageTransCompound(page));
1049
1050 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
1051 pvmw.address,
1052 pvmw.address + PAGE_SIZE);
1053 mmu_notifier_invalidate_range_start(&range);
1054
1055 if (!page_vma_mapped_walk(&pvmw))
1056 goto out_mn;
1057 if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
1058 goto out_unlock;
1059
1060 if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
1061 (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
1062 mm_tlb_flush_pending(mm)) {
1063 pte_t entry;
1064
1065 swapped = PageSwapCache(page);
1066 flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racey and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only not changing it to point to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
1081 entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on
		 * the page.
		 */
1086 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
1087 set_pte_at(mm, pvmw.address, pvmw.pte, entry);
1088 goto out_unlock;
1089 }
1090 if (pte_dirty(entry))
1091 set_page_dirty(page);
1092
1093 if (pte_protnone(entry))
1094 entry = pte_mkclean(pte_clear_savedwrite(entry));
1095 else
1096 entry = pte_mkclean(pte_wrprotect(entry));
1097 set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
1098 }
1099 *orig_pte = *pvmw.pte;
1100 err = 0;
1101
1102out_unlock:
1103 page_vma_mapped_walk_done(&pvmw);
1104out_mn:
1105 mmu_notifier_invalidate_range_end(&range);
1106out:
1107 return err;
1108}
1109
/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
1119static int replace_page(struct vm_area_struct *vma, struct page *page,
1120 struct page *kpage, pte_t orig_pte)
1121{
1122 struct mm_struct *mm = vma->vm_mm;
1123 pmd_t *pmd;
1124 pte_t *ptep;
1125 pte_t newpte;
1126 spinlock_t *ptl;
1127 unsigned long addr;
1128 int err = -EFAULT;
1129 struct mmu_notifier_range range;
1130
1131 addr = page_address_in_vma(page, vma);
1132 if (addr == -EFAULT)
1133 goto out;
1134
1135 pmd = mm_find_pmd(mm, addr);
1136 if (!pmd)
1137 goto out;
1138
1139 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
1140 addr + PAGE_SIZE);
1141 mmu_notifier_invalidate_range_start(&range);
1142
1143 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
1144 if (!pte_same(*ptep, orig_pte)) {
1145 pte_unmap_unlock(ptep, ptl);
1146 goto out_mn;
1147 }
	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero page mapped in below if ksm_use_zero_pages was enabled.
	 */
1153 if (!is_zero_pfn(page_to_pfn(kpage))) {
1154 get_page(kpage);
1155 page_add_anon_rmap(kpage, vma, addr, false);
1156 newpte = mk_pte(kpage, vma->vm_page_prot);
1157 } else {
1158 newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
1159 vma->vm_page_prot));
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
1166 dec_mm_counter(mm, MM_ANONPAGES);
1167 }
1168
1169 flush_cache_page(vma, addr, pte_pfn(*ptep));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
1176 ptep_clear_flush(vma, addr, ptep);
1177 set_pte_at_notify(mm, addr, ptep, newpte);
1178
1179 page_remove_rmap(page, false);
1180 if (!page_mapped(page))
1181 try_to_free_swap(page);
1182 put_page(page);
1183
1184 pte_unmap_unlock(ptep, ptl);
1185 err = 0;
1186out_mn:
1187 mmu_notifier_invalidate_range_end(&range);
1188out:
1189 return err;
1190}
1191
/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
1201static int try_to_merge_one_page(struct vm_area_struct *vma,
1202 struct page *page, struct page *kpage)
1203{
1204 pte_t orig_pte = __pte(0);
1205 int err = -EFAULT;
1206
1207 if (page == kpage)
1208 return 0;
1209
1210 if (!PageAnon(page))
1211 goto out;
1212
	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
1220 if (!trylock_page(page))
1221 goto out;
1222
1223 if (PageTransCompound(page)) {
1224 if (split_huge_page(page))
1225 goto out_unlock;
1226 }
1227
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
1234 if (write_protect_page(vma, page, &orig_pte) == 0) {
1235 if (!kpage) {
1236
1237
1238
1239
1240
1241 set_page_stable_node(page, NULL);
1242 mark_page_accessed(page);
1243
1244
1245
1246
1247 if (!PageDirty(page))
1248 SetPageDirty(page);
1249 err = 0;
1250 } else if (pages_identical(page, kpage))
1251 err = replace_page(vma, page, kpage, orig_pte);
1252 }
1253
1254 if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
1255 munlock_vma_page(page);
1256 if (!PageMlocked(kpage)) {
1257 unlock_page(page);
1258 lock_page(kpage);
1259 mlock_vma_page(kpage);
1260 page = kpage;
1261 }
1262 }
1263
1264out_unlock:
1265 unlock_page(page);
1266out:
1267 return err;
1268}
1269
/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
1276static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
1277 struct page *page, struct page *kpage)
1278{
1279 struct mm_struct *mm = rmap_item->mm;
1280 struct vm_area_struct *vma;
1281 int err = -EFAULT;
1282
1283 down_read(&mm->mmap_sem);
1284 vma = find_mergeable_vma(mm, rmap_item->address);
1285 if (!vma)
1286 goto out;
1287
1288 err = try_to_merge_one_page(vma, page, kpage);
1289 if (err)
1290 goto out;
1291
1292
1293 remove_rmap_item_from_tree(rmap_item);
1294
1295
1296 rmap_item->anon_vma = vma->anon_vma;
1297 get_anon_vma(vma->anon_vma);
1298out:
1299 up_read(&mm->mmap_sem);
1300 return err;
1301}
1302
/*
 * try_to_merge_two_pages - take two identical pages and prepare
 * them to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two
 * identical pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
1313static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
1314 struct page *page,
1315 struct rmap_item *tree_rmap_item,
1316 struct page *tree_page)
1317{
1318 int err;
1319
1320 err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
1321 if (!err) {
1322 err = try_to_merge_with_ksm_page(tree_rmap_item,
1323 tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
1328 if (err)
1329 break_cow(rmap_item);
1330 }
1331 return err ? NULL : page;
1332}
1333
1334static __always_inline
1335bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
1336{
1337 VM_BUG_ON(stable_node->rmap_hlist_len < 0);
1338
1339
1340
1341
1342
1343
1344 return stable_node->rmap_hlist_len &&
1345 stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
1346}
1347
1348static __always_inline
1349bool is_page_sharing_candidate(struct stable_node *stable_node)
1350{
1351 return __is_page_sharing_candidate(stable_node, 0);
1352}
1353
1354static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1355 struct stable_node **_stable_node,
1356 struct rb_root *root,
1357 bool prune_stale_stable_nodes)
1358{
1359 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1360 struct hlist_node *hlist_safe;
1361 struct page *_tree_page, *tree_page = NULL;
1362 int nr = 0;
1363 int found_rmap_hlist_len;
1364
1365 if (!prune_stale_stable_nodes ||
1366 time_before(jiffies, stable_node->chain_prune_time +
1367 msecs_to_jiffies(
1368 ksm_stable_node_chains_prune_millisecs)))
1369 prune_stale_stable_nodes = false;
1370 else
1371 stable_node->chain_prune_time = jiffies;
1372
1373 hlist_for_each_entry_safe(dup, hlist_safe,
1374 &stable_node->hlist, hlist_dup) {
1375 cond_resched();
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386 _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
1387 if (!_tree_page)
1388 continue;
1389 nr += 1;
1390 if (is_page_sharing_candidate(dup)) {
1391 if (!found ||
1392 dup->rmap_hlist_len > found_rmap_hlist_len) {
1393 if (found)
1394 put_page(tree_page);
1395 found = dup;
1396 found_rmap_hlist_len = found->rmap_hlist_len;
1397 tree_page = _tree_page;
1398
1399
1400 if (!prune_stale_stable_nodes)
1401 break;
1402 continue;
1403 }
1404 }
1405 put_page(_tree_page);
1406 }
1407
1408 if (found) {
1409
1410
1411
1412
1413
1414
1415 if (prune_stale_stable_nodes && nr == 1) {
1416
1417
1418
1419
1420
1421
1422 BUG_ON(stable_node->hlist.first->next);
1423
1424
1425
1426
1427
1428 rb_replace_node(&stable_node->node, &found->node,
1429 root);
1430 free_stable_node(stable_node);
1431 ksm_stable_node_chains--;
1432 ksm_stable_node_dups--;
1433
1434
1435
1436
1437
1438 *_stable_node = found;
1439
1440
1441
1442
1443
1444
1445 stable_node = NULL;
1446 } else if (stable_node->hlist.first != &found->hlist_dup &&
1447 __is_page_sharing_candidate(found, 1)) {
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463 hlist_del(&found->hlist_dup);
1464 hlist_add_head(&found->hlist_dup,
1465 &stable_node->hlist);
1466 }
1467 }
1468
1469 *_stable_node_dup = found;
1470 return tree_page;
1471}
1472
1473static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1474 struct rb_root *root)
1475{
1476 if (!is_stable_node_chain(stable_node))
1477 return stable_node;
1478 if (hlist_empty(&stable_node->hlist)) {
1479 free_stable_node_chain(stable_node, root);
1480 return NULL;
1481 }
1482 return hlist_entry(stable_node->hlist.first,
1483 typeof(*stable_node), hlist_dup);
1484}
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
1501 struct stable_node **_stable_node,
1502 struct rb_root *root,
1503 bool prune_stale_stable_nodes)
1504{
1505 struct stable_node *stable_node = *_stable_node;
1506 if (!is_stable_node_chain(stable_node)) {
1507 if (is_page_sharing_candidate(stable_node)) {
1508 *_stable_node_dup = stable_node;
1509 return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
1510 }
1511
1512
1513
1514
1515 *_stable_node_dup = NULL;
1516 return NULL;
1517 }
1518 return stable_node_dup(_stable_node_dup, _stable_node, root,
1519 prune_stale_stable_nodes);
1520}
1521
1522static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
1523 struct stable_node **s_n,
1524 struct rb_root *root)
1525{
1526 return __stable_node_chain(s_n_d, s_n, root, true);
1527}
1528
1529static __always_inline struct page *chain(struct stable_node **s_n_d,
1530 struct stable_node *s_n,
1531 struct rb_root *root)
1532{
1533 struct stable_node *old_stable_node = s_n;
1534 struct page *tree_page;
1535
1536 tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
1537
1538 VM_BUG_ON(s_n != old_stable_node);
1539 return tree_page;
1540}
1541
/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
1551static struct page *stable_tree_search(struct page *page)
1552{
1553 int nid;
1554 struct rb_root *root;
1555 struct rb_node **new;
1556 struct rb_node *parent;
1557 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1558 struct stable_node *page_node;
1559
1560 page_node = page_stable_node(page);
1561 if (page_node && page_node->head != &migrate_nodes) {
1562
1563 get_page(page);
1564 return page;
1565 }
1566
1567 nid = get_kpfn_nid(page_to_pfn(page));
1568 root = root_stable_tree + nid;
1569again:
1570 new = &root->rb_node;
1571 parent = NULL;
1572
1573 while (*new) {
1574 struct page *tree_page;
1575 int ret;
1576
1577 cond_resched();
1578 stable_node = rb_entry(*new, struct stable_node, node);
1579 stable_node_any = NULL;
1580 tree_page = chain_prune(&stable_node_dup, &stable_node, root);
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593 if (!stable_node_dup) {
1594
1595
1596
1597
1598
1599 stable_node_any = stable_node_dup_any(stable_node,
1600 root);
1601 if (!stable_node_any) {
1602
1603 goto again;
1604 }
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614 tree_page = get_ksm_page(stable_node_any,
1615 GET_KSM_PAGE_NOLOCK);
1616 }
1617 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1618 if (!tree_page) {
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628 goto again;
1629 }
1630
1631 ret = memcmp_pages(page, tree_page);
1632 put_page(tree_page);
1633
1634 parent = *new;
1635 if (ret < 0)
1636 new = &parent->rb_left;
1637 else if (ret > 0)
1638 new = &parent->rb_right;
1639 else {
1640 if (page_node) {
1641 VM_BUG_ON(page_node->head != &migrate_nodes);
1642
1643
1644
1645
1646
1647
1648 if (page_mapcount(page) > 1)
1649 goto chain_append;
1650 }
1651
1652 if (!stable_node_dup) {
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665 return NULL;
1666 }
1667
1668
1669
1670
1671
1672
1673
1674
1675 tree_page = get_ksm_page(stable_node_dup,
1676 GET_KSM_PAGE_TRYLOCK);
1677
1678 if (PTR_ERR(tree_page) == -EBUSY)
1679 return ERR_PTR(-EBUSY);
1680
1681 if (unlikely(!tree_page))
1682
1683
1684
1685
1686 goto again;
1687 unlock_page(tree_page);
1688
1689 if (get_kpfn_nid(stable_node_dup->kpfn) !=
1690 NUMA(stable_node_dup->nid)) {
1691 put_page(tree_page);
1692 goto replace;
1693 }
1694 return tree_page;
1695 }
1696 }
1697
1698 if (!page_node)
1699 return NULL;
1700
1701 list_del(&page_node->list);
1702 DO_NUMA(page_node->nid = nid);
1703 rb_link_node(&page_node->node, parent, new);
1704 rb_insert_color(&page_node->node, root);
1705out:
1706 if (is_page_sharing_candidate(page_node)) {
1707 get_page(page);
1708 return page;
1709 } else
1710 return NULL;
1711
1712replace:
1713
1714
1715
1716
1717
1718
1719
1720
1721 if (stable_node_dup == stable_node) {
1722 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1723 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1724
1725 if (page_node) {
1726 VM_BUG_ON(page_node->head != &migrate_nodes);
1727 list_del(&page_node->list);
1728 DO_NUMA(page_node->nid = nid);
1729 rb_replace_node(&stable_node_dup->node,
1730 &page_node->node,
1731 root);
1732 if (is_page_sharing_candidate(page_node))
1733 get_page(page);
1734 else
1735 page = NULL;
1736 } else {
1737 rb_erase(&stable_node_dup->node, root);
1738 page = NULL;
1739 }
1740 } else {
1741 VM_BUG_ON(!is_stable_node_chain(stable_node));
1742 __stable_node_dup_del(stable_node_dup);
1743 if (page_node) {
1744 VM_BUG_ON(page_node->head != &migrate_nodes);
1745 list_del(&page_node->list);
1746 DO_NUMA(page_node->nid = nid);
1747 stable_node_chain_add_dup(page_node, stable_node);
1748 if (is_page_sharing_candidate(page_node))
1749 get_page(page);
1750 else
1751 page = NULL;
1752 } else {
1753 page = NULL;
1754 }
1755 }
1756 stable_node_dup->head = &migrate_nodes;
1757 list_add(&stable_node_dup->list, stable_node_dup->head);
1758 return page;
1759
1760chain_append:
1761
1762 if (!stable_node_dup)
1763 stable_node_dup = stable_node_any;
1764
1765
1766
1767
1768
1769
1770
1771
1772 if (stable_node_dup == stable_node) {
1773 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1774 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1775
1776 stable_node = alloc_stable_node_chain(stable_node_dup,
1777 root);
1778 if (!stable_node)
1779 return NULL;
1780 }
1781
1782
1783
1784
1785
1786
1787 VM_BUG_ON(!is_stable_node_chain(stable_node));
1788 VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
1789 VM_BUG_ON(page_node->head != &migrate_nodes);
1790 list_del(&page_node->list);
1791 DO_NUMA(page_node->nid = nid);
1792 stable_node_chain_add_dup(page_node, stable_node);
1793 goto out;
1794}
1795
/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
1803static struct stable_node *stable_tree_insert(struct page *kpage)
1804{
1805 int nid;
1806 unsigned long kpfn;
1807 struct rb_root *root;
1808 struct rb_node **new;
1809 struct rb_node *parent;
1810 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1811 bool need_chain = false;
1812
1813 kpfn = page_to_pfn(kpage);
1814 nid = get_kpfn_nid(kpfn);
1815 root = root_stable_tree + nid;
1816again:
1817 parent = NULL;
1818 new = &root->rb_node;
1819
1820 while (*new) {
1821 struct page *tree_page;
1822 int ret;
1823
1824 cond_resched();
1825 stable_node = rb_entry(*new, struct stable_node, node);
1826 stable_node_any = NULL;
1827 tree_page = chain(&stable_node_dup, stable_node, root);
1828 if (!stable_node_dup) {
1829
1830
1831
1832
1833
1834 stable_node_any = stable_node_dup_any(stable_node,
1835 root);
1836 if (!stable_node_any) {
1837
1838 goto again;
1839 }
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849 tree_page = get_ksm_page(stable_node_any,
1850 GET_KSM_PAGE_NOLOCK);
1851 }
1852 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1853 if (!tree_page) {
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863 goto again;
1864 }
1865
1866 ret = memcmp_pages(kpage, tree_page);
1867 put_page(tree_page);
1868
1869 parent = *new;
1870 if (ret < 0)
1871 new = &parent->rb_left;
1872 else if (ret > 0)
1873 new = &parent->rb_right;
1874 else {
1875 need_chain = true;
1876 break;
1877 }
1878 }
1879
1880 stable_node_dup = alloc_stable_node();
1881 if (!stable_node_dup)
1882 return NULL;
1883
1884 INIT_HLIST_HEAD(&stable_node_dup->hlist);
1885 stable_node_dup->kpfn = kpfn;
1886 set_page_stable_node(kpage, stable_node_dup);
1887 stable_node_dup->rmap_hlist_len = 0;
1888 DO_NUMA(stable_node_dup->nid = nid);
1889 if (!need_chain) {
1890 rb_link_node(&stable_node_dup->node, parent, new);
1891 rb_insert_color(&stable_node_dup->node, root);
1892 } else {
1893 if (!is_stable_node_chain(stable_node)) {
1894 struct stable_node *orig = stable_node;
1895
1896 stable_node = alloc_stable_node_chain(orig, root);
1897 if (!stable_node) {
1898 free_stable_node(stable_node_dup);
1899 return NULL;
1900 }
1901 }
1902 stable_node_chain_add_dup(stable_node_dup, stable_node);
1903 }
1904
1905 return stable_node_dup;
1906}
1907
/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns pointer to rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
1922static
1923struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1924 struct page *page,
1925 struct page **tree_pagep)
1926{
1927 struct rb_node **new;
1928 struct rb_root *root;
1929 struct rb_node *parent = NULL;
1930 int nid;
1931
1932 nid = get_kpfn_nid(page_to_pfn(page));
1933 root = root_unstable_tree + nid;
1934 new = &root->rb_node;
1935
1936 while (*new) {
1937 struct rmap_item *tree_rmap_item;
1938 struct page *tree_page;
1939 int ret;
1940
1941 cond_resched();
1942 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1943 tree_page = get_mergeable_page(tree_rmap_item);
1944 if (!tree_page)
1945 return NULL;
1946
1947
1948
1949
1950 if (page == tree_page) {
1951 put_page(tree_page);
1952 return NULL;
1953 }
1954
1955 ret = memcmp_pages(page, tree_page);
1956
1957 parent = *new;
1958 if (ret < 0) {
1959 put_page(tree_page);
1960 new = &parent->rb_left;
1961 } else if (ret > 0) {
1962 put_page(tree_page);
1963 new = &parent->rb_right;
1964 } else if (!ksm_merge_across_nodes &&
1965 page_to_nid(tree_page) != nid) {
1966
1967
1968
1969
1970
1971 put_page(tree_page);
1972 return NULL;
1973 } else {
1974 *tree_pagep = tree_page;
1975 return tree_rmap_item;
1976 }
1977 }
1978
1979 rmap_item->address |= UNSTABLE_FLAG;
1980 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1981 DO_NUMA(rmap_item->nid = nid);
1982 rb_link_node(&rmap_item->node, parent, new);
1983 rb_insert_color(&rmap_item->node, root);
1984
1985 ksm_pages_unshared++;
1986 return NULL;
1987}
1988
/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
1994static void stable_tree_append(struct rmap_item *rmap_item,
1995 struct stable_node *stable_node,
1996 bool max_page_sharing_bypass)
1997{
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008 BUG_ON(stable_node->rmap_hlist_len < 0);
2009
2010 stable_node->rmap_hlist_len++;
2011 if (!max_page_sharing_bypass)
2012
2013 WARN_ON_ONCE(stable_node->rmap_hlist_len >
2014 ksm_max_page_sharing);
2015
2016 rmap_item->head = stable_node;
2017 rmap_item->address |= STABLE_FLAG;
2018 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2019
2020 if (rmap_item->hlist.next)
2021 ksm_pages_sharing++;
2022 else
2023 ksm_pages_shared++;
2024}
2025
/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
2035static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2036{
2037 struct mm_struct *mm = rmap_item->mm;
2038 struct rmap_item *tree_rmap_item;
2039 struct page *tree_page = NULL;
2040 struct stable_node *stable_node;
2041 struct page *kpage;
2042 unsigned int checksum;
2043 int err;
2044 bool max_page_sharing_bypass = false;
2045
2046 stable_node = page_stable_node(page);
2047 if (stable_node) {
2048 if (stable_node->head != &migrate_nodes &&
2049 get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2050 NUMA(stable_node->nid)) {
2051 stable_node_dup_del(stable_node);
2052 stable_node->head = &migrate_nodes;
2053 list_add(&stable_node->list, stable_node->head);
2054 }
2055 if (stable_node->head != &migrate_nodes &&
2056 rmap_item->head == stable_node)
2057 return;
2058
2059
2060
2061
2062 if (!is_page_sharing_candidate(stable_node))
2063 max_page_sharing_bypass = true;
2064 }
2065
2066
2067 kpage = stable_tree_search(page);
2068 if (kpage == page && rmap_item->head == stable_node) {
2069 put_page(kpage);
2070 return;
2071 }
2072
2073 remove_rmap_item_from_tree(rmap_item);
2074
2075 if (kpage) {
2076 if (PTR_ERR(kpage) == -EBUSY)
2077 return;
2078
2079 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2080 if (!err) {
2081
2082
2083
2084
2085 lock_page(kpage);
2086 stable_tree_append(rmap_item, page_stable_node(kpage),
2087 max_page_sharing_bypass);
2088 unlock_page(kpage);
2089 }
2090 put_page(kpage);
2091 return;
2092 }
2093
2094
2095
2096
2097
2098
2099
2100 checksum = calc_checksum(page);
2101 if (rmap_item->oldchecksum != checksum) {
2102 rmap_item->oldchecksum = checksum;
2103 return;
2104 }
2105
2106
2107
2108
2109
2110 if (ksm_use_zero_pages && (checksum == zero_checksum)) {
2111 struct vm_area_struct *vma;
2112
2113 down_read(&mm->mmap_sem);
2114 vma = find_mergeable_vma(mm, rmap_item->address);
2115 err = try_to_merge_one_page(vma, page,
2116 ZERO_PAGE(rmap_item->address));
2117 up_read(&mm->mmap_sem);
2118
2119
2120
2121
2122 if (!err)
2123 return;
2124 }
2125 tree_rmap_item =
2126 unstable_tree_search_insert(rmap_item, page, &tree_page);
2127 if (tree_rmap_item) {
2128 bool split;
2129
2130 kpage = try_to_merge_two_pages(rmap_item, page,
2131 tree_rmap_item, tree_page);
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142 split = PageTransCompound(page)
2143 && compound_head(page) == compound_head(tree_page);
2144 put_page(tree_page);
2145 if (kpage) {
2146
2147
2148
2149
2150 lock_page(kpage);
2151 stable_node = stable_tree_insert(kpage);
2152 if (stable_node) {
2153 stable_tree_append(tree_rmap_item, stable_node,
2154 false);
2155 stable_tree_append(rmap_item, stable_node,
2156 false);
2157 }
2158 unlock_page(kpage);
2159
2160
2161
2162
2163
2164
2165
2166 if (!stable_node) {
2167 break_cow(tree_rmap_item);
2168 break_cow(rmap_item);
2169 }
2170 } else if (split) {
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180 if (!trylock_page(page))
2181 return;
2182 split_huge_page(page);
2183 unlock_page(page);
2184 }
2185 }
2186}
2187
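/*
 * get_next_rmap_item - return the rmap_item tracking this address in the
 * mm_slot's list, discarding stale items that precede it and allocating a
 * new one if none exists yet.
 */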
2188static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2189 struct rmap_item **rmap_list,
2190 unsigned long addr)
2191{
2192 struct rmap_item *rmap_item;
2193
2194 while (*rmap_list) {
2195 rmap_item = *rmap_list;
2196 if ((rmap_item->address & PAGE_MASK) == addr)
2197 return rmap_item;
2198 if (rmap_item->address > addr)
2199 break;
2200 *rmap_list = rmap_item->rmap_list;
2201 remove_rmap_item_from_tree(rmap_item);
2202 free_rmap_item(rmap_item);
2203 }
2204
2205 rmap_item = alloc_rmap_item();
2206 if (rmap_item) {
2207
2208 rmap_item->mm = mm_slot->mm;
2209 rmap_item->address = addr;
2210 rmap_item->rmap_list = *rmap_list;
2211 *rmap_list = rmap_item;
2212 }
2213 return rmap_item;
2214}
2215
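/*
 * scan_get_next_rmap_item - walk the registered mms and their VM_MERGEABLE
 * areas, returning the rmap_item (and the page, via *page) for the next
 * anonymous page to be processed, or NULL when a full scan has completed.
 */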
2216static struct rmap_item *scan_get_next_rmap_item(struct page **page)
2217{
2218 struct mm_struct *mm;
2219 struct mm_slot *slot;
2220 struct vm_area_struct *vma;
2221 struct rmap_item *rmap_item;
2222 int nid;
2223
2224 if (list_empty(&ksm_mm_head.mm_list))
2225 return NULL;
2226
2227 slot = ksm_scan.mm_slot;
2228 if (slot == &ksm_mm_head) {
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239 lru_add_drain_all();
2240
2241
2242
2243
2244
2245
2246
2247 if (!ksm_merge_across_nodes) {
2248 struct stable_node *stable_node, *next;
2249 struct page *page;
2250
2251 list_for_each_entry_safe(stable_node, next,
2252 &migrate_nodes, list) {
2253 page = get_ksm_page(stable_node,
2254 GET_KSM_PAGE_NOLOCK);
2255 if (page)
2256 put_page(page);
2257 cond_resched();
2258 }
2259 }
2260
2261 for (nid = 0; nid < ksm_nr_node_ids; nid++)
2262 root_unstable_tree[nid] = RB_ROOT;
2263
2264 spin_lock(&ksm_mmlist_lock);
2265 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
2266 ksm_scan.mm_slot = slot;
2267 spin_unlock(&ksm_mmlist_lock);
2268
2269
2270
2271
2272 if (slot == &ksm_mm_head)
2273 return NULL;
2274next_mm:
2275 ksm_scan.address = 0;
2276 ksm_scan.rmap_list = &slot->rmap_list;
2277 }
2278
2279 mm = slot->mm;
2280 down_read(&mm->mmap_sem);
2281 if (ksm_test_exit(mm))
2282 vma = NULL;
2283 else
2284 vma = find_vma(mm, ksm_scan.address);
2285
2286 for (; vma; vma = vma->vm_next) {
2287 if (!(vma->vm_flags & VM_MERGEABLE))
2288 continue;
2289 if (ksm_scan.address < vma->vm_start)
2290 ksm_scan.address = vma->vm_start;
2291 if (!vma->anon_vma)
2292 ksm_scan.address = vma->vm_end;
2293
2294 while (ksm_scan.address < vma->vm_end) {
2295 if (ksm_test_exit(mm))
2296 break;
2297 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
2298 if (IS_ERR_OR_NULL(*page)) {
2299 ksm_scan.address += PAGE_SIZE;
2300 cond_resched();
2301 continue;
2302 }
2303 if (PageAnon(*page)) {
2304 flush_anon_page(vma, *page, ksm_scan.address);
2305 flush_dcache_page(*page);
2306 rmap_item = get_next_rmap_item(slot,
2307 ksm_scan.rmap_list, ksm_scan.address);
2308 if (rmap_item) {
2309 ksm_scan.rmap_list =
2310 &rmap_item->rmap_list;
2311 ksm_scan.address += PAGE_SIZE;
2312 } else
2313 put_page(*page);
2314 up_read(&mm->mmap_sem);
2315 return rmap_item;
2316 }
2317 put_page(*page);
2318 ksm_scan.address += PAGE_SIZE;
2319 cond_resched();
2320 }
2321 }
2322
2323 if (ksm_test_exit(mm)) {
2324 ksm_scan.address = 0;
2325 ksm_scan.rmap_list = &slot->rmap_list;
2326 }
2327
2328
2329
2330
2331 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
2332
2333 spin_lock(&ksm_mmlist_lock);
2334 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
2335 struct mm_slot, mm_list);
2336 if (ksm_scan.address == 0) {
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346 hash_del(&slot->link);
2347 list_del(&slot->mm_list);
2348 spin_unlock(&ksm_mmlist_lock);
2349
2350 free_mm_slot(slot);
2351 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2352 up_read(&mm->mmap_sem);
2353 mmdrop(mm);
2354 } else {
2355 up_read(&mm->mmap_sem);
2356
2357
2358
2359
2360
2361
2362
2363 spin_unlock(&ksm_mmlist_lock);
2364 }
2365
2366
2367 slot = ksm_scan.mm_slot;
2368 if (slot != &ksm_mm_head)
2369 goto next_mm;
2370
2371 ksm_scan.seqnr++;
2372 return NULL;
2373}
2374
/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 */
2379static void ksm_do_scan(unsigned int scan_npages)
2380{
2381 struct rmap_item *rmap_item;
2382 struct page *uninitialized_var(page);
2383
2384 while (scan_npages-- && likely(!freezing(current))) {
2385 cond_resched();
2386 rmap_item = scan_get_next_rmap_item(&page);
2387 if (!rmap_item)
2388 return;
2389 cmp_and_merge_page(page, rmap_item);
2390 put_page(page);
2391 }
2392}
2393
2394static int ksmd_should_run(void)
2395{
2396 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
2397}
2398
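/*
 * ksm_scan_thread - main loop of the ksmd kernel thread: scan a batch of
 * pages while merging is enabled, then sleep (or wait to be woken).
 */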
2399static int ksm_scan_thread(void *nothing)
2400{
2401 unsigned int sleep_ms;
2402
2403 set_freezable();
2404 set_user_nice(current, 5);
2405
2406 while (!kthread_should_stop()) {
2407 mutex_lock(&ksm_thread_mutex);
2408 wait_while_offlining();
2409 if (ksmd_should_run())
2410 ksm_do_scan(ksm_thread_pages_to_scan);
2411 mutex_unlock(&ksm_thread_mutex);
2412
2413 try_to_freeze();
2414
2415 if (ksmd_should_run()) {
2416 sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
2417 wait_event_interruptible_timeout(ksm_iter_wait,
2418 sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
2419 msecs_to_jiffies(sleep_ms));
2420 } else {
2421 wait_event_freezable(ksm_thread_wait,
2422 ksmd_should_run() || kthread_should_stop());
2423 }
2424 }
2425 return 0;
2426}
2427
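/*
 * ksm_madvise - handle MADV_MERGEABLE / MADV_UNMERGEABLE for a vma:
 * mark the area mergeable (registering the mm with ksm if needed), or
 * unmerge any ksm pages in the range and clear VM_MERGEABLE.
 */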
2428int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2429 unsigned long end, int advice, unsigned long *vm_flags)
2430{
2431 struct mm_struct *mm = vma->vm_mm;
2432 int err;
2433
2434 switch (advice) {
2435 case MADV_MERGEABLE:
2436
2437
2438
2439 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
2440 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
2441 VM_HUGETLB | VM_MIXEDMAP))
2442 return 0;
2443
2444 if (vma_is_dax(vma))
2445 return 0;
2446
2447#ifdef VM_SAO
2448 if (*vm_flags & VM_SAO)
2449 return 0;
2450#endif
2451#ifdef VM_SPARC_ADI
2452 if (*vm_flags & VM_SPARC_ADI)
2453 return 0;
2454#endif
2455
2456 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2457 err = __ksm_enter(mm);
2458 if (err)
2459 return err;
2460 }
2461
2462 *vm_flags |= VM_MERGEABLE;
2463 break;
2464
2465 case MADV_UNMERGEABLE:
2466 if (!(*vm_flags & VM_MERGEABLE))
2467 return 0;
2468
2469 if (vma->anon_vma) {
2470 err = unmerge_ksm_pages(vma, start, end);
2471 if (err)
2472 return err;
2473 }
2474
2475 *vm_flags &= ~VM_MERGEABLE;
2476 break;
2477 }
2478
2479 return 0;
2480}
2481
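/*
 * __ksm_enter - register an mm with ksmd, allocating its mm_slot and
 * waking the scanner if this is the first mm to be tracked.
 */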
2482int __ksm_enter(struct mm_struct *mm)
2483{
2484 struct mm_slot *mm_slot;
2485 int needs_wakeup;
2486
2487 mm_slot = alloc_mm_slot();
2488 if (!mm_slot)
2489 return -ENOMEM;
2490
2491
2492 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
2493
2494 spin_lock(&ksm_mmlist_lock);
2495 insert_to_mm_slots_hash(mm, mm_slot);
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506 if (ksm_run & KSM_RUN_UNMERGE)
2507 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
2508 else
2509 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
2510 spin_unlock(&ksm_mmlist_lock);
2511
2512 set_bit(MMF_VM_MERGEABLE, &mm->flags);
2513 mmgrab(mm);
2514
2515 if (needs_wakeup)
2516 wake_up_interruptible(&ksm_thread_wait);
2517
2518 return 0;
2519}
2520
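/*
 * __ksm_exit - called on mm teardown: free the mm_slot if ksmd is not
 * currently scanning it, otherwise leave it for ksmd to clean up, and
 * take mmap_sem briefly so ksmd cannot still be touching the exiting mm.
 */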
2521void __ksm_exit(struct mm_struct *mm)
2522{
2523 struct mm_slot *mm_slot;
2524 int easy_to_free = 0;
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535 spin_lock(&ksm_mmlist_lock);
2536 mm_slot = get_mm_slot(mm);
2537 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
2538 if (!mm_slot->rmap_list) {
2539 hash_del(&mm_slot->link);
2540 list_del(&mm_slot->mm_list);
2541 easy_to_free = 1;
2542 } else {
2543 list_move(&mm_slot->mm_list,
2544 &ksm_scan.mm_slot->mm_list);
2545 }
2546 }
2547 spin_unlock(&ksm_mmlist_lock);
2548
2549 if (easy_to_free) {
2550 free_mm_slot(mm_slot);
2551 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2552 mmdrop(mm);
2553 } else if (mm_slot) {
2554 down_write(&mm->mmap_sem);
2555 up_write(&mm->mmap_sem);
2556 }
2557}
2558
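/*
 * ksm_might_need_to_copy - when swapping in, a page that was ksm (or whose
 * anon_vma no longer matches this vma) cannot simply be mapped back in:
 * return a fresh copy for this vma, or the original page when reuse is safe.
 */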
2559struct page *ksm_might_need_to_copy(struct page *page,
2560 struct vm_area_struct *vma, unsigned long address)
2561{
2562 struct anon_vma *anon_vma = page_anon_vma(page);
2563 struct page *new_page;
2564
2565 if (PageKsm(page)) {
2566 if (page_stable_node(page) &&
2567 !(ksm_run & KSM_RUN_UNMERGE))
2568 return page;
2569 } else if (!anon_vma) {
2570 return page;
2571 } else if (anon_vma->root == vma->anon_vma->root &&
2572 page->index == linear_page_index(vma, address)) {
2573 return page;
2574 }
2575 if (!PageUptodate(page))
2576 return page;
2577
2578 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2579 if (new_page) {
2580 copy_user_highpage(new_page, page, address, vma);
2581
2582 SetPageDirty(new_page);
2583 __SetPageUptodate(new_page);
2584 __SetPageLocked(new_page);
2585 }
2586
2587 return new_page;
2588}
2589
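/*
 * rmap_walk_ksm - rmap walk over a ksm page: visit every vma in which the
 * stable node's rmap_items say the page is mapped, calling rwc->rmap_one
 * on each, including vmas of forked children on a second pass.
 */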
2590void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
2591{
2592 struct stable_node *stable_node;
2593 struct rmap_item *rmap_item;
2594 int search_new_forks = 0;
2595
2596 VM_BUG_ON_PAGE(!PageKsm(page), page);
2597
2598
2599
2600
2601
2602 VM_BUG_ON_PAGE(!PageLocked(page), page);
2603
2604 stable_node = page_stable_node(page);
2605 if (!stable_node)
2606 return;
2607again:
2608 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2609 struct anon_vma *anon_vma = rmap_item->anon_vma;
2610 struct anon_vma_chain *vmac;
2611 struct vm_area_struct *vma;
2612
2613 cond_resched();
2614 anon_vma_lock_read(anon_vma);
2615 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2616 0, ULONG_MAX) {
2617 unsigned long addr;
2618
2619 cond_resched();
2620 vma = vmac->vma;
2621
2622
2623 addr = rmap_item->address & ~KSM_FLAG_MASK;
2624
2625 if (addr < vma->vm_start || addr >= vma->vm_end)
2626 continue;
2627
2628
2629
2630
2631
2632
2633 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2634 continue;
2635
2636 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2637 continue;
2638
2639 if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
2640 anon_vma_unlock_read(anon_vma);
2641 return;
2642 }
2643 if (rwc->done && rwc->done(page)) {
2644 anon_vma_unlock_read(anon_vma);
2645 return;
2646 }
2647 }
2648 anon_vma_unlock_read(anon_vma);
2649 }
2650 if (!search_new_forks++)
2651 goto again;
2652}
2653
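/*
 * reuse_ksm_page - at write fault time, try to take exclusive ownership of
 * a ksm page that is only mapped once, converting it back to a normal
 * anonymous page for this vma instead of copying it.
 */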
2654bool reuse_ksm_page(struct page *page,
2655 struct vm_area_struct *vma,
2656 unsigned long address)
2657{
2658#ifdef CONFIG_DEBUG_VM
2659 if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
2660 WARN_ON(!page_mapped(page)) ||
2661 WARN_ON(!PageLocked(page))) {
2662 dump_page(page, "reuse_ksm_page");
2663 return false;
2664 }
2665#endif
2666
2667 if (PageSwapCache(page) || !page_stable_node(page))
2668 return false;
2669
2670 if (!page_ref_freeze(page, 1))
2671 return false;
2672
2673 page_move_anon_rmap(page, vma);
2674 page->index = linear_page_index(vma, address);
2675 page_ref_unfreeze(page, 1);
2676
2677 return true;
2678}
2679#ifdef CONFIG_MIGRATION
2680void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2681{
2682 struct stable_node *stable_node;
2683
2684 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
2685 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
2686 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
2687
2688 stable_node = page_stable_node(newpage);
2689 if (stable_node) {
2690 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
2691 stable_node->kpfn = page_to_pfn(newpage);
2692
2693
2694
2695
2696
2697
2698 smp_wmb();
2699 set_page_stable_node(oldpage, NULL);
2700 }
2701}
2702#endif
2703
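/*
 * Memory hotremove support: while a range is going offline, ksmd is held
 * off (KSM_RUN_OFFLINE), and once the range is offline any stable nodes
 * whose pages fell inside it are removed from the stable tree.
 */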
2704#ifdef CONFIG_MEMORY_HOTREMOVE
2705static void wait_while_offlining(void)
2706{
2707 while (ksm_run & KSM_RUN_OFFLINE) {
2708 mutex_unlock(&ksm_thread_mutex);
2709 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2710 TASK_UNINTERRUPTIBLE);
2711 mutex_lock(&ksm_thread_mutex);
2712 }
2713}
2714
2715static bool stable_node_dup_remove_range(struct stable_node *stable_node,
2716 unsigned long start_pfn,
2717 unsigned long end_pfn)
2718{
2719 if (stable_node->kpfn >= start_pfn &&
2720 stable_node->kpfn < end_pfn) {
2721
2722
2723
2724
2725 remove_node_from_stable_tree(stable_node);
2726 return true;
2727 }
2728 return false;
2729}
2730
2731static bool stable_node_chain_remove_range(struct stable_node *stable_node,
2732 unsigned long start_pfn,
2733 unsigned long end_pfn,
2734 struct rb_root *root)
2735{
2736 struct stable_node *dup;
2737 struct hlist_node *hlist_safe;
2738
2739 if (!is_stable_node_chain(stable_node)) {
2740 VM_BUG_ON(is_stable_node_dup(stable_node));
2741 return stable_node_dup_remove_range(stable_node, start_pfn,
2742 end_pfn);
2743 }
2744
2745 hlist_for_each_entry_safe(dup, hlist_safe,
2746 &stable_node->hlist, hlist_dup) {
2747 VM_BUG_ON(!is_stable_node_dup(dup));
2748 stable_node_dup_remove_range(dup, start_pfn, end_pfn);
2749 }
2750 if (hlist_empty(&stable_node->hlist)) {
2751 free_stable_node_chain(stable_node, root);
2752 return true;
2753 } else
2754 return false;
2755}
2756
2757static void ksm_check_stable_tree(unsigned long start_pfn,
2758 unsigned long end_pfn)
2759{
2760 struct stable_node *stable_node, *next;
2761 struct rb_node *node;
2762 int nid;
2763
2764 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2765 node = rb_first(root_stable_tree + nid);
2766 while (node) {
2767 stable_node = rb_entry(node, struct stable_node, node);
2768 if (stable_node_chain_remove_range(stable_node,
2769 start_pfn, end_pfn,
2770 root_stable_tree +
2771 nid))
2772 node = rb_first(root_stable_tree + nid);
2773 else
2774 node = rb_next(node);
2775 cond_resched();
2776 }
2777 }
2778 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
2779 if (stable_node->kpfn >= start_pfn &&
2780 stable_node->kpfn < end_pfn)
2781 remove_node_from_stable_tree(stable_node);
2782 cond_resched();
2783 }
2784}
2785
2786static int ksm_memory_callback(struct notifier_block *self,
2787 unsigned long action, void *arg)
2788{
2789 struct memory_notify *mn = arg;
2790
2791 switch (action) {
2792 case MEM_GOING_OFFLINE:
2793
2794
2795
2796
2797
2798
2799
2800 mutex_lock(&ksm_thread_mutex);
2801 ksm_run |= KSM_RUN_OFFLINE;
2802 mutex_unlock(&ksm_thread_mutex);
2803 break;
2804
2805 case MEM_OFFLINE:
2806
2807
2808
2809
2810
2811
2812
2813 ksm_check_stable_tree(mn->start_pfn,
2814 mn->start_pfn + mn->nr_pages);
2815
2816
2817 case MEM_CANCEL_OFFLINE:
2818 mutex_lock(&ksm_thread_mutex);
2819 ksm_run &= ~KSM_RUN_OFFLINE;
2820 mutex_unlock(&ksm_thread_mutex);
2821
2822 smp_mb();
2823 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
2824 break;
2825 }
2826 return NOTIFY_OK;
2827}
2828#else
2829static void wait_while_offlining(void)
2830{
2831}
2832#endif
2833
2834#ifdef CONFIG_SYSFS
2835
2836
2837
2838
2839#define KSM_ATTR_RO(_name) \
2840 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2841#define KSM_ATTR(_name) \
2842 static struct kobj_attribute _name##_attr = \
2843 __ATTR(_name, 0644, _name##_show, _name##_store)
2844
static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;
	wake_up_interruptible(&ksm_iter_wait);

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = kstrtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);
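
/*
 * Illustrative userspace usage of the tunables above (editorial sketch, not
 * part of the original source): once the attribute group defined at the end
 * of this file is registered, these knobs appear under /sys/kernel/mm/ksm.
 * Only VMAs opted in with madvise(MADV_MERGEABLE) are ever scanned.
 *
 *	# tune how aggressively ksmd scans
 *	echo 100 > /sys/kernel/mm/ksm/pages_to_scan
 *	echo 20  > /sys/kernel/mm/ksm/sleep_millisecs
 *	# start merging
 *	echo 1 > /sys/kernel/mm/ksm/run
 *	# stop ksmd and unmerge (break COW on) everything it merged
 *	echo 2 > /sys/kernel/mm/ksm/run
 */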

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;

			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* kcalloc() zeroes the roots, so each tree is empty */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif


static ssize_t use_zero_pages_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_use_zero_pages);
}

static ssize_t use_zero_pages_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return -EINVAL;

	ksm_use_zero_pages = value;

	return count;
}
KSM_ATTR(use_zero_pages);

static ssize_t max_page_sharing_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_max_page_sharing);
}

static ssize_t max_page_sharing_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	int knob;

	err = kstrtoint(buf, 10, &knob);
	if (err)
		return err;

	/*
	 * When a KSM page is created it is shared by 2 mappings. This
	 * being a signed comparison, it implicitly verifies it's not
	 * negative.
	 */
	if (knob < 2)
		return -EINVAL;

	if (READ_ONCE(ksm_max_page_sharing) == knob)
		return count;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_max_page_sharing != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else
			ksm_max_page_sharing = knob;
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(max_page_sharing);
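
/*
 * Editorial note on the knob above (illustrative, not from the original
 * source): max_page_sharing bounds how many rmap_items may hang off a
 * single stable_node dup before a new dup is started in a stable_node
 * chain. As a rough worked example, with the default limit of 256, a page
 * deduplicated across 1000 mappings would end up backed by on the order of
 * ceil(1000 / 256) = 4 stable_node dups, keeping rmap walks over any one
 * KSM page bounded.
 */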

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t stable_node_dups_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);

static ssize_t stable_node_chains_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);

static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
					struct kobj_attribute *attr,
					char *buf)
{
	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}

static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_stable_node_chains_prune_millisecs = msecs;

	return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	NULL,
};

static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);

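/*
 * Editorial addition (not part of the original file): a minimal userspace
 * sketch of how a process opts its memory in to the scanner started by
 * ksm_init() above. Only the madvise(MADV_MERGEABLE) call is the KSM API;
 * the mapping size and fill pattern are arbitrary example values.
 *
 *	#include <sys/mman.h>
 *	#include <string.h>
 *
 *	int main(void)
 *	{
 *		size_t len = 64 << 20;		// 64MB anonymous mapping
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *		if (p == MAP_FAILED)
 *			return 1;
 *		memset(p, 0xaa, len);		// identical page contents
 *		// Ask ksmd to consider this range; pages are only merged
 *		// while /sys/kernel/mm/ksm/run is set to 1.
 *		return madvise(p, len, MADV_MERGEABLE);
 *	}
 */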