// SPDX-License-Identifier: GPL-2.0-only
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 */
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The stable tree node includes information required for reverse
 * mapping from a KSM page to virtual addresses that map this page.
 *
 * In order to avoid large latencies of the rmap walks on KSM pages,
 * KSM maintains two types of nodes in the stable tree:
 *
 * * the regular nodes that keep the reverse mapping structures in a
 *   linked list
 * * the "chains" that link nodes ("dups") that represent the same
 *   write protected memory content, but each "dup" corresponds to a
 *   different KSM page copy of that content
 *
 * Internally, the regular nodes, "dups" and "chains" are represented
 * using the same struct stable_node structure.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

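/*
 * Set up the three slab caches (rmap_item, stable_node, mm_slot) that back
 * all KSM metadata allocations; on failure everything already created is
 * torn down and -ENOMEM is returned.
 */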
static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

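/*
 * Link a dup into a stable_node chain: the dup's head is set to the
 * magic STABLE_NODE_DUP_HEAD marker so is_stable_node_dup() can later
 * recognize it, and it is queued on the chain's hlist.
 */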
static inline void stable_node_chain_add_dup(struct stable_node *dup,
					     struct stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is
	 * under pressure, which may lead to hung task warnings.  Adding
	 * __GFP_HIGH grants access to memory reserves, helping the
	 * allocation to succeed quickly.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

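/*
 * The mm_slots hash allows fast mm_struct -> mm_slot lookup, keyed by the
 * mm pointer value itself; collisions are resolved by walking the bucket.
 */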
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_lock briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem, where we would not want to touch it.
 *
 * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.  We also do not want to enforce
 * protection keys here anyway.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	vm_fault_t ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
					      NULL);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS |
			  VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but the error
	 * is pretty unlikely, and the difference is unlikely to matter.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = vma_lookup(mm, addr);
	if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

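/*
 * break_cow() is used to undo a merge we no longer want, e.g. when the
 * partner page could not be merged after all: fault in a writable copy
 * at this mm,address so the process stops sharing the KSM page.
 */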
static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	mmap_read_unlock(mm);
}

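/*
 * Look up the page at rmap_item's mm,address and return it with a
 * reference held, but only if it is still anonymous in a mergeable vma;
 * otherwise return NULL.
 */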
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	mmap_read_unlock(mm);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
						   struct rb_root *root)
{
	struct stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = NUMA_NO_NODE; /* debug */
#endif
		ksm_stable_node_chains++;

		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree and at the same time remove the old
		 * stable node.
		 */
		rb_replace_node(&dup->node, &chain->node, root);

		/*
		 * Move the old stable node to the second dimension
		 * queued in the hlist_dup. The invariant is that all
		 * dup stable_nodes in the chain->hlist point to pages
		 * that are write protected and have the exact same
		 * content.
		 */
		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}

static inline void free_stable_node_chain(struct stable_node *chain,
					  struct rb_root *root)
{
	rb_erase(&chain->node, root);
	free_stable_node(chain);
	ksm_stable_node_chains--;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	/* check it's not STABLE_NODE_CHAIN or negative */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	/*
	 * We need the second aligned pointer of the migrate_nodes
	 * list_head to stay clear from the rb_parent_color union
	 * (aligned and different than any node) and also different
	 * from &migrate_nodes. This will verify that future list.h
	 * changes don't break STABLE_NODE_DUP_HEAD.
	 */
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}

enum get_ksm_page_flags {
	GET_KSM_PAGE_NOLOCK,
	GET_KSM_PAGE_LOCK,
	GET_KSM_PAGE_TRYLOCK
};

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unpleasant.  So instead we hold no
 * reference at all: the page may be freed or migrated at any time, and
 * we detect that afterwards by checking that page->mapping still points
 * back at this stable node (and that stable_node->kpfn was not changed
 * by concurrent page migration).
 */
static struct page *get_ksm_page(struct stable_node *stable_node,
				 enum get_ksm_page_flags flags)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_ref_freeze().
	 * The __remove_mapping() case is easy, the node is now stale;
	 * but if page is swapcache in migrate_page_move_mapping(), it might
	 * still be our page, in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * in the ref_freeze section of __remove_mapping(); but if
		 * page is swapcache in migrate_page_move_mapping(), it
		 * might still be our page, in which case it's essential
		 * to keep the node.
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (flags == GET_KSM_PAGE_TRYLOCK) {
		if (!trylock_page(page)) {
			put_page(page);
			return ERR_PTR(-EBUSY);
		}
	} else if (flags == GET_KSM_PAGE_LOCK)
		lock_page(page);

	if (flags != GET_KSM_PAGE_NOLOCK) {
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->head = NULL;
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
	return PageKsm(page) ? page_rmapping(page) : NULL;
}

static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	/*
	 * Page could be still mapped if this races with __mmput() running in
	 * between ksm_exit() and exit_mmap(): in that case the page cannot
	 * be unmerged here, so report the node as busy.
	 */
	err = -EBUSY;
	if (!page_mapped(page)) {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page to be recognized
		 * right up until it is freed; but the node is safe to remove.
		 * This page might be in a pagevec waiting to be freed,
		 * or it might be PageSwapCache (perhaps under writeback),
		 * or it might have been removed from swapcache a moment ago.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

static int remove_stable_node_chain(struct stable_node *stable_node,
				    struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		mmap_read_lock(mm);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(&mm_slot->rmap_list);
		mmap_read_unlock(mm);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	mmap_read_unlock(mm);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

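/*
 * Hash the page contents with xxhash; an unchanged checksum between two
 * scans is what qualifies a page for the unstable tree.
 */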
static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = xxhash(addr, PAGE_SIZE, 0);
	kunmap_atomic(addr);
	return checksum;
}

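/*
 * write_protect_page - make the pte of this page read-only so its contents
 * can no longer change under us, saving the original pte in *orig_pte.
 * Fails with -EFAULT if the page is not mapped where expected or if a
 * raised page count suggests concurrent users (e.g. O_DIRECT).
 */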
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
	};
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
	    mm_tlb_flush_pending(mm)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * This is tricky: an O_DIRECT read (for example) can be set
		 * up on the page at any moment while it is still writable.
		 * So clear the pte and flush the TLB *before* the page_count
		 * check below: that assures us no O_DIRECT can begin after
		 * the check, or be in flight in the middle of the check,
		 * via a stale writable TLB entry on another CPU.
		 *
		 * No need to notify as we are downgrading the page table to
		 * read-only, not changing it to point to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);

		if (pte_protnone(entry))
			entry = pte_mkclean(pte_clear_savedwrite(entry));
		else
			entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	*orig_pte = *pvmw.pte;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, false);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
					       vma->vm_page_prot));
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;			/* nothing to do */

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_lock */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	mmap_read_unlock(mm);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

static __always_inline
bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
{
	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
	/*
	 * Check that at least one mapping still exists, otherwise
	 * there's no much point to merge and share with this
	 * stable_node, as the underlying tree_page of the other
	 * sharer is going to be freed soon.
	 */
	return stable_node->rmap_hlist_len &&
		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

static __always_inline
bool is_page_sharing_candidate(struct stable_node *stable_node)
{
	return __is_page_sharing_candidate(stable_node, 0);
}

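/*
 * Walk all dups on a stable_node chain, returning the page of the best
 * candidate dup (the fullest one still below ksm_max_page_sharing) and,
 * at most once every ksm_stable_node_chains_prune_millisecs, pruning any
 * stale dups encountered along the way.
 */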
static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
				    struct stable_node **_stable_node,
				    struct rb_root *root,
				    bool prune_stale_stable_nodes)
{
	struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
	struct hlist_node *hlist_safe;
	struct page *_tree_page, *tree_page = NULL;
	int nr = 0;
	int found_rmap_hlist_len;

	if (!prune_stale_stable_nodes ||
	    time_before(jiffies, stable_node->chain_prune_time +
			msecs_to_jiffies(
				ksm_stable_node_chains_prune_millisecs)))
		prune_stale_stable_nodes = false;
	else
		stable_node->chain_prune_time = jiffies;

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		cond_resched();
		/*
		 * We must walk all stable_node_dups to prune the stale
		 * stable nodes during lookup.
		 *
		 * get_ksm_page can drop the nodes from the
		 * stable_node->hlist if they point to freed pages
		 * (that's why we do a _safe walk). The "dup"
		 * stable_node parameter itself will be freed from
		 * under us if it returns NULL.
		 */
		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
		if (!_tree_page)
			continue;
		nr += 1;
		if (is_page_sharing_candidate(dup)) {
			if (!found ||
			    dup->rmap_hlist_len > found_rmap_hlist_len) {
				if (found)
					put_page(tree_page);
				found = dup;
				found_rmap_hlist_len = found->rmap_hlist_len;
				tree_page = _tree_page;

				/* skip put_page for last dup */
				if (!prune_stale_stable_nodes)
					break;
				continue;
			}
		}
		put_page(_tree_page);
	}

	if (found) {
		/*
		 * nr is counting all dups in the chain only if
		 * prune_stale_stable_nodes is true, otherwise we may
		 * break the loop at nr == 1 even if there are
		 * multiple entries.
		 */
		if (prune_stale_stable_nodes && nr == 1) {
			/*
			 * If there's not just one entry it would
			 * corrupt memory, better BUG_ON. In KSM
			 * context with no lock held it's not even
			 * fatal.
			 */
			BUG_ON(stable_node->hlist.first->next);

			/*
			 * There's just one entry and it is below the
			 * deduplication limit so drop the chain.
			 */
			rb_replace_node(&stable_node->node, &found->node,
					root);
			free_stable_node(stable_node);
			ksm_stable_node_chains--;
			ksm_stable_node_dups--;
			/*
			 * NOTE: the caller depends on the stable_node
			 * to be equal to stable_node_dup if the chain
			 * was collapsed.
			 */
			*_stable_node = found;
			/*
			 * Just for robustness, as stable_node is
			 * otherwise left as a stable pointer, the
			 * compiler shall optimize it away at build
			 * time.
			 */
			stable_node = NULL;
		} else if (stable_node->hlist.first != &found->hlist_dup &&
			   __is_page_sharing_candidate(found, 1)) {
			/*
			 * If the found stable_node dup can accept one
			 * more future merge (in addition to the one
			 * that is underway) and is not at the head of
			 * the chain, put it there so next search will
			 * be quicker in the !prune_stale_stable_nodes
			 * case.
			 *
			 * NOTE: it would be inaccurate to use nr > 1
			 * instead of checking the hlist head pointer
			 * directly, because in the
			 * prune_stale_stable_nodes case "nr" isn't
			 * the number of all dups in the chain.
			 */
			hlist_del(&found->hlist_dup);
			hlist_add_head(&found->hlist_dup,
				       &stable_node->hlist);
		}
	}

	*_stable_node_dup = found;
	return tree_page;
}

static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
					       struct rb_root *root)
{
	if (!is_stable_node_chain(stable_node))
		return stable_node;
	if (hlist_empty(&stable_node->hlist)) {
		free_stable_node_chain(stable_node, root);
		return NULL;
	}
	return hlist_entry(stable_node->hlist.first,
			   typeof(*stable_node), hlist_dup);
}

/*
 * Like for get_ksm_page, this function can free the *_stable_node and
 * *_stable_node_dup if the returned tree_page is NULL.
 *
 * It can also free and overwrite *_stable_node with the found
 * stable_node_dup if the chain is collapsed (in which case
 * *_stable_node will be equal to *_stable_node_dup like if the chain
 * never existed). It's up to the caller to verify tree_page is not
 * NULL before dereferencing *_stable_node or *_stable_node_dup.
 *
 * *_stable_node and *_stable_node_dup are meaningless if tree_page is
 * NULL.
 */
static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
					struct stable_node **_stable_node,
					struct rb_root *root,
					bool prune_stale_stable_nodes)
{
	struct stable_node *stable_node = *_stable_node;
	if (!is_stable_node_chain(stable_node)) {
		if (is_page_sharing_candidate(stable_node)) {
			*_stable_node_dup = stable_node;
			return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
		}
		/*
		 * _stable_node_dup set to NULL means the stable_node
		 * reached the ksm_max_page_sharing limit.
		 */
		*_stable_node_dup = NULL;
		return NULL;
	}
	return stable_node_dup(_stable_node_dup, _stable_node, root,
			       prune_stale_stable_nodes);
}

static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
						struct stable_node **s_n,
						struct rb_root *root)
{
	return __stable_node_chain(s_n_d, s_n, root, true);
}

static __always_inline struct page *chain(struct stable_node **s_n_d,
					  struct stable_node *s_n,
					  struct rb_root *root)
{
	struct stable_node *old_stable_node = s_n;
	struct page *tree_page;

	tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
	/* not pruning dups so s_n cannot have changed */
	VM_BUG_ON(s_n != old_stable_node);
	return tree_page;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
static struct page *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
	struct stable_node *page_node;

	page_node = page_stable_node(page);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked */
		get_page(page);
		return page;
	}

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_stable_tree + nid;
again:
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		stable_node_any = NULL;
		tree_page = chain_prune(&stable_node_dup, &stable_node, root);
		/*
		 * NOTE: stable_node may have been freed by
		 * chain_prune() if the returned stable_node_dup is
		 * not NULL. stable_node_dup may have been inserted in
		 * the rbtree instead as a regular stable_node (in
		 * order to collapse the stable_node chain if a single
		 * stable_node dup was found in it). In such case the
		 * stable_node is overwritten by the callee to point
		 * to the stable_node_dup that was collapsed in the
		 * stable rbtree and stable_node will be equal to
		 * stable_node_dup like if the chain never existed.
		 */
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the tree walk
			 * continue. All KSM pages belonging to the
			 * stable_node dups in a stable_node chain
			 * have the same content and they're
			 * write protected at all times. Any of the
			 * stable_node dups pages can be used to start
			 * the tree walk, without loss of information.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			if (page_node) {
				VM_BUG_ON(page_node->head != &migrate_nodes);
				/*
				 * Test if the migrated page should be merged
				 * into a stable node dup. If the mapcount is
				 * 1 we can migrate it with another identical
				 * page if there's one.
				 */
				if (page_mapcount(page) > 1)
					goto chain_append;
			}

			if (!stable_node_dup) {
				/*
				 * If the stable_node is a chain and
				 * we got a payload match in memcmp
				 * but we cannot merge the scanned
				 * page in any of the existing
				 * stable_node dups because they're
				 * all full, we need to wait the
				 * scanned page to find itself a match
				 * in the unstable tree to create a
				 * brand new KSM page to add later to
				 * the dups of this stable_node.
				 */
				return NULL;
			}

			/*
			 * Lock and unlock the stable_node's page (which
			 * might already have been migrated) so that page
			 * migration is sure to notice its raised count.
			 * It would be more elegant to return stable_node
			 * than kpage, but that involves more changes.
			 */
			tree_page = get_ksm_page(stable_node_dup,
						 GET_KSM_PAGE_TRYLOCK);

			if (PTR_ERR(tree_page) == -EBUSY)
				return ERR_PTR(-EBUSY);

			if (unlikely(!tree_page))
				/*
				 * The tree may have been rebalanced,
				 * so re-evaluate parent and new.
				 */
				goto again;
			unlock_page(tree_page);

			if (get_kpfn_nid(stable_node_dup->kpfn) !=
			    NUMA(stable_node_dup->nid)) {
				put_page(tree_page);
				goto replace;
			}
			return tree_page;
		}
	}

	if (!page_node)
		return NULL;

	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
out:
	if (is_page_sharing_candidate(page_node)) {
		get_page(page);
		return page;
	} else
		return NULL;

replace:
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* there is no chain */
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			rb_replace_node(&stable_node_dup->node,
					&page_node->node,
					root);
			if (is_page_sharing_candidate(page_node))
				get_page(page);
			else
				page = NULL;
		} else {
			rb_erase(&stable_node_dup->node, root);
			page = NULL;
		}
	} else {
		VM_BUG_ON(!is_stable_node_chain(stable_node));
		__stable_node_dup_del(stable_node_dup);
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			stable_node_chain_add_dup(page_node, stable_node);
			if (is_page_sharing_candidate(page_node))
				get_page(page);
			else
				page = NULL;
		} else {
			page = NULL;
		}
	}
	stable_node_dup->head = &migrate_nodes;
	list_add(&stable_node_dup->list, stable_node_dup->head);
	return page;

chain_append:
	/* stable_node_dup could be null if it reached the limit */
	if (!stable_node_dup)
		stable_node_dup = stable_node_any;
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* chain is missing so create it */
		stable_node = alloc_stable_node_chain(stable_node_dup,
						      root);
		if (!stable_node)
			return NULL;
	}
	/*
	 * Add this stable_node dup that was
	 * migrated to the stable_node chain
	 * of the current nid for this page
	 * content.
	 */
	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
	VM_BUG_ON(page_node->head != &migrate_nodes);
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	stable_node_chain_add_dup(page_node, stable_node);
	goto out;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
	bool need_chain = false;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
again:
	parent = NULL;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		stable_node_any = NULL;
		tree_page = chain(&stable_node_dup, stable_node, root);
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the tree walk
			 * continue. All KSM pages belonging to the
			 * stable_node dups in a stable_node chain
			 * have the same content and they're
			 * write protected at all times. Any of the
			 * stable_node dups pages can be used to start
			 * the tree walk, without loss of information.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			need_chain = true;
			break;
		}
	}

	stable_node_dup = alloc_stable_node();
	if (!stable_node_dup)
		return NULL;

	INIT_HLIST_HEAD(&stable_node_dup->hlist);
	stable_node_dup->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node_dup);
	stable_node_dup->rmap_hlist_len = 0;
	DO_NUMA(stable_node_dup->nid = nid);
	if (!need_chain) {
		rb_link_node(&stable_node_dup->node, parent, new);
		rb_insert_color(&stable_node_dup->node, root);
	} else {
		if (!is_stable_node_chain(stable_node)) {
			struct stable_node *orig = stable_node;
			/* chain is missing so create it */
			stable_node = alloc_stable_node_chain(orig, root);
			if (!stable_node) {
				free_stable_node(stable_node_dup);
				return NULL;
			}
		}
		stable_node_chain_add_dup(stable_node_dup, stable_node);
	}

	return stable_node_dup;
}

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns pointer to rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
{
	struct rb_node **new;
	struct rb_root *root;
	struct rb_node *parent = NULL;
	int nid;

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_unstable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (!tree_page)
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else if (!ksm_merge_across_nodes &&
			   page_to_nid(tree_page) != nid) {
			/*
			 * If tree_page has been migrated to another NUMA node,
			 * it will be flushed out and put in the right unstable
			 * tree next time: only merge with it when across_nodes.
			 */
			put_page(tree_page);
			return NULL;
		} else {
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	DO_NUMA(rmap_item->nid = nid);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, root);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node,
			       bool max_page_sharing_bypass)
{
	/*
	 * rmap won't find this mapping if we don't insert the
	 * rmap_item in the right stable_node
	 * duplicate. page_migration could break later if rmap breaks,
	 * so we can as well crash here. We really need to check for
	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
	 * for other negative values as an underflow if detected here
	 * for the first time (and not when decreasing rmap_hlist_len)
	 * would be sign of memory corruption in the stable_node.
	 */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	stable_node->rmap_hlist_len++;
	if (!max_page_sharing_bypass)
		/* possibly non fatal but unexpected overflow, only warn */
		WARN_ON_ONCE(stable_node->rmap_hlist_len >
			     ksm_max_page_sharing);

	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;
	bool max_page_sharing_bypass = false;

	stable_node = page_stable_node(page);
	if (stable_node) {
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
		    NUMA(stable_node->nid)) {
			stable_node_dup_del(stable_node);
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
		/*
		 * If it's a KSM fork, allow it to go over the sharing limit
		 * without warnings.
		 */
		if (!is_page_sharing_candidate(stable_node))
			max_page_sharing_bypass = true;
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		if (PTR_ERR(kpage) == -EBUSY)
			return;

		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage),
					   max_page_sharing_bypass);
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	/*
	 * Same checksum as an empty page. We attempt to merge it with the
	 * appropriate zero page if the user enabled this via sysfs.
	 */
	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
		struct vm_area_struct *vma;

		mmap_read_lock(mm);
		vma = find_mergeable_vma(mm, rmap_item->address);
		if (vma) {
			err = try_to_merge_one_page(vma, page,
					ZERO_PAGE(rmap_item->address));
		} else {
			/*
			 * If the vma is out of date, we do not need to
			 * continue.
			 */
			err = 0;
		}
		mmap_read_unlock(mm);
		/*
		 * In case of failure, the page was not really empty, so we
		 * need to continue. Otherwise we're done.
		 */
		if (!err)
			return;
	}
	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		bool split;

		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		/*
		 * If both pages we tried to merge belong to the same compound
		 * page, then we actually ended up increasing the reference
		 * count of the same compound page twice, and split_huge_page
		 * failed.
		 * Here we set a flag if that happened, and we use it later to
		 * try split_huge_page again. Since we call put_page right
		 * away, the reference count will be correct and
		 * split_huge_page should succeed.
		 */
		split = PageTransCompound(page)
			&& compound_head(page) == compound_head(tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were successfully merged: insert new
			 * node in the stable tree and add both rmap_items.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node,
						   false);
				stable_tree_append(rmap_item, stable_node,
						   false);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		} else if (split) {
			/*
			 * We are here if we tried to merge two pages and
			 * failed because they both belonged to the same
			 * compound page. We will split the page now, but no
			 * merging will take place.
			 * We do not want to add the cost of a full lock; if
			 * the page is locked, it is better to skip it and
			 * perhaps try again later.
			 */
			if (!trylock_page(page))
				return;
			split_huge_page(page);
			unlock_page(page);
		}
	}
}

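/*
 * Return the rmap_item tracking this mm,address, reusing an existing one
 * from the mm_slot's sorted rmap_list when possible, discarding any stale
 * items that precede it, or allocating a fresh (zeroed) one.
 */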
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;
	int nid;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/*
		 * A number of pages can hang around indefinitely on per-cpu
		 * pagevecs, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		/*
		 * Whereas stale stable_nodes on the stable_tree itself
		 * get pruned in the regular course of stable_tree_search(),
		 * those moved out to the migrate_nodes list can accumulate:
		 * so prune them once before each full scan.
		 */
		if (!ksm_merge_across_nodes) {
			struct stable_node *stable_node, *next;
			struct page *page;

			list_for_each_entry_safe(stable_node, next,
						 &migrate_nodes, list) {
				page = get_ksm_page(stable_node,
						    GET_KSM_PAGE_NOLOCK);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}

	mm = slot->mm;
	mmap_read_lock(mm);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				mmap_read_unlock(mm);
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_lock
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_lock then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmap_read_unlock(mm);
		mmdrop(mm);
	} else {
		mmap_read_unlock(mm);
		/*
		 * mmap_read_unlock(mm) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
		 */
		spin_unlock(&ksm_mmlist_lock);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *page;

	while (scan_npages-- && likely(!freezing(current))) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		cmp_and_merge_page(page, rmap_item);
		put_page(page);
	}
}

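/* ksmd only runs when set to KSM_RUN_MERGE and there is work queued */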
static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
	unsigned int sleep_ms;

	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
			wait_event_interruptible_timeout(ksm_iter_wait,
				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
				msecs_to_jiffies(sleep_ms));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

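/*
 * Entry point for madvise(addr, length, MADV_MERGEABLE/MADV_UNMERGEABLE):
 * registers or unregisters a vma range for scanning by ksmd.
 */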
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_HUGETLB   | VM_MIXEDMAP))
			return 0;		/* just ignore the advice */

		if (vma_is_dax(vma))
			return 0;

#ifdef VM_SAO
		if (*vm_flags & VM_SAO)
			return 0;
#endif
#ifdef VM_SPARC_ADI
		if (*vm_flags & VM_SPARC_ADI)
			return 0;
#endif

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ksm_madvise);

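/*
 * Called on a process's first MADV_MERGEABLE: allocate and enlist its
 * mm_slot, pin the mm with mmgrab(), and wake ksmd if it was idle.
 */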
int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
	else
		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	mmgrab(mm);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_lock to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */
	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}
}

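/*
 * When swapping in a page that may still be a KSM page (or whose anon_vma
 * no longer fits the faulting vma), callers use this to get a private copy
 * they are allowed to map writably.
 */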
struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (anon_vma->root == vma->anon_vma->root &&
		 page->index == linear_page_index(vma, address)) {
		return page;		/* still no need to copy it */
	}
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
		put_page(new_page);
		new_page = NULL;
	}
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__SetPageLocked(new_page);
	}

	return new_page;
}

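/*
 * rmap walk for KSM pages: visit every mm,address where the stable node's
 * rmap_items say the page is (or may, in forked mms, be) mapped.
 */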
void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int search_new_forks = 0;

	VM_BUG_ON_PAGE(!PageKsm(page), page);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	stable_node = page_stable_node(page);
	if (!stable_node)
		return;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		cond_resched();
		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			unsigned long addr;

			cond_resched();
			vma = vmac->vma;

			/* Ignore the stable/unstable/sqnr flags */
			addr = rmap_item->address & PAGE_MASK;

			if (addr < vma->vm_start || addr >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
			if (rwc->done && rwc->done(page)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
}

#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
	struct stable_node *stable_node;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);

	stable_node = page_stable_node(newpage);
	if (stable_node) {
		VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
		stable_node->kpfn = page_to_pfn(newpage);
		/*
		 * newpage->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that oldpage->mapping
		 * has gone stale (or that PageSwapCache has been cleared).
		 */
		smp_wmb();
		set_page_stable_node(oldpage, NULL);
	}
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
			    TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}

static bool stable_node_dup_remove_range(struct stable_node *stable_node,
					 unsigned long start_pfn,
					 unsigned long end_pfn)
{
	if (stable_node->kpfn >= start_pfn &&
	    stable_node->kpfn < end_pfn) {
		/*
		 * Don't get_ksm_page, page has already gone:
		 * which is why we keep kpfn instead of page*
		 */
		remove_node_from_stable_tree(stable_node);
		return true;
	}
	return false;
}

static bool stable_node_chain_remove_range(struct stable_node *stable_node,
					   unsigned long start_pfn,
					   unsigned long end_pfn,
					   struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		return stable_node_dup_remove_range(stable_node, start_pfn,
						    end_pfn);
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
	}
	if (hlist_empty(&stable_node->hlist)) {
		free_stable_node_chain(stable_node, root);
		return true;
	} else
		return false;
}

static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct stable_node *stable_node, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct stable_node, node);
			if (stable_node_chain_remove_range(stable_node,
							   start_pfn, end_pfn,
							   root_stable_tree +
							   nid))
				node = rb_first(root_stable_tree + nid);
			else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}

static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		fallthrough;
	case MEM_CANCEL_OFFLINE:
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
#else
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned int msecs;
	int err;

	err = kstrtouint(buf, 10, &msecs);
	if (err)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;
	wake_up_interruptible(&ksm_iter_wait);

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	unsigned int nr_pages;
	int err;

	err = kstrtouint(buf, 10, &nr_pages);
	if (err)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);
2871
2872static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
2873 char *buf)
2874{
2875 return sysfs_emit(buf, "%lu\n", ksm_run);
2876}
2877
2878static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
2879 const char *buf, size_t count)
2880{
2881 unsigned int flags;
2882 int err;
2883
2884 err = kstrtouint(buf, 10, &flags);
2885 if (err)
2886 return -EINVAL;
2887 if (flags > KSM_RUN_UNMERGE)
2888 return -EINVAL;
2889
2890
2891
2892
2893
2894
2895
2896
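	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */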
	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
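			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Test nr_node_ids instead of merge_across_nodes, to
			 * catch the case where they've both been modified.
			 */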
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
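				/* Stable tree is empty but not the unstable */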
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif

static ssize_t use_zero_pages_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
}

static ssize_t use_zero_pages_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return -EINVAL;

	ksm_use_zero_pages = value;

	return count;
}
KSM_ATTR(use_zero_pages);

static ssize_t max_page_sharing_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
}

static ssize_t max_page_sharing_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	int knob;

	err = kstrtoint(buf, 10, &knob);
	if (err)
		return err;
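	/*
	 * When a KSM page is created it is shared by 2 mappings. This
	 * being a signed comparison, it implicitly verifies it's not
	 * negative.
	 */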
	if (knob < 2)
		return -EINVAL;

	if (READ_ONCE(ksm_max_page_sharing) == knob)
		return count;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_max_page_sharing != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else
			ksm_max_page_sharing = knob;
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(max_page_sharing);

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
			   - ksm_pages_sharing - ksm_pages_unshared;
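	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative.
	 */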
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t stable_node_dups_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);

static ssize_t stable_node_chains_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);

static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
					struct kobj_attribute *attr,
					char *buf)
{
	return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}

static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	unsigned int msecs;
	int err;

	err = kstrtouint(buf, 10, &msecs);
	if (err)
		return -EINVAL;

	ksm_stable_node_chains_prune_millisecs = msecs;

	return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

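/*
 * The attributes below are published under /sys/kernel/mm/ksm/ (see
 * Documentation/admin-guide/mm/ksm.rst).  A typical session from
 * userspace might look like:
 *
 *	echo 100 > /sys/kernel/mm/ksm/pages_to_scan
 *	echo 20  > /sys/kernel/mm/ksm/sleep_millisecs
 *	echo 1   > /sys/kernel/mm/ksm/run
 *	cat /sys/kernel/mm/ksm/pages_sharing
 */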
static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	NULL,
};

static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

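	/* The correct value depends on page size and endianness */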
	zero_checksum = calc_checksum(ZERO_PAGE(0));
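	/* Default to false for backwards compatibility */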
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
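	/* There is no significance to this priority 100 */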
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);