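/*
 * Memory merging support (KSM - Kernel Samepage Merging).
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, as produced by madvise(MADV_MERGEABLE): a scanner thread
 * compares page contents and replaces duplicates with a single
 * write-protected copy, breaking the sharing again on write faults.
 */
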
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif
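
/*
 * KSM scanning overview:
 *
 * ksmd walks the registered VM_MERGEABLE areas and keeps candidate pages in
 * two red-black trees (one pair per NUMA node when merge_across_nodes is
 * disabled):
 *
 *   - the "stable" tree holds pages that have already been merged: each node
 *     is a write-protected KSM page, indexed by its contents, with an hlist
 *     of rmap_items recording every pte that maps it;
 *   - the "unstable" tree holds not-yet-merged pages, also indexed by
 *     contents; it is rebuilt from scratch on every scan pass, since the
 *     pages in it may still be written to at any time.
 *
 * A page is only inserted into the unstable tree once its checksum has been
 * seen unchanged across two passes, which filters out rapidly changing
 * pages.  When two identical pages are found, one is promoted into the
 * stable tree and the other pte is remapped to it.
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head of this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */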
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};
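
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap_item of this mm_slot's rmap_list
 * @seqnr: count of completed full scans (needed to age unstable tree nodes)
 */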
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};
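
/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection (chains only)
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */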
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN is stored in rmap_hlist_len to mark a "chain"
	 * node, which only collects duplicate stable_nodes sharing the same
	 * page contents; any negative value outside the valid length range
	 * would do.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};
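
/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */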
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from &stable_node->hlist */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */
#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

static inline void stable_node_chain_add_dup(struct stable_node *dup,
					     struct stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is
	 * under pressure, which may lead to hung task warnings.  __GFP_HIGH
	 * grants access to memory reserves, helping to avoid this problem.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit(): ksmd checks for an
 * exiting mm and backs off, leaving the teardown to exit_mmap().
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}
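
/*
 * break_ksm() breaks COW on a KSM page at (vma, addr): a write fault gives
 * this mm its own anonymous copy, so the page is no longer shared.  The loop
 * retries while follow_page() still finds a page and the fault handler has
 * not yet either broken the sharing or reported a fatal condition.
 */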
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	vm_fault_t ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * Only an out-of-memory fault is reported to the caller; any other
	 * outcome (sharing broken, page gone, address unmapped) is treated
	 * as success, since there is nothing left for the caller to do.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}
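
/*
 * Return the NUMA node id of the stable/unstable tree that a page with this
 * pfn belongs to: always node 0 when merging across nodes is allowed.
 */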
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
						   struct rb_root *root)
{
	struct stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined(CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = -1;	/* debug */
#endif
		ksm_stable_node_chains++;

		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree and at the same time remove the old
		 * stable node.
		 */
		rb_replace_node(&dup->node, &chain->node, root);

		/*
		 * Move the old stable node to the second dimension,
		 * queued in the chain's hlist_dup.  The invariant is that
		 * all dup stable_nodes in the chain->hlist point to pages
		 * that are write protected and have the exact same content.
		 */
		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}

static inline void free_stable_node_chain(struct stable_node *chain,
					  struct rb_root *root)
{
	rb_erase(&chain->node, root);
	free_stable_node(chain);
	ksm_stable_node_chains--;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	/* check it's not STABLE_NODE_CHAIN or negative */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	/*
	 * The STABLE_NODE_DUP_HEAD magic value must lie strictly inside the
	 * migrate_nodes object, so it can never collide with &migrate_nodes
	 * itself: check that at build time where the compiler allows it.
	 */
#if defined(GCC_VERSION) && GCC_VERSION >= 40903
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}
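
/*
 * get_ksm_page: checks if the page indicated by the stable node is still its
 * ksm page, despite having held no reference to it.  If so, the page content
 * can be trusted and the page is returned with a reference (and locked, if
 * @lock_it); if the page has been zapped, the stale node is removed from the
 * stable tree and NULL is returned.
 *
 * The stable_node deliberately holds no reference of its own: raising the
 * page count would force swap-out to wait for ksmd before the page could be
 * freed, so the page is instead revalidated here each time before use,
 * which also has to cope with the page being migrated concurrently.
 */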
static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced and should be freed;
	 * however, it might also mean the page is frozen for migration,
	 * in which case it is still our page and the node must be kept.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * PageSwapCache is cleared early when a page is really being
		 * freed, so !PageSwapCache means the node is stale; a frozen
		 * swapcache page may instead be mid-migration, so keep
		 * spinning until the refcount becomes visible again.
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (lock_it) {
		lock_page(page);
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * The node looked stale, but it might just have been migrated.
	 * The smp_rmb() pairs with the smp_wmb() in ksm_migrate_page():
	 * re-check kpfn and retry from the top if it changed under us.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}
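
/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */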
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, true);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT at the
		 * start of this scan; but be careful when an mm is exiting:
		 * an rmap_item inserted during a previous scan (age 1) must
		 * not be erased from the current, rebuilt tree.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}
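
/*
 * Unmerge all KSM pages in the range by breaking COW on each mergeable
 * address in turn.  Only the madvise(MADV_UNMERGEABLE) and sysfs "run 2"
 * paths come through here, never ksmd itself, so the slow pte-by-pte walk
 * is acceptable.
 */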
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
	return PageKsm(page) ? page_rmapping(page) : NULL;
}

static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, true);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * This should not happen: but if it does, just refuse to let
		 * merge_across_nodes be switched - there is no need to panic.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The page is no longer mapped anywhere, so it is safe to
		 * strip its KSM status and drop the stable node right away.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

static int remove_stable_node_chain(struct stable_node *stable_node,
				    struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
		up_read(&mm->mmap_sem);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
	};
	int swapped;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmun_start = pvmw.address;
	mmun_end = pvmw.address + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
						mm_tlb_flush_pending(mm)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * get_user_pages_fast() takes no lock, so the mapcount vs
		 * page_count check below would be racy against O_DIRECT.
		 * Clearing the pte and flushing the TLB first ensures no new
		 * reference can appear after (or during) the check.
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on
		 * the page.
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);

		if (pte_protnone(entry))
			entry = pte_mkclean(pte_clear_savedwrite(entry));
		else
			entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	*orig_pte = *pvmw.pte;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}
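
/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */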
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmun_start = addr;
	mmun_end = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, false);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
					       vma->vm_page_prot));
		/*
		 * We're replacing an anonymous page with a zero page, which
		 * is not anonymous.  We need to do proper accounting,
		 * otherwise we will get wrong values in /proc and a BUG
		 * message in dmesg when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	/*
	 * No need to notify as we are replacing a read only page with
	 * another read only page with the same content.
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}
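
/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */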
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we prefer to
	 * continue scanning and merging different pages, then come back
	 * to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need to
	 * be write-protected.  If it's mapped elsewhere, all of its ptes
	 * are necessarily already write-protected.  But in either case,
	 * we need to lock and check that page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: mark the new KSM page dirty so it is not
			 * dropped prematurely.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}
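
/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */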
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_sem */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}
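
/*
 * try_to_merge_two_pages - take two identical pages and prepare
 * them to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */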
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

static __always_inline
bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
{
	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
	/*
	 * Check that at least one mapping still exists, otherwise there is
	 * not much point in merging and sharing with this stable_node, as
	 * the underlying tree_page of the other sharer is about to be freed.
	 */
	return stable_node->rmap_hlist_len &&
		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

static __always_inline
bool is_page_sharing_candidate(struct stable_node *stable_node)
{
	return __is_page_sharing_candidate(stable_node, 0);
}
1357
1358static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1359 struct stable_node **_stable_node,
1360 struct rb_root *root,
1361 bool prune_stale_stable_nodes)
1362{
1363 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1364 struct hlist_node *hlist_safe;
1365 struct page *_tree_page, *tree_page = NULL;
1366 int nr = 0;
1367 int found_rmap_hlist_len;
1368
1369 if (!prune_stale_stable_nodes ||
1370 time_before(jiffies, stable_node->chain_prune_time +
1371 msecs_to_jiffies(
1372 ksm_stable_node_chains_prune_millisecs)))
1373 prune_stale_stable_nodes = false;
1374 else
1375 stable_node->chain_prune_time = jiffies;
1376
1377 hlist_for_each_entry_safe(dup, hlist_safe,
1378 &stable_node->hlist, hlist_dup) {
1379 cond_resched();
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390 _tree_page = get_ksm_page(dup, false);
1391 if (!_tree_page)
1392 continue;
1393 nr += 1;
1394 if (is_page_sharing_candidate(dup)) {
1395 if (!found ||
1396 dup->rmap_hlist_len > found_rmap_hlist_len) {
1397 if (found)
1398 put_page(tree_page);
1399 found = dup;
1400 found_rmap_hlist_len = found->rmap_hlist_len;
1401 tree_page = _tree_page;
1402
1403
1404 if (!prune_stale_stable_nodes)
1405 break;
1406 continue;
1407 }
1408 }
1409 put_page(_tree_page);
1410 }
1411
1412 if (found) {
1413
1414
1415
1416
1417
1418
1419 if (prune_stale_stable_nodes && nr == 1) {
1420
1421
1422
1423
1424
1425
1426 BUG_ON(stable_node->hlist.first->next);
1427
1428
1429
1430
1431
1432 rb_replace_node(&stable_node->node, &found->node,
1433 root);
1434 free_stable_node(stable_node);
1435 ksm_stable_node_chains--;
1436 ksm_stable_node_dups--;
1437
1438
1439
1440
1441
1442 *_stable_node = found;
1443
1444
1445
1446
1447
1448
1449 stable_node = NULL;
1450 } else if (stable_node->hlist.first != &found->hlist_dup &&
1451 __is_page_sharing_candidate(found, 1)) {
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467 hlist_del(&found->hlist_dup);
1468 hlist_add_head(&found->hlist_dup,
1469 &stable_node->hlist);
1470 }
1471 }
1472
1473 *_stable_node_dup = found;
1474 return tree_page;
1475}
1476
1477static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
1478 struct rb_root *root)
1479{
1480 if (!is_stable_node_chain(stable_node))
1481 return stable_node;
1482 if (hlist_empty(&stable_node->hlist)) {
1483 free_stable_node_chain(stable_node, root);
1484 return NULL;
1485 }
1486 return hlist_entry(stable_node->hlist.first,
1487 typeof(*stable_node), hlist_dup);
1488}
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
1505 struct stable_node **_stable_node,
1506 struct rb_root *root,
1507 bool prune_stale_stable_nodes)
1508{
1509 struct stable_node *stable_node = *_stable_node;
1510 if (!is_stable_node_chain(stable_node)) {
1511 if (is_page_sharing_candidate(stable_node)) {
1512 *_stable_node_dup = stable_node;
1513 return get_ksm_page(stable_node, false);
1514 }
1515
1516
1517
1518
1519 *_stable_node_dup = NULL;
1520 return NULL;
1521 }
1522 return stable_node_dup(_stable_node_dup, _stable_node, root,
1523 prune_stale_stable_nodes);
1524}
1525
1526static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
1527 struct stable_node **s_n,
1528 struct rb_root *root)
1529{
1530 return __stable_node_chain(s_n_d, s_n, root, true);
1531}
1532
1533static __always_inline struct page *chain(struct stable_node **s_n_d,
1534 struct stable_node *s_n,
1535 struct rb_root *root)
1536{
1537 struct stable_node *old_stable_node = s_n;
1538 struct page *tree_page;
1539
1540 tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
1541
1542 VM_BUG_ON(s_n != old_stable_node);
1543 return tree_page;
1544}
1545
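
/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */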
1555static struct page *stable_tree_search(struct page *page)
1556{
1557 int nid;
1558 struct rb_root *root;
1559 struct rb_node **new;
1560 struct rb_node *parent;
1561 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1562 struct stable_node *page_node;
1563
1564 page_node = page_stable_node(page);
1565 if (page_node && page_node->head != &migrate_nodes) {
1566
1567 get_page(page);
1568 return page;
1569 }
1570
1571 nid = get_kpfn_nid(page_to_pfn(page));
1572 root = root_stable_tree + nid;
1573again:
1574 new = &root->rb_node;
1575 parent = NULL;
1576
1577 while (*new) {
1578 struct page *tree_page;
1579 int ret;
1580
1581 cond_resched();
1582 stable_node = rb_entry(*new, struct stable_node, node);
1583 stable_node_any = NULL;
1584 tree_page = chain_prune(&stable_node_dup, &stable_node, root);
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597 if (!stable_node_dup) {
1598
1599
1600
1601
1602
1603 stable_node_any = stable_node_dup_any(stable_node,
1604 root);
1605 if (!stable_node_any) {
1606
1607 goto again;
1608 }
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618 tree_page = get_ksm_page(stable_node_any, false);
1619 }
1620 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1621 if (!tree_page) {
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631 goto again;
1632 }
1633
1634 ret = memcmp_pages(page, tree_page);
1635 put_page(tree_page);
1636
1637 parent = *new;
1638 if (ret < 0)
1639 new = &parent->rb_left;
1640 else if (ret > 0)
1641 new = &parent->rb_right;
1642 else {
1643 if (page_node) {
1644 VM_BUG_ON(page_node->head != &migrate_nodes);
1645
1646
1647
1648
1649
1650
1651 if (page_mapcount(page) > 1)
1652 goto chain_append;
1653 }
1654
1655 if (!stable_node_dup) {
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668 return NULL;
1669 }
1670
1671
1672
1673
1674
1675
1676
1677
1678 tree_page = get_ksm_page(stable_node_dup, true);
1679 if (unlikely(!tree_page))
1680
1681
1682
1683
1684 goto again;
1685 unlock_page(tree_page);
1686
1687 if (get_kpfn_nid(stable_node_dup->kpfn) !=
1688 NUMA(stable_node_dup->nid)) {
1689 put_page(tree_page);
1690 goto replace;
1691 }
1692 return tree_page;
1693 }
1694 }
1695
1696 if (!page_node)
1697 return NULL;
1698
1699 list_del(&page_node->list);
1700 DO_NUMA(page_node->nid = nid);
1701 rb_link_node(&page_node->node, parent, new);
1702 rb_insert_color(&page_node->node, root);
1703out:
1704 if (is_page_sharing_candidate(page_node)) {
1705 get_page(page);
1706 return page;
1707 } else
1708 return NULL;
1709
1710replace:
1711
1712
1713
1714
1715
1716
1717
1718
1719 if (stable_node_dup == stable_node) {
1720 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1721 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1722
1723 if (page_node) {
1724 VM_BUG_ON(page_node->head != &migrate_nodes);
1725 list_del(&page_node->list);
1726 DO_NUMA(page_node->nid = nid);
1727 rb_replace_node(&stable_node_dup->node,
1728 &page_node->node,
1729 root);
1730 if (is_page_sharing_candidate(page_node))
1731 get_page(page);
1732 else
1733 page = NULL;
1734 } else {
1735 rb_erase(&stable_node_dup->node, root);
1736 page = NULL;
1737 }
1738 } else {
1739 VM_BUG_ON(!is_stable_node_chain(stable_node));
1740 __stable_node_dup_del(stable_node_dup);
1741 if (page_node) {
1742 VM_BUG_ON(page_node->head != &migrate_nodes);
1743 list_del(&page_node->list);
1744 DO_NUMA(page_node->nid = nid);
1745 stable_node_chain_add_dup(page_node, stable_node);
1746 if (is_page_sharing_candidate(page_node))
1747 get_page(page);
1748 else
1749 page = NULL;
1750 } else {
1751 page = NULL;
1752 }
1753 }
1754 stable_node_dup->head = &migrate_nodes;
1755 list_add(&stable_node_dup->list, stable_node_dup->head);
1756 return page;
1757
1758chain_append:
1759
1760 if (!stable_node_dup)
1761 stable_node_dup = stable_node_any;
1762
1763
1764
1765
1766
1767
1768
1769
1770 if (stable_node_dup == stable_node) {
1771 VM_BUG_ON(is_stable_node_chain(stable_node_dup));
1772 VM_BUG_ON(is_stable_node_dup(stable_node_dup));
1773
1774 stable_node = alloc_stable_node_chain(stable_node_dup,
1775 root);
1776 if (!stable_node)
1777 return NULL;
1778 }
1779
1780
1781
1782
1783
1784
1785 VM_BUG_ON(!is_stable_node_chain(stable_node));
1786 VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
1787 VM_BUG_ON(page_node->head != &migrate_nodes);
1788 list_del(&page_node->list);
1789 DO_NUMA(page_node->nid = nid);
1790 stable_node_chain_add_dup(page_node, stable_node);
1791 goto out;
1792}
1793
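
/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */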
1801static struct stable_node *stable_tree_insert(struct page *kpage)
1802{
1803 int nid;
1804 unsigned long kpfn;
1805 struct rb_root *root;
1806 struct rb_node **new;
1807 struct rb_node *parent;
1808 struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
1809 bool need_chain = false;
1810
1811 kpfn = page_to_pfn(kpage);
1812 nid = get_kpfn_nid(kpfn);
1813 root = root_stable_tree + nid;
1814again:
1815 parent = NULL;
1816 new = &root->rb_node;
1817
1818 while (*new) {
1819 struct page *tree_page;
1820 int ret;
1821
1822 cond_resched();
1823 stable_node = rb_entry(*new, struct stable_node, node);
1824 stable_node_any = NULL;
1825 tree_page = chain(&stable_node_dup, stable_node, root);
1826 if (!stable_node_dup) {
1827
1828
1829
1830
1831
1832 stable_node_any = stable_node_dup_any(stable_node,
1833 root);
1834 if (!stable_node_any) {
1835
1836 goto again;
1837 }
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847 tree_page = get_ksm_page(stable_node_any, false);
1848 }
1849 VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
1850 if (!tree_page) {
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860 goto again;
1861 }
1862
1863 ret = memcmp_pages(kpage, tree_page);
1864 put_page(tree_page);
1865
1866 parent = *new;
1867 if (ret < 0)
1868 new = &parent->rb_left;
1869 else if (ret > 0)
1870 new = &parent->rb_right;
1871 else {
1872 need_chain = true;
1873 break;
1874 }
1875 }
1876
1877 stable_node_dup = alloc_stable_node();
1878 if (!stable_node_dup)
1879 return NULL;
1880
1881 INIT_HLIST_HEAD(&stable_node_dup->hlist);
1882 stable_node_dup->kpfn = kpfn;
1883 set_page_stable_node(kpage, stable_node_dup);
1884 stable_node_dup->rmap_hlist_len = 0;
1885 DO_NUMA(stable_node_dup->nid = nid);
1886 if (!need_chain) {
1887 rb_link_node(&stable_node_dup->node, parent, new);
1888 rb_insert_color(&stable_node_dup->node, root);
1889 } else {
1890 if (!is_stable_node_chain(stable_node)) {
1891 struct stable_node *orig = stable_node;
1892
1893 stable_node = alloc_stable_node_chain(orig, root);
1894 if (!stable_node) {
1895 free_stable_node(stable_node_dup);
1896 return NULL;
1897 }
1898 }
1899 stable_node_chain_add_dup(stable_node_dup, stable_node);
1900 }
1901
1902 return stable_node_dup;
1903}
1904
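
/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */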
1919static
1920struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1921 struct page *page,
1922 struct page **tree_pagep)
1923{
1924 struct rb_node **new;
1925 struct rb_root *root;
1926 struct rb_node *parent = NULL;
1927 int nid;
1928
1929 nid = get_kpfn_nid(page_to_pfn(page));
1930 root = root_unstable_tree + nid;
1931 new = &root->rb_node;
1932
1933 while (*new) {
1934 struct rmap_item *tree_rmap_item;
1935 struct page *tree_page;
1936 int ret;
1937
1938 cond_resched();
1939 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1940 tree_page = get_mergeable_page(tree_rmap_item);
1941 if (!tree_page)
1942 return NULL;
1943
1944
1945
1946
1947 if (page == tree_page) {
1948 put_page(tree_page);
1949 return NULL;
1950 }
1951
1952 ret = memcmp_pages(page, tree_page);
1953
1954 parent = *new;
1955 if (ret < 0) {
1956 put_page(tree_page);
1957 new = &parent->rb_left;
1958 } else if (ret > 0) {
1959 put_page(tree_page);
1960 new = &parent->rb_right;
1961 } else if (!ksm_merge_across_nodes &&
1962 page_to_nid(tree_page) != nid) {
1963
1964
1965
1966
1967
1968 put_page(tree_page);
1969 return NULL;
1970 } else {
1971 *tree_pagep = tree_page;
1972 return tree_rmap_item;
1973 }
1974 }
1975
1976 rmap_item->address |= UNSTABLE_FLAG;
1977 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1978 DO_NUMA(rmap_item->nid = nid);
1979 rb_link_node(&rmap_item->node, parent, new);
1980 rb_insert_color(&rmap_item->node, root);
1981
1982 ksm_pages_unshared++;
1983 return NULL;
1984}
1985
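
/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */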
1991static void stable_tree_append(struct rmap_item *rmap_item,
1992 struct stable_node *stable_node,
1993 bool max_page_sharing_bypass)
1994{
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005 BUG_ON(stable_node->rmap_hlist_len < 0);
2006
2007 stable_node->rmap_hlist_len++;
2008 if (!max_page_sharing_bypass)
2009
2010 WARN_ON_ONCE(stable_node->rmap_hlist_len >
2011 ksm_max_page_sharing);
2012
2013 rmap_item->head = stable_node;
2014 rmap_item->address |= STABLE_FLAG;
2015 hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
2016
2017 if (rmap_item->hlist.next)
2018 ksm_pages_sharing++;
2019 else
2020 ksm_pages_shared++;
2021}
2022
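
/*
 * cmp_and_merge_page - first see if page can be merged into the stable
 * tree; if not, compare checksum to previous and if it's the same, see if
 * page can be inserted into the unstable tree, or merged with a page
 * already there and both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */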
2032static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2033{
2034 struct mm_struct *mm = rmap_item->mm;
2035 struct rmap_item *tree_rmap_item;
2036 struct page *tree_page = NULL;
2037 struct stable_node *stable_node;
2038 struct page *kpage;
2039 unsigned int checksum;
2040 int err;
2041 bool max_page_sharing_bypass = false;
2042
2043 stable_node = page_stable_node(page);
2044 if (stable_node) {
2045 if (stable_node->head != &migrate_nodes &&
2046 get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
2047 NUMA(stable_node->nid)) {
2048 stable_node_dup_del(stable_node);
2049 stable_node->head = &migrate_nodes;
2050 list_add(&stable_node->list, stable_node->head);
2051 }
2052 if (stable_node->head != &migrate_nodes &&
2053 rmap_item->head == stable_node)
2054 return;
2055
2056
2057
2058
2059 if (!is_page_sharing_candidate(stable_node))
2060 max_page_sharing_bypass = true;
2061 }
2062
2063
2064 kpage = stable_tree_search(page);
2065 if (kpage == page && rmap_item->head == stable_node) {
2066 put_page(kpage);
2067 return;
2068 }
2069
2070 remove_rmap_item_from_tree(rmap_item);
2071
2072 if (kpage) {
2073 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
2074 if (!err) {
2075
2076
2077
2078
2079 lock_page(kpage);
2080 stable_tree_append(rmap_item, page_stable_node(kpage),
2081 max_page_sharing_bypass);
2082 unlock_page(kpage);
2083 }
2084 put_page(kpage);
2085 return;
2086 }
2087
2088
2089
2090
2091
2092
2093
2094 checksum = calc_checksum(page);
2095 if (rmap_item->oldchecksum != checksum) {
2096 rmap_item->oldchecksum = checksum;
2097 return;
2098 }
2099
2100
2101
2102
2103
	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
		struct vm_area_struct *vma;

		down_read(&mm->mmap_sem);
		vma = find_mergeable_vma(mm, rmap_item->address);
		if (vma) {
			err = try_to_merge_one_page(vma, page,
					ZERO_PAGE(rmap_item->address));
		} else {
			/*
			 * If the vma is out of date, we do not need to
			 * continue.  (This also avoids passing a NULL vma
			 * to try_to_merge_one_page.)
			 */
			err = 0;
		}
		up_read(&mm->mmap_sem);
		/*
		 * In case of failure, the page was not really empty, so we
		 * need to continue.  Otherwise we're done.
		 */
		if (!err)
			return;
	}
2119 tree_rmap_item =
2120 unstable_tree_search_insert(rmap_item, page, &tree_page);
2121 if (tree_rmap_item) {
2122 bool split;
2123
2124 kpage = try_to_merge_two_pages(rmap_item, page,
2125 tree_rmap_item, tree_page);
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136 split = PageTransCompound(page)
2137 && compound_head(page) == compound_head(tree_page);
2138 put_page(tree_page);
2139 if (kpage) {
2140
2141
2142
2143
2144 lock_page(kpage);
2145 stable_node = stable_tree_insert(kpage);
2146 if (stable_node) {
2147 stable_tree_append(tree_rmap_item, stable_node,
2148 false);
2149 stable_tree_append(rmap_item, stable_node,
2150 false);
2151 }
2152 unlock_page(kpage);
2153
2154
2155
2156
2157
2158
2159
2160 if (!stable_node) {
2161 break_cow(tree_rmap_item);
2162 break_cow(rmap_item);
2163 }
2164 } else if (split) {
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174 if (!trylock_page(page))
2175 return;
2176 split_huge_page(page);
2177 unlock_page(page);
2178 }
2179 }
2180}
2181
2182static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
2183 struct rmap_item **rmap_list,
2184 unsigned long addr)
2185{
2186 struct rmap_item *rmap_item;
2187
2188 while (*rmap_list) {
2189 rmap_item = *rmap_list;
2190 if ((rmap_item->address & PAGE_MASK) == addr)
2191 return rmap_item;
2192 if (rmap_item->address > addr)
2193 break;
2194 *rmap_list = rmap_item->rmap_list;
2195 remove_rmap_item_from_tree(rmap_item);
2196 free_rmap_item(rmap_item);
2197 }
2198
2199 rmap_item = alloc_rmap_item();
2200 if (rmap_item) {
2201
2202 rmap_item->mm = mm_slot->mm;
2203 rmap_item->address = addr;
2204 rmap_item->rmap_list = *rmap_list;
2205 *rmap_list = rmap_item;
2206 }
2207 return rmap_item;
2208}
2209
2210static struct rmap_item *scan_get_next_rmap_item(struct page **page)
2211{
2212 struct mm_struct *mm;
2213 struct mm_slot *slot;
2214 struct vm_area_struct *vma;
2215 struct rmap_item *rmap_item;
2216 int nid;
2217
2218 if (list_empty(&ksm_mm_head.mm_list))
2219 return NULL;
2220
2221 slot = ksm_scan.mm_slot;
2222 if (slot == &ksm_mm_head) {
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233 lru_add_drain_all();
2234
2235
2236
2237
2238
2239
2240
2241 if (!ksm_merge_across_nodes) {
2242 struct stable_node *stable_node, *next;
2243 struct page *page;
2244
2245 list_for_each_entry_safe(stable_node, next,
2246 &migrate_nodes, list) {
2247 page = get_ksm_page(stable_node, false);
2248 if (page)
2249 put_page(page);
2250 cond_resched();
2251 }
2252 }
2253
2254 for (nid = 0; nid < ksm_nr_node_ids; nid++)
2255 root_unstable_tree[nid] = RB_ROOT;
2256
2257 spin_lock(&ksm_mmlist_lock);
2258 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
2259 ksm_scan.mm_slot = slot;
2260 spin_unlock(&ksm_mmlist_lock);
2261
2262
2263
2264
2265 if (slot == &ksm_mm_head)
2266 return NULL;
2267next_mm:
2268 ksm_scan.address = 0;
2269 ksm_scan.rmap_list = &slot->rmap_list;
2270 }
2271
2272 mm = slot->mm;
2273 down_read(&mm->mmap_sem);
2274 if (ksm_test_exit(mm))
2275 vma = NULL;
2276 else
2277 vma = find_vma(mm, ksm_scan.address);
2278
2279 for (; vma; vma = vma->vm_next) {
2280 if (!(vma->vm_flags & VM_MERGEABLE))
2281 continue;
2282 if (ksm_scan.address < vma->vm_start)
2283 ksm_scan.address = vma->vm_start;
2284 if (!vma->anon_vma)
2285 ksm_scan.address = vma->vm_end;
2286
2287 while (ksm_scan.address < vma->vm_end) {
2288 if (ksm_test_exit(mm))
2289 break;
2290 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
2291 if (IS_ERR_OR_NULL(*page)) {
2292 ksm_scan.address += PAGE_SIZE;
2293 cond_resched();
2294 continue;
2295 }
2296 if (PageAnon(*page)) {
2297 flush_anon_page(vma, *page, ksm_scan.address);
2298 flush_dcache_page(*page);
2299 rmap_item = get_next_rmap_item(slot,
2300 ksm_scan.rmap_list, ksm_scan.address);
2301 if (rmap_item) {
2302 ksm_scan.rmap_list =
2303 &rmap_item->rmap_list;
2304 ksm_scan.address += PAGE_SIZE;
2305 } else
2306 put_page(*page);
2307 up_read(&mm->mmap_sem);
2308 return rmap_item;
2309 }
2310 put_page(*page);
2311 ksm_scan.address += PAGE_SIZE;
2312 cond_resched();
2313 }
2314 }
2315
2316 if (ksm_test_exit(mm)) {
2317 ksm_scan.address = 0;
2318 ksm_scan.rmap_list = &slot->rmap_list;
2319 }
2320
2321
2322
2323
2324 remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
2325
2326 spin_lock(&ksm_mmlist_lock);
2327 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
2328 struct mm_slot, mm_list);
2329 if (ksm_scan.address == 0) {
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339 hash_del(&slot->link);
2340 list_del(&slot->mm_list);
2341 spin_unlock(&ksm_mmlist_lock);
2342
2343 free_mm_slot(slot);
2344 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2345 up_read(&mm->mmap_sem);
2346 mmdrop(mm);
2347 } else {
2348 up_read(&mm->mmap_sem);
2349
2350
2351
2352
2353
2354
2355
2356 spin_unlock(&ksm_mmlist_lock);
2357 }
2358
2359
2360 slot = ksm_scan.mm_slot;
2361 if (slot != &ksm_mm_head)
2362 goto next_mm;
2363
2364 ksm_scan.seqnr++;
2365 return NULL;
2366}
2367
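
/*
 * ksm_do_scan - the ksm scanner main worker function.
 * @scan_npages: number of pages we want to scan before we return.
 */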
2372static void ksm_do_scan(unsigned int scan_npages)
2373{
2374 struct rmap_item *rmap_item;
2375 struct page *uninitialized_var(page);
2376
2377 while (scan_npages-- && likely(!freezing(current))) {
2378 cond_resched();
2379 rmap_item = scan_get_next_rmap_item(&page);
2380 if (!rmap_item)
2381 return;
2382 cmp_and_merge_page(page, rmap_item);
2383 put_page(page);
2384 }
2385}
2386
2387static int ksmd_should_run(void)
2388{
2389 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
2390}
2391
2392static int ksm_scan_thread(void *nothing)
2393{
2394 set_freezable();
2395 set_user_nice(current, 5);
2396
2397 while (!kthread_should_stop()) {
2398 mutex_lock(&ksm_thread_mutex);
2399 wait_while_offlining();
2400 if (ksmd_should_run())
2401 ksm_do_scan(ksm_thread_pages_to_scan);
2402 mutex_unlock(&ksm_thread_mutex);
2403
2404 try_to_freeze();
2405
2406 if (ksmd_should_run()) {
2407 schedule_timeout_interruptible(
2408 msecs_to_jiffies(ksm_thread_sleep_millisecs));
2409 } else {
2410 wait_event_freezable(ksm_thread_wait,
2411 ksmd_should_run() || kthread_should_stop());
2412 }
2413 }
2414 return 0;
2415}
2416
2417int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
2418 unsigned long end, int advice, unsigned long *vm_flags)
2419{
2420 struct mm_struct *mm = vma->vm_mm;
2421 int err;
2422
2423 switch (advice) {
2424 case MADV_MERGEABLE:
2425
2426
2427
2428 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
2429 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
2430 VM_HUGETLB | VM_MIXEDMAP))
2431 return 0;
2432
2433 if (vma_is_dax(vma))
2434 return 0;
2435
2436#ifdef VM_SAO
2437 if (*vm_flags & VM_SAO)
2438 return 0;
2439#endif
2440#ifdef VM_SPARC_ADI
2441 if (*vm_flags & VM_SPARC_ADI)
2442 return 0;
2443#endif
2444
2445 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
2446 err = __ksm_enter(mm);
2447 if (err)
2448 return err;
2449 }
2450
2451 *vm_flags |= VM_MERGEABLE;
2452 break;
2453
2454 case MADV_UNMERGEABLE:
2455 if (!(*vm_flags & VM_MERGEABLE))
2456 return 0;
2457
2458 if (vma->anon_vma) {
2459 err = unmerge_ksm_pages(vma, start, end);
2460 if (err)
2461 return err;
2462 }
2463
2464 *vm_flags &= ~VM_MERGEABLE;
2465 break;
2466 }
2467
2468 return 0;
2469}
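
/*
 * Userspace reaches ksm_madvise() through the madvise() system call.  A
 * minimal illustrative snippet (hypothetical, not part of the kernel) that
 * opts an anonymous mapping into KSM and later backs out again:
 *
 *	#include <sys/mman.h>
 *
 *	size_t len = 16 * 4096;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *	madvise(buf, len, MADV_MERGEABLE);	// sets VM_MERGEABLE above
 *	// ... later, to undo the merging:
 *	madvise(buf, len, MADV_UNMERGEABLE);	// unmerges via break_ksm()
 *
 * Actual merging only happens while ksmd is running (run=1 in sysfs) and
 * after the mm has been registered with __ksm_enter().
 */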
2470
2471int __ksm_enter(struct mm_struct *mm)
2472{
2473 struct mm_slot *mm_slot;
2474 int needs_wakeup;
2475
2476 mm_slot = alloc_mm_slot();
2477 if (!mm_slot)
2478 return -ENOMEM;
2479
2480
2481 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
2482
2483 spin_lock(&ksm_mmlist_lock);
2484 insert_to_mm_slots_hash(mm, mm_slot);
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495 if (ksm_run & KSM_RUN_UNMERGE)
2496 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
2497 else
2498 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
2499 spin_unlock(&ksm_mmlist_lock);
2500
2501 set_bit(MMF_VM_MERGEABLE, &mm->flags);
2502 mmgrab(mm);
2503
2504 if (needs_wakeup)
2505 wake_up_interruptible(&ksm_thread_wait);
2506
2507 return 0;
2508}
2509
2510void __ksm_exit(struct mm_struct *mm)
2511{
2512 struct mm_slot *mm_slot;
2513 int easy_to_free = 0;
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524 spin_lock(&ksm_mmlist_lock);
2525 mm_slot = get_mm_slot(mm);
2526 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
2527 if (!mm_slot->rmap_list) {
2528 hash_del(&mm_slot->link);
2529 list_del(&mm_slot->mm_list);
2530 easy_to_free = 1;
2531 } else {
2532 list_move(&mm_slot->mm_list,
2533 &ksm_scan.mm_slot->mm_list);
2534 }
2535 }
2536 spin_unlock(&ksm_mmlist_lock);
2537
2538 if (easy_to_free) {
2539 free_mm_slot(mm_slot);
2540 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
2541 mmdrop(mm);
2542 } else if (mm_slot) {
2543 down_write(&mm->mmap_sem);
2544 up_write(&mm->mmap_sem);
2545 }
2546}
2547
2548struct page *ksm_might_need_to_copy(struct page *page,
2549 struct vm_area_struct *vma, unsigned long address)
2550{
2551 struct anon_vma *anon_vma = page_anon_vma(page);
2552 struct page *new_page;
2553
2554 if (PageKsm(page)) {
2555 if (page_stable_node(page) &&
2556 !(ksm_run & KSM_RUN_UNMERGE))
2557 return page;
2558 } else if (!anon_vma) {
2559 return page;
2560 } else if (anon_vma->root == vma->anon_vma->root &&
2561 page->index == linear_page_index(vma, address)) {
2562 return page;
2563 }
2564 if (!PageUptodate(page))
2565 return page;
2566
2567 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2568 if (new_page) {
2569 copy_user_highpage(new_page, page, address, vma);
2570
2571 SetPageDirty(new_page);
2572 __SetPageUptodate(new_page);
2573 __SetPageLocked(new_page);
2574 }
2575
2576 return new_page;
2577}
2578
2579void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
2580{
2581 struct stable_node *stable_node;
2582 struct rmap_item *rmap_item;
2583 int search_new_forks = 0;
2584
2585 VM_BUG_ON_PAGE(!PageKsm(page), page);
2586
2587
2588
2589
2590
2591 VM_BUG_ON_PAGE(!PageLocked(page), page);
2592
2593 stable_node = page_stable_node(page);
2594 if (!stable_node)
2595 return;
2596again:
2597 hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
2598 struct anon_vma *anon_vma = rmap_item->anon_vma;
2599 struct anon_vma_chain *vmac;
2600 struct vm_area_struct *vma;
2601
2602 cond_resched();
2603 anon_vma_lock_read(anon_vma);
2604 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
2605 0, ULONG_MAX) {
2606 unsigned long addr;
2607
2608 cond_resched();
2609 vma = vmac->vma;
2610
2611
2612 addr = rmap_item->address & ~KSM_FLAG_MASK;
2613
2614 if (addr < vma->vm_start || addr >= vma->vm_end)
2615 continue;
2616
2617
2618
2619
2620
2621
2622 if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
2623 continue;
2624
2625 if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2626 continue;
2627
2628 if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
2629 anon_vma_unlock_read(anon_vma);
2630 return;
2631 }
2632 if (rwc->done && rwc->done(page)) {
2633 anon_vma_unlock_read(anon_vma);
2634 return;
2635 }
2636 }
2637 anon_vma_unlock_read(anon_vma);
2638 }
2639 if (!search_new_forks++)
2640 goto again;
2641}
2642
2643#ifdef CONFIG_MIGRATION
2644void ksm_migrate_page(struct page *newpage, struct page *oldpage)
2645{
2646 struct stable_node *stable_node;
2647
2648 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
2649 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
2650 VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
2651
2652 stable_node = page_stable_node(newpage);
2653 if (stable_node) {
2654 VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
2655 stable_node->kpfn = page_to_pfn(newpage);
2656
2657
2658
2659
2660
2661
2662 smp_wmb();
2663 set_page_stable_node(oldpage, NULL);
2664 }
2665}
2666#endif
2667
2668#ifdef CONFIG_MEMORY_HOTREMOVE
2669static void wait_while_offlining(void)
2670{
2671 while (ksm_run & KSM_RUN_OFFLINE) {
2672 mutex_unlock(&ksm_thread_mutex);
2673 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2674 TASK_UNINTERRUPTIBLE);
2675 mutex_lock(&ksm_thread_mutex);
2676 }
2677}
2678
2679static bool stable_node_dup_remove_range(struct stable_node *stable_node,
2680 unsigned long start_pfn,
2681 unsigned long end_pfn)
2682{
2683 if (stable_node->kpfn >= start_pfn &&
2684 stable_node->kpfn < end_pfn) {
2685
2686
2687
2688
2689 remove_node_from_stable_tree(stable_node);
2690 return true;
2691 }
2692 return false;
2693}
2694
2695static bool stable_node_chain_remove_range(struct stable_node *stable_node,
2696 unsigned long start_pfn,
2697 unsigned long end_pfn,
2698 struct rb_root *root)
2699{
2700 struct stable_node *dup;
2701 struct hlist_node *hlist_safe;
2702
2703 if (!is_stable_node_chain(stable_node)) {
2704 VM_BUG_ON(is_stable_node_dup(stable_node));
2705 return stable_node_dup_remove_range(stable_node, start_pfn,
2706 end_pfn);
2707 }
2708
2709 hlist_for_each_entry_safe(dup, hlist_safe,
2710 &stable_node->hlist, hlist_dup) {
2711 VM_BUG_ON(!is_stable_node_dup(dup));
2712 stable_node_dup_remove_range(dup, start_pfn, end_pfn);
2713 }
2714 if (hlist_empty(&stable_node->hlist)) {
2715 free_stable_node_chain(stable_node, root);
2716 return true;
2717 } else
2718 return false;
2719}
2720
2721static void ksm_check_stable_tree(unsigned long start_pfn,
2722 unsigned long end_pfn)
2723{
2724 struct stable_node *stable_node, *next;
2725 struct rb_node *node;
2726 int nid;
2727
2728 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2729 node = rb_first(root_stable_tree + nid);
2730 while (node) {
2731 stable_node = rb_entry(node, struct stable_node, node);
2732 if (stable_node_chain_remove_range(stable_node,
2733 start_pfn, end_pfn,
2734 root_stable_tree +
2735 nid))
2736 node = rb_first(root_stable_tree + nid);
2737 else
2738 node = rb_next(node);
2739 cond_resched();
2740 }
2741 }
2742 list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
2743 if (stable_node->kpfn >= start_pfn &&
2744 stable_node->kpfn < end_pfn)
2745 remove_node_from_stable_tree(stable_node);
2746 cond_resched();
2747 }
2748}
2749
2750static int ksm_memory_callback(struct notifier_block *self,
2751 unsigned long action, void *arg)
2752{
2753 struct memory_notify *mn = arg;
2754
2755 switch (action) {
2756 case MEM_GOING_OFFLINE:
2757
2758
2759
2760
2761
2762
2763
2764 mutex_lock(&ksm_thread_mutex);
2765 ksm_run |= KSM_RUN_OFFLINE;
2766 mutex_unlock(&ksm_thread_mutex);
2767 break;
2768
2769 case MEM_OFFLINE:
2770
2771
2772
2773
2774
2775
2776
2777 ksm_check_stable_tree(mn->start_pfn,
2778 mn->start_pfn + mn->nr_pages);
2779
2780
2781 case MEM_CANCEL_OFFLINE:
2782 mutex_lock(&ksm_thread_mutex);
2783 ksm_run &= ~KSM_RUN_OFFLINE;
2784 mutex_unlock(&ksm_thread_mutex);
2785
2786 smp_mb();
2787 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
2788 break;
2789 }
2790 return NOTIFY_OK;
2791}
2792#else
2793static void wait_while_offlining(void)
2794{
2795}
2796#endif
2797
2798#ifdef CONFIG_SYSFS
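
/*
 * KSM is tuned at run time through /sys/kernel/mm/ksm/: "run" starts and
 * stops ksmd (2 additionally unmerges everything), "sleep_millisecs" and
 * "pages_to_scan" control the scan rate, and the remaining attributes
 * defined below ("max_page_sharing", "merge_across_nodes", "use_zero_pages",
 * plus read-only statistics) map onto the knobs and counters declared near
 * the top of this file.
 */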
2803#define KSM_ATTR_RO(_name) \
2804 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
2805#define KSM_ATTR(_name) \
2806 static struct kobj_attribute _name##_attr = \
2807 __ATTR(_name, 0644, _name##_show, _name##_store)
2808
2809static ssize_t sleep_millisecs_show(struct kobject *kobj,
2810 struct kobj_attribute *attr, char *buf)
2811{
2812 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
2813}
2814
2815static ssize_t sleep_millisecs_store(struct kobject *kobj,
2816 struct kobj_attribute *attr,
2817 const char *buf, size_t count)
2818{
2819 unsigned long msecs;
2820 int err;
2821
2822 err = kstrtoul(buf, 10, &msecs);
2823 if (err || msecs > UINT_MAX)
2824 return -EINVAL;
2825
2826 ksm_thread_sleep_millisecs = msecs;
2827
2828 return count;
2829}
2830KSM_ATTR(sleep_millisecs);
2831
static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = kstrtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */
	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

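/*
 * Illustrative only (not part of this file): userspace opts anonymous
 * memory into KSM with madvise(2) before ksmd will consider it, then
 * starts the scanner through the "run" knob defined above:
 *
 *	addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(addr, len, MADV_MERGEABLE);
 *
 *	# echo 1 > /sys/kernel/mm/ksm/run
 */
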
#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time we switch away from the
			 * default of merging across nodes: allocate a buffer
			 * big enough to hold one stable and one unstable tree
			 * root per node, together in a single allocation.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* assumes RB_ROOT is an all-zeroes structure */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif /* CONFIG_NUMA */

static ssize_t use_zero_pages_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_use_zero_pages);
}

static ssize_t use_zero_pages_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return -EINVAL;

	ksm_use_zero_pages = value;

	return count;
}
KSM_ATTR(use_zero_pages);

static ssize_t max_page_sharing_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_max_page_sharing);
}

static ssize_t max_page_sharing_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	int knob;

	err = kstrtoint(buf, 10, &knob);
	if (err)
		return err;
	/*
	 * When a KSM page is created it is shared by 2 mappings. This
	 * being a signed comparison, it also rejects negative values.
	 */
	if (knob < 2)
		return -EINVAL;

	if (READ_ONCE(ksm_max_page_sharing) == knob)
		return count;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_max_page_sharing != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else
			ksm_max_page_sharing = knob;
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(max_page_sharing);

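/*
 * Worked example (illustrative): with max_page_sharing set to 256, one
 * stable_node_dup accepts at most 256 rmap_items; once full, the next
 * identical page is kept as a separate KSM page and linked as another
 * dup on the same stable_node chain, keeping rmap walks bounded at the
 * cost of holding more than one physical copy of that content.
 */
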
static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative: conceal that.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t stable_node_dups_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);

static ssize_t stable_node_chains_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);

static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
					struct kobj_attribute *attr,
					char *buf)
{
	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}

static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_stable_node_chains_prune_millisecs = msecs;

	return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	NULL,
};

static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */
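
/*
 * Illustrative only: typical tuning of the knobs registered above from
 * userspace (paths follow from mm_kobj plus the "ksm" group name; values
 * are examples, not recommendations):
 *
 *	# echo 20   > /sys/kernel/mm/ksm/sleep_millisecs
 *	# echo 1000 > /sys/kernel/mm/ksm/pages_to_scan
 *	# cat /sys/kernel/mm/ksm/pages_sharing
 */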

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* the checksum of the zero page depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* keep merging with the zero page disabled by default */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;
	/* without sysfs there is no way for the user to start ksmd, so run it */
#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* register for memory hot-remove notifications */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);