// SPDX-License-Identifier: GPL-2.0-only
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif
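
/*
 * Illustration (not part of the original file): NUMA() and DO_NUMA() let
 * NUMA-only fields compile away.  On a CONFIG_NUMA=n build, a line such as
 *
 *	DO_NUMA(rmap_item->nid = nid);
 *
 * expands to the empty statement "do { } while (0)", and NUMA(rmap_item->nid)
 * becomes the constant 0, so the structures below need not even contain an
 * @nid field there.
 */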

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The stable tree node includes information required for reverse
 * mapping from a KSM page to virtual addresses that map this page.
 *
 * In order to avoid large latencies of the rmap walks on KSM pages,
 * KSM maintains two types of nodes in the stable tree:
 *
 * * the regular nodes that keep the reverse mapping structures in a
 *   linked list
 * * the "chains" that link nodes ("dups") that represent the same
 *   write protected memory content, but each "dup" corresponds to a
 *   different KSM page copy of that content
 *
 * Internally, the regular nodes, "dups" and "chains" are represented
 * using the same struct stable_node structure.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */
#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
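
/*
 * Illustration (not part of the original file): the low bits of
 * rmap_item->address carry state while the high bits keep the page-aligned
 * virtual address.  For example, inserting into the unstable tree does:
 *
 *	rmap_item->address |= UNSTABLE_FLAG;
 *	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
 *
 * and rmap walks recover the plain address with
 * "addr = rmap_item->address & ~KSM_FLAG_MASK".
 */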
				/* to mask all the flags */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
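
/*
 * Illustration (not part of the original file): STABLE_NODE_DUP_HEAD is a
 * sentinel pointer aimed *inside* migrate_nodes (at its .prev field), so it
 * can never equal &migrate_nodes itself nor any real list or rb_node
 * address.  That lets is_stable_node_dup() identify a "dup" with a single
 * cheap pointer compare:
 *
 *	dup->head == STABLE_NODE_DUP_HEAD
 *
 * The BUILD_BUG_ONs in remove_node_from_stable_tree() check this layout.
 */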

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked into the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero page */
static bool ksm_use_zero_pages __read_mostly;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
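
/*
 * Illustration (not part of the original file): for instance,
 * KSM_KMEM_CACHE(rmap_item, 0) expands to
 *
 *	kmem_cache_create("ksm_rmap_item", sizeof(struct rmap_item),
 *			  __alignof__(struct rmap_item), 0, NULL);
 *
 * so each KSM structure gets its own named slab, visible in /proc/slabinfo.
 */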

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static __always_inline bool is_stable_node_chain(struct stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

static __always_inline bool is_stable_node_dup(struct stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

static inline void stable_node_chain_add_dup(struct stable_node *dup,
					     struct stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

static inline void stable_node_dup_del(struct stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is
	 * under pressure, which may lead to hung task warnings.  Adding
	 * __GFP_HIGH grants access to memory reserves, helping a VM with
	 * MADV_MERGEABLE all in swapcache to make progress.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /proc/kcore, where we would not want to touch it.
 *
 * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.  We also do not want to enforce
 * protection keys here anyway.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	vm_fault_t ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop until we've broken COW, because handle_mm_fault()
	 * may back out if there's any difficulty e.g. if pte accessed bit
	 * gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM is unlikely here, since breaking ksm is usually done
	 * just to undo a merge made a moment before; but if the mm is in a
	 * limited mem_cgroup, even that can fail with VM_FAULT_OOM.  We may
	 * then have more kernel pages allocated than counted as nodes in
	 * the stable tree; but ksm_do_scan will retry break_cow on each
	 * pass, so should recover the page in due course.  The important
	 * thing is to not let VM_MERGEABLE be cleared while any such pages
	 * might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
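
/*
 * Illustration (not part of the original file): with merge_across_nodes set
 * to 0 on a two-node machine, a KSM page whose pfn lives on node 1 is
 * indexed as root_stable_tree[1], so pages are only merged with pages from
 * the same NUMA node; with the default merge_across_nodes = 1, everything
 * shares root_stable_tree[0].
 */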

static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
						   struct rb_root *root)
{
	struct stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined(CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = NUMA_NO_NODE; /* debug */
#endif
		ksm_stable_node_chains++;

		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree and at the same time remove the old
		 * stable node.
		 */
		rb_replace_node(&dup->node, &chain->node, root);

		/*
		 * Move the old stable node to the second dimension
		 * queued in the hlist_dup. The invariant is that all
		 * dup stable_nodes in the chain->hlist point to pages
		 * that are write protected and have the exact same
		 * content.
		 */
		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}

static inline void free_stable_node_chain(struct stable_node *chain,
					  struct rb_root *root)
{
	rb_erase(&chain->node, root);
	free_stable_node(chain);
	ksm_stable_node_chains--;
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	/* check it's not STABLE_NODE_CHAIN or negative */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	/*
	 * STABLE_NODE_DUP_HEAD must point inside migrate_nodes, so
	 * that it can never collide with &migrate_nodes itself nor
	 * with any node linked in the tree: is_stable_node_dup()
	 * relies on that to identify a dup with one pointer compare.
	 */
#if defined(GCC_VERSION) && GCC_VERSION >= 40903
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}

enum get_ksm_page_flags {
	GET_KSM_PAGE_NOLOCK,
	GET_KSM_PAGE_LOCK,
	GET_KSM_PAGE_TRYLOCK
};

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive.  So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing to the stable node.  This relies on freeing a PageAnon page to
 * reset its page->mapping to NULL, and relies on no other use of a page
 * to put something that might look like our key in page->mapping.
 */
static struct page *get_ksm_page(struct stable_node *stable_node,
				 enum get_ksm_page_flags flags)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_ref_freeze().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * the same is in reuse_ksm_page() case; but if page is swapcache
	 * in migrate_page_move_mapping(), it might still be our page,
	 * in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * while the refcount is zero.
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (flags == GET_KSM_PAGE_TRYLOCK) {
		if (!trylock_page(page)) {
			put_page(page);
			return ERR_PTR(-EBUSY);
		}
	} else if (flags == GET_KSM_PAGE_LOCK)
		lock_page(page);

	if (flags != GET_KSM_PAGE_NOLOCK) {
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
	return PageKsm(page) ? page_rmapping(page) : NULL;
}

static inline void set_page_stable_node(struct page *page,
					struct stable_node *stable_node)
{
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * This should not happen: but if it does, just refuse to let
		 * merge_across_nodes be switched - there is no need to spit
		 * out a warning every time, hence WARN_ON_ONCE.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page easily to be
		 * migrated.  But the page lock serializes against migration,
		 * and the page is mapped nowhere, so it is safe now to remove
		 * the stable node and reset page->mapping: the page will then
		 * be freed like any other anonymous page.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

static int remove_stable_node_chain(struct stable_node *stable_node,
				    struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
		up_read(&mm->mmap_sem);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = xxhash(addr, PAGE_SIZE, 0);
	kunmap_atomic(addr);
	return checksum;
}
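
/*
 * Illustration (not part of the original file): ksm_init() seeds
 * zero_checksum with calc_checksum(ZERO_PAGE(0)), so the use_zero_pages
 * path in cmp_and_merge_page() can recognise an all-zero candidate page
 * simply by comparing hashes:
 *
 *	if (ksm_use_zero_pages && (checksum == zero_checksum))
 *		... merge with the zero page instead of a KSM page ...
 */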

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = page,
		.vma = vma,
	};
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
	    (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
						mm_tlb_flush_pending(mm)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racey and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only not changing it to point to a new page.
		 *
		 * See Documentation/vm/mmu_notifier.rst
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);

		if (pte_protnone(entry))
			entry = pte_mkclean(pte_clear_savedwrite(entry));
		else
			entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	*orig_pte = *pvmw.pte;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, false);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
					       vma->vm_page_prot));
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/vm/mmu_notifier.rst
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_sem */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

static __always_inline
bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
{
	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
	/*
	 * Check that at least one mapping still exists, otherwise
	 * there's no much point to merge and share with this
	 * stable_node, as the underlying tree_page of the other
	 * sharer is going to be freed soon.
	 */
	return stable_node->rmap_hlist_len &&
		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

static __always_inline
bool is_page_sharing_candidate(struct stable_node *stable_node)
{
	return __is_page_sharing_candidate(stable_node, 0);
}

static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
				    struct stable_node **_stable_node,
				    struct rb_root *root,
				    bool prune_stale_stable_nodes)
{
	struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
	struct hlist_node *hlist_safe;
	struct page *_tree_page, *tree_page = NULL;
	int nr = 0;
	int found_rmap_hlist_len;

	if (!prune_stale_stable_nodes ||
	    time_before(jiffies, stable_node->chain_prune_time +
			msecs_to_jiffies(
				ksm_stable_node_chains_prune_millisecs)))
		prune_stale_stable_nodes = false;
	else
		stable_node->chain_prune_time = jiffies;

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		cond_resched();
		/*
		 * We must walk all stable_node_dups when pruning the
		 * stale stable_nodes during lookup.
		 *
		 * get_ksm_page can drop the nodes from the
		 * stable_node->hlist if they point to freed pages
		 * (that's why we do a _safe walk). The "dup"
		 * stable_node parameter itself will be freed from
		 * under us if it returns NULL.
		 */
		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
		if (!_tree_page)
			continue;
		nr += 1;
		if (is_page_sharing_candidate(dup)) {
			if (!found ||
			    dup->rmap_hlist_len > found_rmap_hlist_len) {
				if (found)
					put_page(tree_page);
				found = dup;
				found_rmap_hlist_len = found->rmap_hlist_len;
				tree_page = _tree_page;

				/* skip put_page for found dup */
				if (!prune_stale_stable_nodes)
					break;
				continue;
			}
		}
		put_page(_tree_page);
	}

	if (found) {
		/*
		 * nr is counting all dups in the chain only if
		 * prune_stale_stable_nodes is true, otherwise we may
		 * break the loop at nr == 1 even if there are
		 * multiple entries.
		 */
		if (prune_stale_stable_nodes && nr == 1) {
			/*
			 * If there's not just one entry it would
			 * corrupt memory, better BUG_ON. In KSM
			 * context with no lock held it's not even
			 * fatal.
			 */
			BUG_ON(stable_node->hlist.first->next);

			/*
			 * There's just one entry and it is below the
			 * deduplication limit so drop the chain.
			 */
			rb_replace_node(&stable_node->node, &found->node,
					root);
			free_stable_node(stable_node);
			ksm_stable_node_chains--;
			ksm_stable_node_dups--;
			/*
			 * NOTE: the caller depends on the stable_node
			 * to be equal to stable_node_dup if the chain
			 * was collapsed.
			 */
			*_stable_node = found;
			/*
			 * Just for robustness, as stable_node is
			 * otherwise left as a stable pointer, the
			 * compiler shall optimize it away at build
			 * time.
			 */
			stable_node = NULL;
		} else if (stable_node->hlist.first != &found->hlist_dup &&
			   __is_page_sharing_candidate(found, 1)) {
			/*
			 * If the found stable_node dup can accept one
			 * more future merge (in addition to the one
			 * that is underway) and is not at the head of
			 * the chain, put it there so next search will
			 * be quicker in the !prune_stale_stable_nodes
			 * case.
			 *
			 * NOTE: it would be inaccurate to use nr > 1
			 * instead of checking the hlist.first pointer
			 * directly, because in the
			 * prune_stale_stable_nodes case "nr" isn't
			 * the position of the found dup in the chain,
			 * but the total number of dups in the chain.
			 */
			hlist_del(&found->hlist_dup);
			hlist_add_head(&found->hlist_dup,
				       &stable_node->hlist);
		}
	}

	*_stable_node_dup = found;
	return tree_page;
}

static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
					       struct rb_root *root)
{
	if (!is_stable_node_chain(stable_node))
		return stable_node;
	if (hlist_empty(&stable_node->hlist)) {
		free_stable_node_chain(stable_node, root);
		return NULL;
	}
	return hlist_entry(stable_node->hlist.first,
			   typeof(*stable_node), hlist_dup);
}

/*
 * Like for get_ksm_page, this function can free the *_stable_node and
 * *_stable_node_dup if the returned tree_page is NULL.
 *
 * It can also free and overwrite *_stable_node with the found
 * stable_node_dup if the chain is collapsed (in which case
 * *_stable_node will be equal to *_stable_node_dup like if the chain
 * never existed). It's up to the caller to verify tree_page is not
 * NULL before dereferencing *_stable_node or *_stable_node_dup.
 *
 * *_stable_node and *_stable_node_dup are meaningless if tree_page is
 * NULL.
 */
static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
					struct stable_node **_stable_node,
					struct rb_root *root,
					bool prune_stale_stable_nodes)
{
	struct stable_node *stable_node = *_stable_node;
	if (!is_stable_node_chain(stable_node)) {
		if (is_page_sharing_candidate(stable_node)) {
			*_stable_node_dup = stable_node;
			return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
		}
		/*
		 * _stable_node_dup set to NULL means the stable_node
		 * reached the ksm_max_page_sharing limit.
		 */
		*_stable_node_dup = NULL;
		return NULL;
	}
	return stable_node_dup(_stable_node_dup, _stable_node, root,
			       prune_stale_stable_nodes);
}

static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
						struct stable_node **s_n,
						struct rb_root *root)
{
	return __stable_node_chain(s_n_d, s_n, root, true);
}

static __always_inline struct page *chain(struct stable_node **s_n_d,
					  struct stable_node *s_n,
					  struct rb_root *root)
{
	struct stable_node *old_stable_node = s_n;
	struct page *tree_page;

	tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
	/* not pruning dups so s_n cannot have changed */
	VM_BUG_ON(s_n != old_stable_node);
	return tree_page;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
static struct page *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
	struct stable_node *page_node;

	page_node = page_stable_node(page);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked */
		get_page(page);
		return page;
	}

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_stable_tree + nid;
again:
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		stable_node_any = NULL;
		tree_page = chain_prune(&stable_node_dup, &stable_node, root);
		/*
		 * NOTE: stable_node may have been freed by
		 * chain_prune() if the returned stable_node_dup is
		 * not NULL. stable_node_dup may have been inserted in
		 * the rbtree instead as a regular stable_node (in
		 * order to collapse the stable_node chain if a single
		 * stable_node dup was found in it). In such case the
		 * stable_node is overwritten by the new regular
		 * stable_node and the chain is collapsed.
		 */
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the rbtree
			 * walk continue. All the stable_node dups of
			 * a given stable_node chain have the same
			 * content and they're write protected, so
			 * they're all equal from the point of view of
			 * the rbtree lookup.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			if (page_node) {
				VM_BUG_ON(page_node->head != &migrate_nodes);
				/*
				 * If the migrated page is mapped more
				 * than once, it cannot simply take
				 * over a dup's place in the tree:
				 * append it to the stable node chain
				 * below instead.
				 */
				if (page_mapcount(page) > 1)
					goto chain_append;
			}

			if (!stable_node_dup) {
				/*
				 * If the stable_node is a chain and
				 * we got a payload match in memcmp
				 * but we cannot merge the scanned
				 * page in any of the existing
				 * stable_node dups because they're
				 * all full, we need to wait the
				 * scanned page to find itself a match
				 * in the unstable tree to create a
				 * brand new KSM page to add later to
				 * the dups of this stable_node.
				 */
				return NULL;
			}

			/*
			 * Lock and unlock the stable_node's page (which
			 * might already have been migrated) so that page
			 * migration is sure to notice its raised count.
			 * It would be more elegant to return stable_node
			 * than kpage, but that involves more changes.
			 */
			tree_page = get_ksm_page(stable_node_dup,
						 GET_KSM_PAGE_TRYLOCK);

			if (PTR_ERR(tree_page) == -EBUSY)
				return ERR_PTR(-EBUSY);

			if (unlikely(!tree_page))
				/*
				 * The tree may have been rebalanced,
				 * so re-evaluate parent and new.
				 */
				goto again;
			unlock_page(tree_page);

			if (get_kpfn_nid(stable_node_dup->kpfn) !=
			    NUMA(stable_node_dup->nid)) {
				put_page(tree_page);
				goto replace;
			}
			return tree_page;
		}
	}

	if (!page_node)
		return NULL;

	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
out:
	if (is_page_sharing_candidate(page_node)) {
		get_page(page);
		return page;
	} else
		return NULL;

replace:
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* there is no chain */
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			rb_replace_node(&stable_node_dup->node,
					&page_node->node,
					root);
			if (is_page_sharing_candidate(page_node))
				get_page(page);
			else
				page = NULL;
		} else {
			rb_erase(&stable_node_dup->node, root);
			page = NULL;
		}
	} else {
		VM_BUG_ON(!is_stable_node_chain(stable_node));
		__stable_node_dup_del(stable_node_dup);
		if (page_node) {
			VM_BUG_ON(page_node->head != &migrate_nodes);
			list_del(&page_node->list);
			DO_NUMA(page_node->nid = nid);
			stable_node_chain_add_dup(page_node, stable_node);
			if (is_page_sharing_candidate(page_node))
				get_page(page);
			else
				page = NULL;
		} else {
			page = NULL;
		}
	}
	stable_node_dup->head = &migrate_nodes;
	list_add(&stable_node_dup->list, stable_node_dup->head);
	return page;

chain_append:
	/* stable_node_dup could be null from __stable_node_chain */
	if (!stable_node_dup)
		stable_node_dup = stable_node_any;
	/*
	 * If stable_node was a chain and chain_prune collapsed it,
	 * stable_node has been updated to be the new regular
	 * stable_node. A collapse of the chain is indistinguishable
	 * from the case there was no chain in the stable
	 * rbtree. Otherwise stable_node is the chain and
	 * stable_node_dup is the dup to replace.
	 */
	if (stable_node_dup == stable_node) {
		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
		/* chain is missing so create it */
		stable_node = alloc_stable_node_chain(stable_node_dup,
						      root);
		if (!stable_node)
			return NULL;
	}
	/*
	 * Add this stable_node dup that was
	 * migrated to the stable_node chain
	 * of the current nid for this page
	 * content.
	 */
	VM_BUG_ON(!is_stable_node_chain(stable_node));
	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
	VM_BUG_ON(page_node->head != &migrate_nodes);
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	stable_node_chain_add_dup(page_node, stable_node);
	goto out;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
	bool need_chain = false;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
again:
	parent = NULL;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		stable_node_any = NULL;
		tree_page = chain(&stable_node_dup, stable_node, root);
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the rbtree
			 * walk continue. All the stable_node dups of
			 * a given stable_node chain have the same
			 * content and they're write protected, so
			 * they're all equal from the point of view of
			 * the rbtree lookup.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			need_chain = true;
			break;
		}
	}

	stable_node_dup = alloc_stable_node();
	if (!stable_node_dup)
		return NULL;

	INIT_HLIST_HEAD(&stable_node_dup->hlist);
	stable_node_dup->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node_dup);
	stable_node_dup->rmap_hlist_len = 0;
	DO_NUMA(stable_node_dup->nid = nid);
	if (!need_chain) {
		rb_link_node(&stable_node_dup->node, parent, new);
		rb_insert_color(&stable_node_dup->node, root);
	} else {
		if (!is_stable_node_chain(stable_node)) {
			struct stable_node *orig = stable_node;
			/* chain is missing so create it */
			stable_node = alloc_stable_node_chain(orig, root);
			if (!stable_node) {
				free_stable_node(stable_node_dup);
				return NULL;
			}
		}
		stable_node_chain_add_dup(stable_node_dup, stable_node);
	}

	return stable_node_dup;
}

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
{
	struct rb_node **new;
	struct rb_root *root;
	struct rb_node *parent = NULL;
	int nid;

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_unstable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (!tree_page)
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else if (!ksm_merge_across_nodes &&
			   page_to_nid(tree_page) != nid) {
			/*
			 * If tree_page has been migrated to another NUMA node,
			 * it will be flushed out and put in the right unstable
			 * tree next time: only merge with it when across_nodes.
			 */
			put_page(tree_page);
			return NULL;
		} else {
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	DO_NUMA(rmap_item->nid = nid);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, root);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node,
			       bool max_page_sharing_bypass)
{
	/*
	 * rmap won't find this mapping if we don't insert the
	 * rmap_item in the right stable_node
	 * duplicate. page_migration could break later if rmap breaks,
	 * so we can as well crash here. We really need to check for
	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
	 * for other negative values as an underflow if detected here
	 * for the first time (and not when decreasing rmap_hlist_len)
	 * would be sign of memory corruption in the stable_node.
	 */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	stable_node->rmap_hlist_len++;
	if (!max_page_sharing_bypass)
		/* possible saturated limit */
		WARN_ON_ONCE(stable_node->rmap_hlist_len >
			     ksm_max_page_sharing);

	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;
	bool max_page_sharing_bypass = false;

	stable_node = page_stable_node(page);
	if (stable_node) {
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
		    NUMA(stable_node->nid)) {
			stable_node_dup_del(stable_node);
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
		/*
		 * If it's a KSM fork, allow it to go over the sharing limit
		 * without warnings.
		 */
		if (!is_page_sharing_candidate(stable_node))
			max_page_sharing_bypass = true;
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		if (PTR_ERR(kpage) == -EBUSY)
			return;

		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage),
					   max_page_sharing_bypass);
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	/*
	 * Same checksum as an empty page. We attempt to merge it with the
	 * appropriate zero page if the user enabled this via sysfs.
	 */
	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
		struct vm_area_struct *vma;

		down_read(&mm->mmap_sem);
		vma = find_mergeable_vma(mm, rmap_item->address);
		if (vma) {
			err = try_to_merge_one_page(vma, page,
					ZERO_PAGE(rmap_item->address));
		} else {
			/*
			 * If the vma is out of date, we do not need to
			 * continue.
			 */
			err = 0;
		}
		up_read(&mm->mmap_sem);
		/*
		 * In case of failure, the page was not really empty, so we
		 * need to continue. Otherwise we're done.
		 */
		if (!err)
			return;
	}
	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		bool split;

		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		/*
		 * If both pages we tried to merge belong to the same compound
		 * page, then we actually ended up increasing the reference
		 * count of the same compound page twice, and split_huge_page
		 * failed.
		 * Here we set a flag if that happened, and we use it later to
		 * try split_huge_page again. Since we call put_page right
		 * afterwards, the reference count will be correct and
		 * split_huge_page should succeed.
		 */
		split = PageTransCompound(page)
			&& compound_head(page) == compound_head(tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were successfully merged: insert new
			 * node in the stable tree and add both rmap_items.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node,
						   false);
				stable_tree_append(rmap_item, stable_node,
						   false);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		} else if (split) {
			/*
			 * We are here if we tried to merge two pages and
			 * failed because they both belonged to the same
			 * compound page. We will split the page now, but no
			 * merging will take place.
			 * We do not want to add the cost of a full lock; if
			 * the page is locked, it is better to skip it and
			 * perhaps try again later.
			 */
			if (!trylock_page(page))
				return;
			split_huge_page(page);
			unlock_page(page);
		}
	}
}

static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;
	int nid;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/*
		 * A number of pages can hang around indefinitely on per-cpu
		 * pagevecs, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		/*
		 * Whereas stale stable_nodes on the stable_tree itself
		 * get pruned in the regular course of stable_tree_search(),
		 * those moved out to the migrate_nodes list can accumulate:
		 * so prune them once before each full scan.
		 */
		if (!ksm_merge_across_nodes) {
			struct stable_node *stable_node, *next;
			struct page *page;

			list_for_each_entry_safe(stable_node, next,
						 &migrate_nodes, list) {
				page = get_ksm_page(stable_node,
						    GET_KSM_PAGE_NOLOCK);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		up_read(&mm->mmap_sem);
		/*
		 * up_read(&mm->mmap_sem) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
		 */
		spin_unlock(&ksm_mmlist_lock);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *uninitialized_var(page);

	while (scan_npages-- && likely(!freezing(current))) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		cmp_and_merge_page(page, rmap_item);
		put_page(page);
	}
}

static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
	unsigned int sleep_ms;

	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
			wait_event_interruptible_timeout(ksm_iter_wait,
				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
				msecs_to_jiffies(sleep_ms));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}
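
/*
 * Illustration (not part of the original file): the effective scan rate is
 * pages_to_scan per (sleep_millisecs + time spent scanning).  With the
 * defaults above, that is at most 100 pages every 20ms, i.e. roughly 5000
 * pages/s, or about 20MB/s of 4KiB pages examined while ksmd is running.
 */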

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_HUGETLB   | VM_MIXEDMAP))
			return 0;		/* just ignore the advice */

		if (vma_is_dax(vma))
			return 0;

#ifdef VM_SAO
		if (*vm_flags & VM_SAO)
			return 0;
#endif
#ifdef VM_SPARC_ADI
		if (*vm_flags & VM_SPARC_ADI)
			return 0;
#endif

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
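
/*
 * Illustration (not part of the original file): a typical userspace caller
 * opts an anonymous region in and out of KSM like this (error handling
 * omitted):
 *
 *	#include <sys/mman.h>
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(buf, len, MADV_MERGEABLE);	// reaches this function
 *	...
 *	madvise(buf, len, MADV_UNMERGEABLE);	// breaks COW on merged pages
 */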

int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
	else
		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	mmgrab(mm);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */
	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (anon_vma->root == vma->anon_vma->root &&
		 page->index == linear_page_index(vma, address)) {
		return page;		/* still no need to copy it */
	}
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__SetPageLocked(new_page);
	}

	return new_page;
}
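
/*
 * Illustration (not part of the original file): the swapin path uses this to
 * substitute a fresh anonymous copy when a KSM page (or a page whose
 * anon_vma no longer matches) cannot simply be mapped back in place,
 * roughly:
 *
 *	page = ksm_might_need_to_copy(page, vma, address);
 *	if (page != swapcache_page)
 *		... map and account the new copy instead ...
 */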

void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int search_new_forks = 0;

	VM_BUG_ON_PAGE(!PageKsm(page), page);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	stable_node = page_stable_node(page);
	if (!stable_node)
		return;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		cond_resched();
		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			unsigned long addr;

			cond_resched();
			vma = vmac->vma;

			/* Ignore the stable/unstable/sqnr flags */
			addr = rmap_item->address & ~KSM_FLAG_MASK;

			if (addr < vma->vm_start || addr >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
			if (rwc->done && rwc->done(page)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
}

bool reuse_ksm_page(struct page *page,
		    struct vm_area_struct *vma,
		    unsigned long address)
{
#ifdef CONFIG_DEBUG_VM
	if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
			WARN_ON(!page_mapped(page)) ||
			WARN_ON(!PageLocked(page))) {
		dump_page(page, "reuse_ksm_page");
		return false;
	}
#endif

	if (PageSwapCache(page) || !page_stable_node(page))
		return false;
	/* Prohibit parallel get_ksm_page() */
	if (!page_ref_freeze(page, 1))
		return false;

	page_move_anon_rmap(page, vma);
	page->index = linear_page_index(vma, address);
	page_ref_unfreeze(page, 1);

	return true;
}
#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
	struct stable_node *stable_node;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);

	stable_node = page_stable_node(newpage);
	if (stable_node) {
		VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
		stable_node->kpfn = page_to_pfn(newpage);
		/*
		 * newpage->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that oldpage->mapping
		 * has gone stale (or that PageSwapCache has been cleared).
		 */
		smp_wmb();
		set_page_stable_node(oldpage, NULL);
	}
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
			    TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}

static bool stable_node_dup_remove_range(struct stable_node *stable_node,
					 unsigned long start_pfn,
					 unsigned long end_pfn)
{
	if (stable_node->kpfn >= start_pfn &&
	    stable_node->kpfn < end_pfn) {
		/*
		 * Don't get_ksm_page, page has already gone:
		 * which is why we keep kpfn instead of page*
		 */
		remove_node_from_stable_tree(stable_node);
		return true;
	}
	return false;
}

static bool stable_node_chain_remove_range(struct stable_node *stable_node,
					   unsigned long start_pfn,
					   unsigned long end_pfn,
					   struct rb_root *root)
{
	struct stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		return stable_node_dup_remove_range(stable_node, start_pfn,
						    end_pfn);
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
	}
	if (hlist_empty(&stable_node->hlist)) {
		free_stable_node_chain(stable_node, root);
		return true;	/* caller must restart the rbtree walk */
	} else
		return false;
}

static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct stable_node *stable_node, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct stable_node, node);
			if (stable_node_chain_remove_range(stable_node,
							   start_pfn, end_pfn,
							   root_stable_tree +
							   nid))
				node = rb_first(root_stable_tree + nid);
			else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}

static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		/* fallthrough */

	case MEM_CANCEL_OFFLINE:
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
#else
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;
	wake_up_interruptible(&ksm_iter_wait);

	return count;
}
KSM_ATTR(sleep_millisecs);
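/*
 * Example, assuming sysfs is mounted at /sys: shortening the delay
 * between scan batches makes ksmd more responsive, at some CPU cost:
 *
 *	echo 20 > /sys/kernel/mm/ksm/sleep_millisecs
 */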

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = kstrtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);
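/*
 * Together with sleep_millisecs, this bounds the scan rate at roughly
 * pages_to_scan * (1000 / sleep_millisecs) pages per second.  For
 * example, pages_to_scan = 100 with sleep_millisecs = 20 lets ksmd
 * consider at most ~5000 pages (~20MB with 4kB pages) each second:
 *
 *	echo 100 > /sys/kernel/mm/ksm/pages_to_scan
 */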

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);
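/*
 * The three accepted values correspond to the KSM_RUN_* flags tested
 * above; assuming sysfs is mounted at /sys:
 *
 *	echo 0 > /sys/kernel/mm/ksm/run		# KSM_RUN_STOP: stop ksmd
 *	echo 1 > /sys/kernel/mm/ksm/run		# KSM_RUN_MERGE: start ksmd
 *	echo 2 > /sys/kernel/mm/ksm/run		# KSM_RUN_UNMERGE: stop and
 *						# unmerge all KSM pages
 */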

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* kcalloc zeroes the roots: all-zero is a valid empty RB_ROOT */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
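/*
 * Because the -EBUSY check above refuses the switch while any KSM pages
 * exist, flipping this knob requires a full unmerge first:
 *
 *	echo 2 > /sys/kernel/mm/ksm/run
 *	echo 0 > /sys/kernel/mm/ksm/merge_across_nodes
 *	echo 1 > /sys/kernel/mm/ksm/run
 */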
#endif /* CONFIG_NUMA */

static ssize_t use_zero_pages_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_use_zero_pages);
}

static ssize_t use_zero_pages_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int err;
	bool value;

	err = kstrtobool(buf, &value);
	if (err)
		return -EINVAL;

	ksm_use_zero_pages = value;

	return count;
}
KSM_ATTR(use_zero_pages);
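/*
 * When enabled, pages whose checksum equals zero_checksum are merged
 * with the kernel zero page(s) instead of with each other:
 *
 *	echo 1 > /sys/kernel/mm/ksm/use_zero_pages
 */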

static ssize_t max_page_sharing_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_max_page_sharing);
}

static ssize_t max_page_sharing_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	int knob;

	err = kstrtoint(buf, 10, &knob);
	if (err)
		return err;
	/*
	 * When a KSM page is created it is shared by 2 mappings. This
	 * being a signed comparison, it implicitly verifies it's not
	 * negative.
	 */
	if (knob < 2)
		return -EINVAL;

	if (READ_ONCE(ksm_max_page_sharing) == knob)
		return count;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_max_page_sharing != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else
			ksm_max_page_sharing = knob;
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(max_page_sharing);
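/*
 * This caps how many mappings may share a single KSM page, trading
 * deduplication ratio against worst-case rmap walk cost: swapping,
 * compaction and NUMA balancing must visit every sharer of a page.
 * For example, to allow denser sharing on a dedup-heavy host:
 *
 *	echo 1024 > /sys/kernel/mm/ksm/max_page_sharing
 *
 * As with merge_across_nodes, the -EBUSY check above means the limit
 * can only be changed while no KSM pages exist.
 */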

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);
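/*
 * Reading the four counters together: each rmap_item is in exactly one
 * state, so pages_shared + pages_sharing + pages_unshared +
 * pages_volatile roughly equals ksm_rmap_items (the reads are racy),
 * and a high pages_sharing/pages_shared ratio indicates effective
 * sharing.  For example, pages_sharing = 3000 with pages_shared = 1000
 * means each KSM page eliminates three duplicates on average.
 */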

static ssize_t stable_node_dups_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);

static ssize_t stable_node_chains_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);

static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
					struct kobj_attribute *attr,
					char *buf)
{
	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}

static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_stable_node_chains_prune_millisecs = msecs;

	return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	NULL,
};

static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);
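/*
 * ksmd only scans address ranges that userspace has opted in via
 * madvise(2).  A minimal sketch of the user-facing side:
 *
 *	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (p != MAP_FAILED)
 *		madvise(p, len, MADV_MERGEABLE);
 *
 * MADV_UNMERGEABLE undoes the registration for a given range.
 */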