/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU General Public License.
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

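/*
 * NUMA(x) evaluates to x, and DO_NUMA(x) executes x for its side effect,
 * only when CONFIG_NUMA is enabled; on non-NUMA builds both compile away,
 * so the per-node tree code below needs no further #ifdefs.
 */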
#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

/*
 * A few notes about the KSM scanning process, to make it easier to understand
 * the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct list_head list;
		};
	};
	struct hlist_head hlist;
	unsigned long kpfn;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif
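
/*
 * Run state: KSM_RUN_STOP, KSM_RUN_MERGE and KSM_RUN_UNMERGE are the values
 * a user may write to /sys/kernel/mm/ksm/run (see run_store() below);
 * KSM_RUN_OFFLINE is or'ed in internally while memory hotremove is in
 * progress, to keep ksmd and the unmerge paths away from the stable tree.
 */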
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make sure this backing out is done.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /proc/kcore, where we would not want to touch it.
 *
 * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
							FAULT_FLAG_WRITE |
							FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM is unlikely here: even when breaking COW within a
	 * limited mem_cgroup, ksmd is usually just undoing a merge it made
	 * a moment before; but if it does occur, report -ENOMEM so that
	 * callers can back out rather than proceed with a stale pte.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
416
417static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
418 unsigned long addr)
419{
420 struct vm_area_struct *vma;
421 if (ksm_test_exit(mm))
422 return NULL;
423 vma = find_vma(mm, addr);
424 if (!vma || vma->vm_start > addr)
425 return NULL;
426 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
427 return NULL;
428 return vma;
429}
430
431static void break_cow(struct rmap_item *rmap_item)
432{
433 struct mm_struct *mm = rmap_item->mm;
434 unsigned long addr = rmap_item->address;
435 struct vm_area_struct *vma;
436
437
438
439
440
441 put_anon_vma(rmap_item->anon_vma);
442
443 down_read(&mm->mmap_sem);
444 vma = find_mergeable_vma(mm, addr);
445 if (vma)
446 break_ksm(vma, addr);
447 up_read(&mm->mmap_sem);
448}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		rb_erase(&stable_node->node,
			 root_stable_tree + NUMA(stable_node->nid));
	free_stable_node(stable_node);
}

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unpleasant.  So instead we hold no
 * reference, and pruning of obsolete nodes is done here and in callers.
 */
static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);

	/*
	 * page is computed from kpfn, so on most architectures reading
	 * page->mapping is naturally ordered after reading node->kpfn,
	 * but on Alpha we need to be more careful.
	 */
	smp_read_barrier_depends();
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_freeze_refs().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * but if page is swapcache in migrate_page_move_mapping(), it might
	 * still be our page, in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test:
		 * if page_freeze_refs() is racing with us, it's certain that
		 * the page is being freed unless it's still in swapcache -
		 * and in that case, spin until the race is resolved.
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (lock_it) {
		lock_page(page);
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, true);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, true);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * This should not happen: but if it does, just refuse to let
		 * merge_across_nodes be switched - there is no need to panic.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page to be recognized
		 * right up until it is freed; but the node is safe to remove.
		 * This page might be in a pagevec waiting to be freed,
		 * or it might be PageSwapCache (perhaps under writeback),
		 * or it might have been removed from swapcache a moment ago.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node(stable_node)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
		up_read(&mm->mmap_sem);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr);
	return checksum;
}
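
/*
 * Note that the checksum above is only ever used as a change detector:
 * cmp_and_merge_page() refuses to insert a page into the unstable tree
 * until its checksum is the same on two successive scans.  Actual page
 * matching is always done by full memcmp, never by comparing checksums.
 */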

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmun_start = addr;
	mmun_end   = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out_mn;

	if (pte_write(*ptep) || pte_dirty(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racey and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 */
		entry = ptep_clear_flush_notify(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, addr, ptep, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmun_start = addr;
	mmun_end   = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	get_page(kpage);
	page_add_anon_rmap(kpage, vma, addr, false);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush_notify(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		err = split_huge_page(page);
		if (err)
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_sem */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
static struct page *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node;
	struct stable_node *page_node;

	page_node = page_stable_node(page);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked */
		get_page(page);
		return page;
	}

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_stable_tree + nid;
again:
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		tree_page = get_ksm_page(stable_node, false);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * Lock and unlock the stable_node's page (which
			 * might already have been migrated) so that page
			 * migration is sure to notice its raised count.
			 * It would be more elegant to return stable_node
			 * than kpage, but that involves more changes.
			 */
			tree_page = get_ksm_page(stable_node, true);
			if (tree_page) {
				unlock_page(tree_page);
				if (get_kpfn_nid(stable_node->kpfn) !=
						NUMA(stable_node->nid)) {
					put_page(tree_page);
					goto replace;
				}
				return tree_page;
			}
			/*
			 * There is now a place for page_node, but the tree may
			 * have been rebalanced, so re-evaluate parent and new.
			 */
			if (page_node)
				goto again;
			return NULL;
		}
	}

	if (!page_node)
		return NULL;

	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
	get_page(page);
	return page;

replace:
	if (page_node) {
		list_del(&page_node->list);
		DO_NUMA(page_node->nid = nid);
		rb_replace_node(&stable_node->node, &page_node->node, root);
		get_page(page);
	} else {
		rb_erase(&stable_node->node, root);
		page = NULL;
	}
	stable_node->head = &migrate_nodes;
	list_add(&stable_node->list, stable_node->head);
	return page;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
again:
	parent = NULL;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		tree_page = get_ksm_page(stable_node, false);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * It is not a bug that stable_tree_search() didn't
			 * find this node: because at that time our page was
			 * not yet write-protected, so may have changed since.
			 */
			return NULL;
		}
	}

	stable_node = alloc_stable_node();
	if (!stable_node)
		return NULL;

	INIT_HLIST_HEAD(&stable_node->hlist);
	stable_node->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node);
	DO_NUMA(stable_node->nid = nid);
	rb_link_node(&stable_node->node, parent, new);
	rb_insert_color(&stable_node->node, root);

	return stable_node;
}

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns pointer to rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
{
	struct rb_node **new;
	struct rb_root *root;
	struct rb_node *parent = NULL;
	int nid;

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_unstable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (!tree_page)
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else if (!ksm_merge_across_nodes &&
			   page_to_nid(tree_page) != nid) {
			/*
			 * If tree_page has been migrated to another NUMA node,
			 * it will be flushed out and put in the right unstable
			 * tree next time: only merge with it when across_nodes.
			 */
			put_page(tree_page);
			return NULL;
		} else {
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	DO_NUMA(rmap_item->nid = nid);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, root);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node)
{
	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;

	stable_node = page_stable_node(page);
	if (stable_node) {
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
			rb_erase(&stable_node->node,
				 root_stable_tree + NUMA(stable_node->nid));
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage));
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were successfully merged: insert new
			 * node in the stable tree and add both rmap_items.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node);
				stable_tree_append(rmap_item, stable_node);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		}
	}
}

static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}
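
/*
 * scan_get_next_rmap_item: advance the scanning cursor to the next anonymous
 * page in the next VM_MERGEABLE vma of the next mm on the list, returning
 * that page (with a reference held) in *page together with its rmap_item.
 * Returns NULL when a full pass over all registered mms has completed,
 * bumping ksm_scan.seqnr.
 */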

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;
	int nid;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/*
		 * A number of pages can hang around indefinitely on per-cpu
		 * pagevecs, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		/*
		 * Whereas stale stable_nodes on the stable_tree itself
		 * get pruned in the regular course of stable_tree_search(),
		 * those moved out to the migrate_nodes list can accumulate:
		 * so prune them once before each full scan.
		 */
		if (!ksm_merge_across_nodes) {
			struct stable_node *stable_node, *next;
			struct page *page;

			list_for_each_entry_safe(stable_node, next,
						 &migrate_nodes, list) {
				page = get_ksm_page(stable_node, false);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);

		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		up_read(&mm->mmap_sem);
		/*
		 * up_read(&mm->mmap_sem) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
		 */
		spin_unlock(&ksm_mmlist_lock);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *uninitialized_var(page);

	while (scan_npages-- && likely(!freezing(current))) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		cmp_and_merge_page(page, rmap_item);
		put_page(page);
	}
}

static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_HUGETLB   | VM_MIXEDMAP))
			return 0;		/* just ignore the advice */

#ifdef VM_SAO
		if (*vm_flags & VM_SAO)
			return 0;
#endif

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
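
/*
 * Usage sketch (not part of the kernel build): a process opts an area in
 * with madvise(addr, length, MADV_MERGEABLE), undoes that with
 * MADV_UNMERGEABLE, and an administrator starts the ksmd scanner with
 * "echo 1 > /sys/kernel/mm/ksm/run" (see the sysfs attributes below).
 */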

int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
	else
		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	atomic_inc(&mm->mm_count);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */
	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (anon_vma->root == vma->anon_vma->root &&
		 page->index == linear_page_index(vma, address)) {
		return page;		/* still no need to copy it */
	}
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__SetPageLocked(new_page);
	}

	return new_page;
}

int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;

	VM_BUG_ON_PAGE(!PageKsm(page), page);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	stable_node = page_stable_node(page);
	if (!stable_node)
		return ret;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		cond_resched();
		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			cond_resched();
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			ret = rwc->rmap_one(page, vma,
					rmap_item->address, rwc->arg);
			if (ret != SWAP_AGAIN) {
				anon_vma_unlock_read(anon_vma);
				goto out;
			}
			if (rwc->done && rwc->done(page)) {
				anon_vma_unlock_read(anon_vma);
				goto out;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}

#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
	struct stable_node *stable_node;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);

	stable_node = page_stable_node(newpage);
	if (stable_node) {
		VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
		stable_node->kpfn = page_to_pfn(newpage);
		/*
		 * newpage->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that oldpage->mapping
		 * has gone stale (or that PageSwapCache has been cleared).
		 */
		smp_wmb();
		set_page_stable_node(oldpage, NULL);
	}
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
			    TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}

static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct stable_node *stable_node, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct stable_node, node);
			if (stable_node->kpfn >= start_pfn &&
			    stable_node->kpfn < end_pfn) {
				/*
				 * Don't get_ksm_page, page has already gone:
				 * which is why we keep kpfn instead of page*
				 */
				remove_node_from_stable_tree(stable_node);
				node = rb_first(root_stable_tree + nid);
			} else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}

static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		/* fallthrough */

	case MEM_CANCEL_OFFLINE:
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
#else
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = kstrtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together:
			 * MAXSMP NODES_SHIFT 10 will use 16kB.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* Let us assume that RB_ROOT is NULL is zero is NULL */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */
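
/*
 * Summary of the sysfs interface defined above, all under
 * /sys/kernel/mm/ksm/: sleep_millisecs and pages_to_scan tune the scanner;
 * run starts (1), stops (0) or unmerges (2); merge_across_nodes (NUMA only)
 * selects one tree pair per system or per node; pages_shared, pages_sharing,
 * pages_unshared, pages_volatile and full_scans are read-only statistics.
 */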

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);