/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */
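
/*
 * A concrete example of that flow, for two anonymous pages with identical
 * contents in different mms (a sketch of typical timing, not a guarantee):
 *
 *   pass 1: each page is checksummed; neither tree contains it yet.
 *   pass 2: checksums unchanged, so the first page is inserted in the
 *	     unstable tree; the second compares equal to it there, the two
 *	     are merged into one write-protected ksm page, and that page
 *	     is inserted in the stable tree.
 *   later:  further identical pages merge directly with the stable tree
 *	     node, each raising pages_sharing by one.
 */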

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct list_head list;
		};
	};
	struct hlist_head hlist;
	unsigned long kpfn;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in its low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	/*
	 * GFP_KERNEL | __GFP_HIGH lets this small allocation dip into
	 * memory reserves if need be: failing here would abort a merge
	 * that is otherwise ready to reclaim a whole page.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit(): an mm_users count
 * of zero is the signal that the mm is on its way out, so nothing found
 * through it may be dereferenced once this test says so.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: a write fault is forced
 * at that address, so the process gets a fresh anonymous copy and the
 * shared ksm page loses this user.  The caller holds mmap_sem for read,
 * and the vma is known to be anonymous and VM_MERGEABLE.
 * Returns 0 on success, or -ENOMEM if faulting the copy failed with OOM.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr,
				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma, addr,
					FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS |
			  VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop until we no longer find a KSM page because
	 * handle_mm_fault() may back out if there's any difficulty e.g. if
	 * pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM is the only failure worth reporting: -ENOMEM lets
	 * callers back out gracefully; the unmerge we were attempting has
	 * anyway already duplicated this page independently of the others
	 * sharing it, so nothing is left inconsistent.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

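/*
 * Look up the vma covering @addr and check that it is still a candidate
 * for merging: VM_MERGEABLE, and already faulted (anon_vma set).
 */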
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}

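/*
 * Pin and return the page at rmap_item's address, so long as its vma is
 * still mergeable and the page is anonymous; returns NULL otherwise.
 */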
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:
		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		rb_erase(&stable_node->node,
			 root_stable_tree + NUMA(stable_node->nid));
	free_stable_node(stable_node);
}

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm
 * page.  But if it did, swapping out would have to wait for ksmd to
 * come around again before the page could be freed, which might take
 * seconds or minutes: much too unhealthy a lifetime.  So the stable
 * tree holds no page references at all, and this function revalidates
 * page->mapping against the node on every lookup, pruning the node if
 * its page has gone.
 */
static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);

	/*
	 * page is computed from kpfn, so on most architectures reading
	 * page->mapping is naturally ordered after reading node->kpfn,
	 * but on Alpha we need to be more careful.
	 */
	smp_read_barrier_depends();
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referring to a page.  Raising the
	 * count makes the page safe against being freed while we go on
	 * to check it further.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * in the refcount 0 freeing path, while page->mapping is
		 * reset only somewhat later.
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (lock_it) {
		lock_page(page);
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, true);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, true);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * This should not happen: but if it does, just refuse to let
		 * merge_across_nodes be switched - there is no need to panic.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page to be recognized
		 * right up until it is freed; but the node is safe to remove.
		 * This page might be in a pagevec waiting to be freed,
		 * or it might be PageSwapCache (perhaps under writeback),
		 * or it might have been removed from swapcache a moment ago.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node(stable_node)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
		up_read(&mm->mmap_sem);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

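/*
 * Make the pte mapping @page in @vma read-only and clean, so the page's
 * contents cannot change while we merge.  On success fills *orig_pte with
 * the resulting pte value (compared again in replace_page()) and returns
 * 0; returns -EFAULT if the pte could not safely be nailed down.
 */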
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmun_start = addr;
	mmun_end = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out_mn;

	if (pte_write(*ptep) || pte_dirty(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages is run on a mm, it
		 * doesn't take any lock, therefore the check that we are going
		 * to make with page_count against page_mapcount is racey and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 */
		entry = ptep_clear_flush_notify(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, addr, ptep, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;

	mmun_start = addr;
	mmun_end = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	get_page(kpage);
	page_add_anon_rmap(kpage, vma, addr, false);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush_notify(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (PageTransCompound(page)) {
		err = split_huge_page(page);
		if (err)
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_sem */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
static struct page *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node;
	struct stable_node *page_node;

	page_node = page_stable_node(page);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked */
		get_page(page);
		return page;
	}

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_stable_tree + nid;
again:
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		tree_page = get_ksm_page(stable_node, false);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * Lock and unlock the stable_node's page (which
			 * might already have been migrated) so that page
			 * migration is sure to notice its raised count.
			 * It would be more elegant to return stable_node
			 * than kpage, but that involves more changes.
			 */
			tree_page = get_ksm_page(stable_node, true);
			if (tree_page) {
				unlock_page(tree_page);
				if (get_kpfn_nid(stable_node->kpfn) !=
						NUMA(stable_node->nid)) {
					put_page(tree_page);
					goto replace;
				}
				return tree_page;
			}
			/*
			 * The node we matched has just been removed as
			 * stale: if we have a page_node to insert, restart
			 * to find its place in the rebalanced tree.
			 */
			if (page_node)
				goto again;
			return NULL;
		}
	}

	if (!page_node)
		return NULL;

	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
	get_page(page);
	return page;

replace:
	if (page_node) {
		list_del(&page_node->list);
		DO_NUMA(page_node->nid = nid);
		rb_replace_node(&stable_node->node, &page_node->node, root);
		get_page(page);
	} else {
		rb_erase(&stable_node->node, root);
		page = NULL;
	}
	stable_node->head = &migrate_nodes;
	list_add(&stable_node->list, stable_node->head);
	return page;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
again:
	parent = NULL;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		tree_page = get_ksm_page(stable_node, false);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * It is not a bug that stable_tree_search() didn't
			 * find this node: because at that time our page was
			 * not yet write-protected, so may have changed since.
			 */
			return NULL;
		}
	}

	stable_node = alloc_stable_node();
	if (!stable_node)
		return NULL;

	INIT_HLIST_HEAD(&stable_node->hlist);
	stable_node->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node);
	DO_NUMA(stable_node->nid = nid);
	rb_link_node(&stable_node->node, parent, new);
	rb_insert_color(&stable_node->node, root);

	return stable_node;
}

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
{
	struct rb_node **new;
	struct rb_root *root;
	struct rb_node *parent = NULL;
	int nid;

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_unstable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (!tree_page)
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else if (!ksm_merge_across_nodes &&
			   page_to_nid(tree_page) != nid) {
			/*
			 * If tree_page has been migrated to another NUMA node,
			 * it will be flushed out and put in the right unstable
			 * tree next time: only merge with it when across_nodes.
			 */
			put_page(tree_page);
			return NULL;
		} else {
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	DO_NUMA(rmap_item->nid = nid);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, root);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node)
{
	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;

	stable_node = page_stable_node(page);
	if (stable_node) {
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
			rb_erase(&stable_node->node,
				 root_stable_tree + NUMA(stable_node->nid));
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage));
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were successfully merged: insert new
			 * node in the stable tree and add both rmap_items.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node);
				stable_tree_append(rmap_item, stable_node);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		}
	}
}

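/*
 * Find the rmap_item tracking @addr in this mm_slot's address-sorted
 * rmap_list, discarding stale items for lower addresses on the way; or
 * allocate and link a fresh (zeroed) one if none exists there yet.
 */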
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}

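/*
 * Advance the scan cursor to the next anonymous page in a VM_MERGEABLE
 * vma, pinning it in *page and returning its rmap_item; returns NULL
 * once a full pass over every registered mm has completed.
 */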
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;
	int nid;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/*
		 * A number of pages can hang around indefinitely on per-cpu
		 * pagevecs, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		/*
		 * Whereas stale stable_nodes on the stable_tree itself
		 * get pruned in the regular course of stable_tree_search(),
		 * those moved out to the migrate_nodes list can accumulate:
		 * so prune them once before each full scan.
		 */
		if (!ksm_merge_across_nodes) {
			struct stable_node *stable_node, *next;
			struct page *page;

			list_for_each_entry_safe(stable_node, next,
						 &migrate_nodes, list) {
				page = get_ksm_page(stable_node, false);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		up_read(&mm->mmap_sem);
		/*
		 * up_read(&mm->mmap_sem) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
		 */
		spin_unlock(&ksm_mmlist_lock);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *uninitialized_var(page);

	while (scan_npages-- && likely(!freezing(current))) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		cmp_and_merge_page(page, rmap_item);
		put_page(page);
	}
}

static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

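/*
 * ksmd main loop: scan a batch of pages under ksm_thread_mutex, then
 * sleep for sleep_millisecs (or block until woken) before the next batch.
 */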
static int ksm_scan_thread(void *nothing)
{
	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_HUGETLB   | VM_MIXEDMAP))
			return 0;		/* just ignore the advice */

#ifdef VM_SAO
		if (*vm_flags & VM_SAO)
			return 0;
#endif

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
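
/*
 * Example (userspace, not part of this file): an application opts an
 * anonymous region into KSM scanning, and later out again, via madvise(2):
 *
 *	char *p = mmap(NULL, len, PROT_READ|PROT_WRITE,
 *		       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
 *	madvise(p, len, MADV_MERGEABLE);	// region handed to ksmd
 *	...
 *	madvise(p, len, MADV_UNMERGEABLE);	// break COW on any merges
 */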

int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
	else
		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	atomic_inc(&mm->mm_count);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */
	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

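/*
 * Called on swapin: a page coming back from swap may still be the copy
 * that KSM merged.  If it cannot simply be reused here (wrong anon_vma,
 * wrong index, or unmerging in progress), copy it into a fresh page.
 */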
struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (anon_vma->root == vma->anon_vma->root &&
		 page->index == linear_page_index(vma, address)) {
		return page;		/* still no need to copy it */
	}
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__SetPageLocked(new_page);
	}

	return new_page;
}

int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;

	VM_BUG_ON_PAGE(!PageKsm(page), page);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	stable_node = page_stable_node(page);
	if (!stable_node)
		return ret;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		cond_resched();
		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			cond_resched();
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			ret = rwc->rmap_one(page, vma,
					rmap_item->address, rwc->arg);
			if (ret != SWAP_AGAIN) {
				anon_vma_unlock_read(anon_vma);
				goto out;
			}
			if (rwc->done && rwc->done(page)) {
				anon_vma_unlock_read(anon_vma);
				goto out;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}

#ifdef CONFIG_MIGRATION
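/*
 * Page migration has copied oldpage to newpage: point the stable node
 * at the new pfn before the old mapping is seen to go stale.
 */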
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
	struct stable_node *stable_node;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);

	stable_node = page_stable_node(newpage);
	if (stable_node) {
		VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
		stable_node->kpfn = page_to_pfn(newpage);
		/*
		 * newpage->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that oldpage->mapping
		 * has gone stale (or that PageSwapCache has been cleared).
		 */
		smp_wmb();
		set_page_stable_node(oldpage, NULL);
	}
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
			    TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}

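/*
 * Prune any stable tree node whose ksm page lies in the pfn range being
 * offlined, so no stale kpfn is left pointing into removed memory.
 */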
static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct stable_node *stable_node, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct stable_node, node);
			if (stable_node->kpfn >= start_pfn &&
			    stable_node->kpfn < end_pfn) {
				/*
				 * Don't get_ksm_page, page has already gone:
				 * which is why we keep kpfn instead of page*
				 */
				remove_node_from_stable_tree(stable_node);
				node = rb_first(root_stable_tree + nid);
			} else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}

static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		/* fallthrough */

	case MEM_CANCEL_OFFLINE:
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
#else
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = kstrtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together:
			 * MAXSMP NODES_SHIFT 10 will use 16kB.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* RB_ROOT is all zeroes, just as kcalloc leaves it */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative: conceal that.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
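
/*
 * Example (shell, not part of this file): with the group registered on
 * mm_kobj, these attributes appear under /sys/kernel/mm/ksm/, e.g.
 *
 *	echo 200 > /sys/kernel/mm/ksm/pages_to_scan
 *	echo 1   > /sys/kernel/mm/ksm/run	# start ksmd
 *	cat /sys/kernel/mm/ksm/pages_sharing	# estimate of savings
 *	echo 2   > /sys/kernel/mm/ksm/run	# stop and unmerge everything
 */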
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);