/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>

#include <asm/tlbflush.h>
#include "internal.h"

#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (and wrapping flag bits)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct list_head list;
		};
	};
	struct hlist_head hlist;
	unsigned long kpfn;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /proc/kcore, where we would not want to touch it.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
		if (IS_ERR_OR_NULL(page))
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
							FAULT_FLAG_WRITE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but the fix
	 * is still to check for VM_FAULT_OOM and return -ENOMEM here.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		return NULL;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}

static struct page *page_trans_compound_anon(struct page *page)
{
	if (PageTransCompound(page)) {
		struct page *head = compound_trans_head(page);
		/*
		 * head may actually be splitted and freed from under
		 * us but it's ok here.
		 */
		if (PageAnon(head))
			return head;
	}
	return NULL;
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (PageAnon(page) || page_trans_compound_anon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

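/*
 * remove_node_from_stable_tree - unlink a stable_node from its tree (or
 * from the migrate_nodes list), fix up the pages_shared/pages_sharing
 * counts for the rmap_items still hanging off it, and free the node.
 */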
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		rb_erase(&stable_node->node,
			 root_stable_tree + NUMA(stable_node->nid));
	free_stable_node(stable_node);
}

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unpleasant in this case.
 *
 * So instead we keep a reference-free pfn in the stable_node, and know
 * that this pfn can only be trusted while the node is actually in a tree:
 * going stale is caught by the page->mapping checks below, and racing
 * against migration is handled by rechecking kpfn after an smp_rmb().
 */
static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
again:
	kpfn = ACCESS_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);

	/*
	 * page is computed from kpfn, so on most architectures reading
	 * page->mapping is naturally ordered after reading node->kpfn,
	 * but on Alpha we need to be more careful.
	 */
	smp_read_barrier_depends();
	if (ACCESS_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_freeze_refs().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * but if page is swapcache in migrate_page_move_mapping(), it might
	 * still be our page, in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * in the ref_freeze section of __remove_mapping(); but Anon
		 * page->mapping reset to NULL later, in free_pages_prepare().
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	if (ACCESS_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (lock_it) {
		lock_page(page);
		if (ACCESS_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, true);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (stable_node->hlist.first)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, true);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * This should not happen: but if it does, just refuse to let
		 * merge_across_nodes be switched - there is no need to panic.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page easily to be
		 * migrated or freed; but now that the page is unmapped and
		 * we hold its lock, clear its stable_node pointer and prune
		 * the node from the tree, so the page reverts to an ordinary
		 * anonymous page once we drop our references.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

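/*
 * Empty every stable tree, and the migrate_nodes list, through
 * remove_stable_node(): returns -EBUSY if any node's page was still
 * mapped and so could not be removed.
 */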
static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node;
	struct list_head *this, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node(stable_node)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_safe(this, next, &migrate_nodes) {
		stable_node = list_entry(this, struct stable_node, list);
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

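/*
 * Walk every mm on the ksm_mm_head list, break COW on each merged page
 * in every VM_MERGEABLE vma, and throw away all rmap_items: this is how
 * writing KSM_RUN_UNMERGE to /sys/kernel/mm/ksm/run takes effect.
 */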
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

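/*
 * Cheap jhash2 fingerprint of a page's contents: used only to judge
 * whether a page stayed unchanged between scans, never to match pages
 * to each other - a full memcmp (below) decides that.
 */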
static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

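/*
 * Make the pte mapping this anonymous page read-only, so that any future
 * write will fault and break COW.  Fails with -EFAULT if the page is not
 * mapped where expected, or if the raised page_count shows that a gup
 * pin or O_DIRECT transfer could be modifying the page behind our back.
 */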
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmun_start = addr;
	mmun_end   = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out_mn;

	if (pte_write(*ptep) || pte_dirty(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racey and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, addr, ptep, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;
	BUG_ON(pmd_trans_huge(*pmd));

	mmun_start = addr;
	mmun_end   = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	get_page(kpage);
	page_add_anon_rmap(kpage, vma, addr);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));

	page_remove_rmap(page);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}

static int page_trans_compound_anon_split(struct page *page)
{
	int ret = 0;
	struct page *transhuge_head = page_trans_compound_anon(page);
	if (transhuge_head) {
		/* Get the reference on the head to split it. */
		if (get_page_unless_zero(transhuge_head)) {
			/*
			 * Recheck we got the reference while the head
			 * was still anonymous.
			 */
			if (PageAnon(transhuge_head))
				ret = split_huge_page(transhuge_head);
			else
				/*
				 * Retry later if split_huge_page run
				 * from under us.
				 */
				ret = 1;
			put_page(transhuge_head);
		} else
			/* Retry later if split_huge_page run from under us. */
			ret = 1;
	}
	return ret;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!(vma->vm_flags & VM_MERGEABLE))
		goto out;
	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
		goto out;
	BUG_ON(PageTransCompound(page));
	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, rmap_item->address);
	if (!vma || vma->vm_start > rmap_item->address)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_sem */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
 */
static struct page *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node;
	struct stable_node *page_node;

	page_node = page_stable_node(page);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked */
		get_page(page);
		return page;
	}

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_stable_tree + nid;
again:
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		tree_page = get_ksm_page(stable_node, false);
		if (!tree_page)
			return NULL;

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * Lock and unlock the stable_node's page (which
			 * might already have been migrated) so that page
			 * migration is sure to notice its raised count.
			 * It would be more elegant to return stable_node
			 * than kpage, but that involves more changes.
			 */
			tree_page = get_ksm_page(stable_node, true);
			if (tree_page) {
				unlock_page(tree_page);
				if (get_kpfn_nid(stable_node->kpfn) !=
						NUMA(stable_node->nid)) {
					put_page(tree_page);
					goto replace;
				}
				return tree_page;
			}
			/*
			 * There is now a place for page_node, but the tree may
			 * have been rebalanced, so re-evaluate parent and new.
			 */
			if (page_node)
				goto again;
			return NULL;
		}
	}

	if (!page_node)
		return NULL;

	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
	get_page(page);
	return page;

replace:
	if (page_node) {
		list_del(&page_node->list);
		DO_NUMA(page_node->nid = nid);
		rb_replace_node(&stable_node->node, &page_node->node, root);
		get_page(page);
	} else {
		rb_erase(&stable_node->node, root);
		page = NULL;
	}
	stable_node->head = &migrate_nodes;
	list_add(&stable_node->list, stable_node->head);
	return page;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct stable_node *stable_node;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		tree_page = get_ksm_page(stable_node, false);
		if (!tree_page)
			return NULL;

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * It is not a bug that stable_tree_search() didn't
			 * find this node: because at that time our page was
			 * not yet write-protected, so may have changed since.
			 */
			return NULL;
		}
	}

	stable_node = alloc_stable_node();
	if (!stable_node)
		return NULL;

	INIT_HLIST_HEAD(&stable_node->hlist);
	stable_node->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node);
	DO_NUMA(stable_node->nid = nid);
	rb_link_node(&stable_node->node, parent, new);
	rb_insert_color(&stable_node->node, root);

	return stable_node;
}

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
{
	struct rb_node **new;
	struct rb_root *root;
	struct rb_node *parent = NULL;
	int nid;

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_unstable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (IS_ERR_OR_NULL(tree_page))
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else if (!ksm_merge_across_nodes &&
			   page_to_nid(tree_page) != nid) {
			/*
			 * If tree_page has been migrated to another NUMA node,
			 * it will be flushed out and put in the right unstable
			 * tree next time: only merge with it when across_nodes.
			 */
			put_page(tree_page);
			return NULL;
		} else {
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	DO_NUMA(rmap_item->nid = nid);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, root);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node)
{
	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;

	stable_node = page_stable_node(page);
	if (stable_node) {
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
			rb_erase(&stable_node->node,
				 root_stable_tree + NUMA(stable_node->nid));
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage));
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were successfully merged: insert new
			 * node in the stable tree and add both rmap_items.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node);
				stable_tree_append(rmap_item, stable_node);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		}
	}
}

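/*
 * Find or allocate the rmap_item tracking addr in this mm_slot's sorted
 * singly-linked rmap_list, freeing any stale items (addresses below addr
 * that no longer have a VM_MERGEABLE page) passed over on the way.
 */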
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}

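/*
 * Advance the scan cursor to the next anonymous page of the next
 * VM_MERGEABLE vma, returning its rmap_item and (through *page, with a
 * reference held) the page itself; returns NULL when a full scan of all
 * mms completes, after which ksm_scan.seqnr is bumped.
 */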
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;
	int nid;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/*
		 * A number of pages can hang around indefinitely on per-cpu
		 * pagevecs, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan,
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		/*
		 * Whereas stale stable_nodes on the stable_tree itself
		 * get pruned in the regular course of stable_tree_search(),
		 * those moved out to the migrate_nodes list can accumulate:
		 * so prune them once before each full scan.
		 */
		if (!ksm_merge_across_nodes) {
			struct stable_node *stable_node;
			struct list_head *this, *next;
			struct page *page;

			list_for_each_safe(this, next, &migrate_nodes) {
				stable_node = list_entry(this,
						struct stable_node, list);
				page = get_ksm_page(stable_node, false);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page) ||
			    page_trans_compound_anon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		spin_unlock(&ksm_mmlist_lock);
		up_read(&mm->mmap_sem);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages - number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *uninitialized_var(page);

	while (scan_npages-- && likely(!freezing(current))) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		cmp_and_merge_page(page, rmap_item);
		put_page(page);
	}
}

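/*
 * ksmd makes progress only while KSM_RUN_MERGE is set and at least one
 * mm is registered; otherwise it sleeps on ksm_thread_wait below.
 */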
static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

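/*
 * ksm_madvise - the entry point from madvise(MADV_MERGEABLE) and
 * madvise(MADV_UNMERGEABLE): flag the vma (registering the mm with ksmd
 * on first use), or unmerge any ksm pages already in the range.
 */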
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_HUGETLB   | VM_NONLINEAR | VM_MIXEDMAP))
			return 0;		/* just ignore the advice */

#ifdef VM_SAO
		if (*vm_flags & VM_SAO)
			return 0;
#endif

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}

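/*
 * Called on a process's first MADV_MERGEABLE: allocate its mm_slot,
 * link it in for scanning, pin the mm with mm_count, and wake ksmd
 * if it was idle.
 */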
int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
	else
		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	atomic_inc(&mm->mm_count);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */
	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

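/*
 * Used at swapin: a ksm page (or a page whose anon_vma no longer suits
 * this vma) cannot simply be mapped back in place, so supply a fresh
 * copy for the faulting vma; returns the original page whenever it can
 * be reused as is.
 */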
struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (anon_vma->root == vma->anon_vma->root &&
		 page->index == linear_page_index(vma, address)) {
		return page;		/* still no need to copy it */
	}
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__set_page_locked(new_page);
	}

	return new_page;
}

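/*
 * rmap helper for ksm pages: accumulate "referenced" over every mapping
 * listed from the stable node's hlist, including those in forked mms.
 */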
int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
			unsigned long *vm_flags)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	unsigned int mapcount = page_mapcount(page);
	int referenced = 0;
	int search_new_forks = 0;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return 0;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
				continue;

			referenced += page_referenced_one(page, vma,
				rmap_item->address, &mapcount, vm_flags);
			if (!search_new_forks || !mapcount)
				break;
		}
		anon_vma_unlock_read(anon_vma);
		if (!mapcount)
			goto out;
	}
	if (!search_new_forks++)
		goto again;
out:
	return referenced;
}

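/*
 * rmap helper for ksm pages: try to unmap the page from each vma that
 * maps it, so that it can be swapped out or migrated.
 */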
int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return SWAP_FAIL;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			ret = try_to_unmap_one(page, vma,
					rmap_item->address, flags);
			if (ret != SWAP_AGAIN || !page_mapped(page)) {
				anon_vma_unlock_read(anon_vma);
				goto out;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}

#ifdef CONFIG_MIGRATION
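/*
 * rmap walk for ksm pages, used by page migration: apply rmap_one() to
 * each mapping of the page until a call returns other than SWAP_AGAIN.
 */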
int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
		  struct vm_area_struct *, unsigned long, void *), void *arg)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;

	VM_BUG_ON(!PageKsm(page));
	VM_BUG_ON(!PageLocked(page));

	stable_node = page_stable_node(page);
	if (!stable_node)
		return ret;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			ret = rmap_one(page, vma, rmap_item->address, arg);
			if (ret != SWAP_AGAIN) {
				anon_vma_unlock_read(anon_vma);
				goto out;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}

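/*
 * Called at the end of page migration to point the stable_node at the
 * new page's pfn: the write ordering against get_ksm_page() is what the
 * smp_wmb() below is for.
 */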
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
	struct stable_node *stable_node;

	VM_BUG_ON(!PageLocked(oldpage));
	VM_BUG_ON(!PageLocked(newpage));
	VM_BUG_ON(newpage->mapping != oldpage->mapping);

	stable_node = page_stable_node(newpage);
	if (stable_node) {
		VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
		stable_node->kpfn = page_to_pfn(newpage);
		/*
		 * newpage->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that oldpage->mapping
		 * has gone stale (or that PageSwapCache has been cleared).
		 */
		smp_wmb();
		set_page_stable_node(oldpage, NULL);
	}
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
static int just_wait(void *word)
{
	schedule();
	return 0;
}

static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
				just_wait, TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}

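/*
 * After a memory block has gone offline, prune any stable_node whose
 * kpfn falls within the offlined range: its struct page is gone, and
 * get_ksm_page() must never be tempted to look at it again.
 */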
static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct stable_node *stable_node;
	struct list_head *this, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct stable_node, node);
			if (stable_node->kpfn >= start_pfn &&
			    stable_node->kpfn < end_pfn) {
				/*
				 * Don't get_ksm_page, page has already gone:
				 * which is why we keep kpfn instead of page*
				 */
				remove_node_from_stable_tree(stable_node);
				node = rb_first(root_stable_tree + nid);
			} else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_safe(this, next, &migrate_nodes) {
		stable_node = list_entry(this, struct stable_node, list);
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}

static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		/* fallthrough */

	case MEM_CANCEL_OFFLINE:
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
#else
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}

static ssize_t merge_across_nodes_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together:
			 * MAXSMP NODES_SHIFT 10 will use 16kB.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids,
				sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
			/* Let us assume that RB_ROOT is NULL is zero is NULL */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* Stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative: conceal that.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

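/*
 * Module init: create the slab caches, start ksmd (stopped until sysfs
 * says otherwise, unless there is no sysfs to control it), and register
 * the sysfs group and the memory-hotremove notifier.
 */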
static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		printk(KERN_ERR "ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		printk(KERN_ERR "ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
module_init(ksm_init)