1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/fs.h>
20#include <linux/mman.h>
21#include <linux/sched.h>
22#include <linux/rwsem.h>
23#include <linux/pagemap.h>
24#include <linux/rmap.h>
25#include <linux/spinlock.h>
26#include <linux/jhash.h>
27#include <linux/delay.h>
28#include <linux/kthread.h>
29#include <linux/wait.h>
30#include <linux/slab.h>
31#include <linux/rbtree.h>
32#include <linux/memory.h>
33#include <linux/mmu_notifier.h>
34#include <linux/swap.h>
35#include <linux/ksm.h>
36#include <linux/hashtable.h>
37#include <linux/freezer.h>
38#include <linux/oom.h>
39#include <linux/numa.h>
40
41#include <asm/tlbflush.h>
42#include "internal.h"
43
/*
 * On NUMA kernels, NUMA(x) passes through a node id and DO_NUMA(x)
 * executes a statement; on !NUMA builds both compile away (node id 0),
 * so callers need no #ifdefs around per-node bookkeeping.
 */
#ifdef CONFIG_NUMA
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash table (keyed by the mm pointer)
 * @mm_list: link into the list of mms to scan, headed at ksm_mm_head
 * @rmap_list: head of this mm's address-ordered list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct rmap_item *rmap_list;
	struct mm_struct *mm;
};
108
109
110
111
112
113
114
115
116
117
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that mm to be scanned
 * @rmap_list: link to the next rmap_item of this mm_slot's rmap_list
 * @seqnr: count of completed full scans (low bits also tag unstable
 *         tree entries; see SEQNR_MASK)
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item **rmap_list;
	unsigned long seqnr;
};
124
125
126
127
128
129
130
131
132
133
/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying @node) &migrate_nodes indicates it is on that list
 * @list: (overlaying @node) links the node into the migrate_nodes list
 * @hlist: hlist head of rmap_items sharing this ksm page
 * @kpfn: page frame number of the ksm page this node tracks
 * @nid: NUMA node id of the stable tree in which it is linked
 */
struct stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct list_head list;
		};
	};
	struct hlist_head hlist;
	unsigned long kpfn;
#ifdef CONFIG_NUMA
	int nid;
#endif
};
148
149
150
151
152
153
154
155
156
157
158
159
160
/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in the mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of the unstable tree in which linked (NUMA only;
 *       shares storage with @anon_vma, hence only one may be valid)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks, with the
 *           STABLE_FLAG/UNSTABLE_FLAG/seqnr bits packed in the low bits
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to the stable_node heading the stable-tree list
 * @hlist: link into the hlist of rmap_items hanging off that stable_node
 */
struct rmap_item {
	struct rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct stable_node *head;
			struct hlist_node hlist;
		};
	};
};
180
#define SEQNR_MASK	0x0ff	/* low bits of unstable tree scan seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/* The stable and unstable tree heads; one tree per NUMA node, or just one */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Stable nodes temporarily off the tree (e.g. after migration), to re-place */
static LIST_HEAD(migrate_nodes);

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* Head of the circular list of mm_slots to scan */
static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
/* The single, global scan cursor used by ksmd */
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* Number of stable nodes (first rmap_item appended to each) */
static unsigned long ksm_pages_shared;

/* Number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* Number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* Number of rmap_items currently allocated */
static unsigned long ksm_rmap_items;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

/* Create a slab cache named "ksm_<struct>" sized/aligned for that struct */
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
249
250static int __init ksm_slab_init(void)
251{
252 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
253 if (!rmap_item_cache)
254 goto out;
255
256 stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
257 if (!stable_node_cache)
258 goto out_free1;
259
260 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
261 if (!mm_slot_cache)
262 goto out_free2;
263
264 return 0;
265
266out_free2:
267 kmem_cache_destroy(stable_node_cache);
268out_free1:
269 kmem_cache_destroy(rmap_item_cache);
270out:
271 return -ENOMEM;
272}
273
274static void __init ksm_slab_free(void)
275{
276 kmem_cache_destroy(mm_slot_cache);
277 kmem_cache_destroy(stable_node_cache);
278 kmem_cache_destroy(rmap_item_cache);
279 mm_slot_cache = NULL;
280}
281
282static inline struct rmap_item *alloc_rmap_item(void)
283{
284 struct rmap_item *rmap_item;
285
286 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
287 if (rmap_item)
288 ksm_rmap_items++;
289 return rmap_item;
290}
291
/* Free an rmap_item and decrement the allocation count. */
static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety: catch use after free */
	kmem_cache_free(rmap_item_cache, rmap_item);
}
298
/*
 * Allocate a stable_node.  Note: not zeroed — callers must initialize
 * every field they rely on.
 */
static inline struct stable_node *alloc_stable_node(void)
{
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
}
303
/* Return a stable_node to its slab cache. */
static inline void free_stable_node(struct stable_node *stable_node)
{
	kmem_cache_free(stable_node_cache, stable_node);
}
308
309static inline struct mm_slot *alloc_mm_slot(void)
310{
311 if (!mm_slot_cache)
312 return NULL;
313 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
314}
315
/* Return an mm_slot to its slab cache. */
static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}
320
/*
 * Look up the mm_slot registered for @mm in mm_slots_hash (keyed by the
 * mm pointer value); returns NULL if the mm is not registered.
 */
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *slot;

	hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
		if (slot->mm == mm)
			return slot;

	return NULL;
}
331
/* Bind @mm_slot to @mm and insert it into mm_slots_hash. */
static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	mm_slot->mm = mm;
	hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
}
338
339
340
341
342
343
344
345
346
/*
 * ksm_test_exit - has this mm's last user gone (mm_users == 0)?
 *
 * ksmd and the unmerge paths hold mmap_sem and page references rather
 * than an mm_users reference, so they must check this and back off
 * instead of touching the vmas of an exiting mm.
 * NOTE(review): the precise exit protocol is defined together with
 * ksm_exit(), which lies outside this chunk — confirm against it.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}
351
352
353
354
355
356
357
358
359
360
361
362
/*
 * break_ksm - unmerge the page mapped at @addr by faking a write fault,
 * so a fresh anonymous copy (CoW) replaces the shared KSM page.
 *
 * Loops because the situation can change under us: FOLL_MIGRATION waits
 * for a page under migration, and a fault result other than the three
 * terminating bits means we should look again.
 *
 * Returns 0 on success (or nothing to do), -ENOMEM if the fault handler
 * ran out of memory.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
		if (IS_ERR_OR_NULL(page))
			break;	/* not mapped (any more): nothing to break */
		if (PageKsm(page))
			/* Fake a write fault to force copy-on-write */
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
							FAULT_FLAG_WRITE);
		else
			/* Already a private page: treat as broken */
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
	/*
	 * Terminate only once the fault really broke COW (VM_FAULT_WRITE)
	 * or failed irrecoverably (SIGBUS/OOM).  Of those, only OOM is
	 * reported to the caller; SIGBUS is treated like "page vanished".
	 * NOTE(review): the SIGBUS-is-OK convention is inferred from this
	 * return expression — confirm against the callers' expectations.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
410
411static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
412 unsigned long addr)
413{
414 struct vm_area_struct *vma;
415 if (ksm_test_exit(mm))
416 return NULL;
417 vma = find_vma(mm, addr);
418 if (!vma || vma->vm_start > addr)
419 return NULL;
420 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
421 return NULL;
422 return vma;
423}
424
/*
 * break_cow - undo a merge at this rmap_item's address: force a COW
 * break so the mm gets its own writable copy of the page again.
 */
static void break_cow(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma
	 * that was taken when this rmap_item was merged.
	 */
	put_anon_vma(rmap_item->anon_vma);

	down_read(&mm->mmap_sem);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr);
	up_read(&mm->mmap_sem);
}
443
444static struct page *page_trans_compound_anon(struct page *page)
445{
446 if (PageTransCompound(page)) {
447 struct page *head = compound_head(page);
448
449
450
451
452 if (PageAnon(head))
453 return head;
454 }
455 return NULL;
456}
457
458static struct page *get_mergeable_page(struct rmap_item *rmap_item)
459{
460 struct mm_struct *mm = rmap_item->mm;
461 unsigned long addr = rmap_item->address;
462 struct vm_area_struct *vma;
463 struct page *page;
464
465 down_read(&mm->mmap_sem);
466 vma = find_mergeable_vma(mm, addr);
467 if (!vma)
468 goto out;
469
470 page = follow_page(vma, addr, FOLL_GET);
471 if (IS_ERR_OR_NULL(page))
472 goto out;
473 if (PageAnon(page) || page_trans_compound_anon(page)) {
474 flush_anon_page(vma, page, addr);
475 flush_dcache_page(page);
476 } else {
477 put_page(page);
478out: page = NULL;
479 }
480 up_read(&mm->mmap_sem);
481 return page;
482}
483
484
485
486
487
488
489
/*
 * Return which stable/unstable tree nid a kernel pfn belongs in:
 * always 0 when merging across nodes (one global tree), otherwise the
 * pfn's own NUMA node.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
494
/*
 * Detach @stable_node from the stable tree (or the migrate_nodes list)
 * and free it, fixing the shared/sharing statistics and dropping the
 * anon_vma reference of every rmap_item still hanging off it.
 */
static void remove_node_from_stable_tree(struct stable_node *stable_node)
{
	struct rmap_item *rmap_item;

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		/* the last entry counts as "shared", the rest as "sharing" */
		if (rmap_item->hlist.next)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;	/* clear STABLE_FLAG */
		cond_resched();
	}

	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		rb_erase(&stable_node->node,
			 root_stable_tree + NUMA(stable_node->nid));
	free_stable_node(stable_node);
}
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
/*
 * get_ksm_page - get a reference on the ksm page which @stable_node
 * tracks, or NULL if the page is gone (removing the stale node).
 * @lock_it: if true, also return with the page locked.
 *
 * The page can be freed, reused or migrated at any moment, so this
 * works optimistically: read stable_node->kpfn, then validate that the
 * page's ->mapping still points back at this stable_node before (and
 * again after) taking the reference.
 */
static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	/* A ksm page's ->mapping encodes its stable_node with these bits */
	expected_mapping = (void *)stable_node +
				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
again:
	kpfn = ACCESS_ONCE(stable_node->kpfn);
	page = pfn_to_page(kpfn);

	/*
	 * Order the ->mapping read against the kpfn read above, pairing
	 * with the writer that updates kpfn on migration; otherwise we
	 * could check the mapping of a page the node no longer tracks.
	 */
	smp_read_barrier_depends();
	if (ACCESS_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * The page may be in the middle of being freed: its refcount can
	 * already be zero while ->mapping still matches.  Spin briefly
	 * rather than declaring it stale straight away...
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * ...but only while it is still in swapcache; once it is
		 * not, a matching ->mapping with zero refcount must mean
		 * the node is stale.
		 * NOTE(review): relies on the page-free path's ordering
		 * of clearing swapcache vs ->mapping — confirm.
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	/* Recheck after taking our reference: the page may have been reused */
	if (ACCESS_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (lock_it) {
		lock_page(page);
		/* and once more under the page lock */
		if (ACCESS_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * kpfn may have changed under us (page migrated while we looked):
	 * re-read it, with a read barrier against the writer's update
	 * order, and retry before concluding the node really is stale.
	 */
	smp_rmb();
	if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}
608
609
610
611
612
/*
 * Remove @rmap_item from whichever tree currently holds it — stable or
 * unstable, as flagged in the low bits of ->address — fixing statistics,
 * dropping references, and clearing the flag bits.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, true);
		if (!page)
			goto out;	/* stale node already freed for us */

		/* unhook under the page lock, then release the page */
		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		/* node still has entries => we were a "sharing" slot */
		if (stable_node->hlist.first)
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * The unstable tree is rebuilt on every full scan, so the
		 * rb node is only really linked if this item was inserted
		 * during the current pass.  The seqnr stored in the low
		 * address bits tells us: age 0 = this pass (still linked,
		 * must rb_erase), age 1 = previous pass (tree was reset).
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}
656
657static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
658 struct rmap_item **rmap_list)
659{
660 while (*rmap_list) {
661 struct rmap_item *rmap_item = *rmap_list;
662 *rmap_list = rmap_item->rmap_list;
663 remove_rmap_item_from_tree(rmap_item);
664 free_rmap_item(rmap_item);
665 }
666}
667
668
669
670
671
672
673
674
675
676
677
678
679
680
/*
 * unmerge_ksm_pages - break COW on every page in [start, end) of @vma.
 *
 * Stops early if the mm is exiting.  Returns -ERESTARTSYS if a signal
 * arrives (callers are user-triggered), otherwise the first error from
 * break_ksm() (e.g. -ENOMEM), otherwise 0.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}
697
698#ifdef CONFIG_SYSFS
699
700
701
/*
 * remove_stable_node - detach @stable_node and its ksm page from the
 * stable tree, provided the page is no longer mapped anywhere.
 * Used by the unmerge path (CONFIG_SYSFS).  Returns 0 on success or if
 * the node was already stale, -EBUSY if the page is still mapped.
 */
static int remove_stable_node(struct stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, true);
	if (!page) {
		/* get_ksm_page did remove_node_from_stable_tree itself */
		return 0;
	}

	if (WARN_ON_ONCE(page_mapped(page))) {
		/*
		 * Shouldn't happen on this path; if ptes still map the
		 * page we cannot safely strip its ksm identity.
		 */
		err = -EBUSY;
	} else {
		/*
		 * The node has not gone stale yet, but the page is not
		 * mapped, so nothing can newly write it.  Clearing the
		 * page's stable_node link turns it back into an ordinary
		 * page which can then be freed or reused.
		 * NOTE(review): the exact freeing interplay is inferred —
		 * confirm against page reclaim's handling of ksm pages.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}
739
/*
 * remove_all_stable_nodes - drain every per-nid stable tree and the
 * migrate_nodes list.  Returns -EBUSY if any node could not be removed
 * (its page was still mapped), 0 on complete success.
 */
static int remove_all_stable_nodes(void)
{
	struct stable_node *stable_node;
	struct list_head *this, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		/* keep taking the root: each removal reshapes the tree */
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct stable_node, node);
			if (remove_stable_node(stable_node)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_safe(this, next, &migrate_nodes) {
		stable_node = list_entry(this, struct stable_node, list);
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}
766
/*
 * unmerge_and_remove_all_rmap_items - the KSM_RUN_UNMERGE operation:
 * walk every registered mm, break COW across all its mergeable vmas,
 * free its rmap_items, then empty the stable trees.  On error the scan
 * cursor is parked back at the head so the operation can be retried.
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	/* start from the first real mm_slot after the list head */
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);

		/* advance the cursor before possibly freeing this slot */
		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			/* mm is exiting: do the unregistration for it */
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
826#endif
827
828static u32 calc_checksum(struct page *page)
829{
830 u32 checksum;
831 void *addr = kmap_atomic(page);
832 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
833 kunmap_atomic(addr);
834 return checksum;
835}
836
/*
 * Compare the full contents of two pages via short-lived atomic kmaps;
 * memcmp()-style return value.
 */
static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);	/* unmap in reverse order of mapping */
	kunmap_atomic(addr1);
	return ret;
}
849
/* True iff the two pages have byte-identical contents. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
	return memcmp_pages(page1, page2) == 0;
}
854
/*
 * write_protect_page - make the pte mapping @page in @vma clean and
 * write-protected, saving the original pte value in *orig_pte.
 *
 * Fails (-EFAULT) if the page is no longer mapped there, or if extra
 * references mean someone else could still write to it.  Caller holds
 * the page lock (see try_to_merge_one_page).
 */
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmun_start = addr;
	mmun_end = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out_mn;

	if (pte_write(*ptep) || pte_dirty(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Clear the pte and flush the TLB *before* checking the
		 * page counts below: a concurrent writer going through a
		 * stale TLB entry could otherwise dirty the page after
		 * we have compared/checksummed it.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Our single pte (+1) plus swapcache must account for all
		 * references; anything extra (e.g. get_user_pages /
		 * O_DIRECT) could still write the page — restore the pte
		 * and give up.
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, addr, ptep, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}
919
920
921
922
923
924
925
926
927
928
/*
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte, as saved by
 *            write_protect_page(); used to detect concurrent changes
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;
	BUG_ON(pmd_trans_huge(*pmd));

	mmun_start = addr;
	mmun_end = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	/* the pte changed since write_protect_page(): bail out */
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	/* take a reference and rmap entry for kpage before mapping it */
	get_page(kpage);
	page_add_anon_rmap(kpage, vma, addr);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));

	/* drop the old page's rmap entry and our reference */
	page_remove_rmap(page);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}
979
/*
 * If @page is part of an anonymous transparent hugepage, try to split
 * that hugepage so KSM can work on its base pages.  Returns 0 if the
 * page is now (or was already) not compound, nonzero on failure.
 */
static int page_trans_compound_anon_split(struct page *page)
{
	int ret = 0;
	struct page *transhuge_head = page_trans_compound_anon(page);
	if (transhuge_head) {
		/* Get the reference on the head to split it */
		if (get_page_unless_zero(transhuge_head)) {
			/*
			 * Recheck we got the reference while the head
			 * was still anonymous.
			 */
			if (PageAnon(transhuge_head))
				ret = split_huge_page(transhuge_head);
			else
				/*
				 * Retry later if split_huge_page run
				 * from under us.
				 */
				ret = 1;
			put_page(transhuge_head);
		} else
			/* Retry later if split_huge_page run from under us */
			ret = 1;
	}
	return ret;
}
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time, when page itself is to become the
 *         ksm page (it is then only write-protected, not replaced)
 *
 * Returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!(vma->vm_flags & VM_MERGEABLE))
		goto out;
	if (PageTransCompound(page) && page_trans_compound_anon_split(page))
		goto out;
	BUG_ON(PageTransCompound(page));
	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock so write_protect_page() sees a stable
	 * PageSwapCache.  trylock rather than lock: ksmd must not stall
	 * on a page someone else holds locked (e.g. under migration);
	 * cheaper to just skip this page for now.
	 * NOTE(review): rationale partly inferred — confirm.
	 */
	if (!trylock_page(page))
		goto out;
	/*
	 * Write-protect page's pte (a no-op if already write-protected
	 * because it is mapped elsewhere too); fails if extra references
	 * mean someone else could still write to the page.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		/* keep mlock accounting on the surviving ksm page */
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

	unlock_page(page);
out:
	return err;
}
1076
1077
1078
1079
1080
1081
1082
/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages, but no new
 * page is set up: @kpage must already be a ksm page (or NULL to turn
 * @page itself into one).
 *
 * Returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, rmap_item->address);
	if (!vma || vma->vm_start > rmap_item->address)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_sem */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	up_read(&mm->mmap_sem);
	return err;
}
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page: @page becomes the ksm page, and
 * @tree_page is remapped onto it.
 *
 * Returns the ksm page on success, NULL otherwise.
 */
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
					   struct page *page,
					   struct rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	/* first merge with kpage==NULL write-protects page as the ksm page */
	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If the second merge failed, undo the first: break COW
		 * so page does not stay needlessly write-protected.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
/*
 * stable_tree_search - search for page inside the stable tree
 *
 * Checks whether a page with content identical to @page already exists
 * in the stable tree.  Returns that page with a reference taken, or
 * NULL.  Also re-places @page's own stable node if it had been parked
 * on migrate_nodes (after migration), possibly substituting it for a
 * node whose nid no longer matches.
 */
static struct page *stable_tree_search(struct page *page)
{
	int nid;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct stable_node *stable_node;
	struct stable_node *page_node;

	page_node = page_stable_node(page);
	if (page_node && page_node->head != &migrate_nodes) {
		/* ksm page forked: it is already in the stable tree */
		get_page(page);
		return page;
	}

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_stable_tree + nid;
again:
	new = &root->rb_node;
	parent = NULL;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		/* get_ksm_page prunes the node itself if it went stale */
		tree_page = get_ksm_page(stable_node, false);
		if (!tree_page)
			return NULL;

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * Found a match: this time take the page locked,
			 * to rule out racing changes.  If the node's nid
			 * no longer matches its pfn (e.g. the page was
			 * migrated), replace the node rather than return.
			 */
			tree_page = get_ksm_page(stable_node, true);
			if (tree_page) {
				unlock_page(tree_page);
				if (get_kpfn_nid(stable_node->kpfn) !=
						NUMA(stable_node->nid)) {
					put_page(tree_page);
					goto replace;
				}
				return tree_page;
			}
			/*
			 * The matching node went stale and was removed,
			 * changing the tree shape: restart the walk if we
			 * still have a page_node to (re)insert.
			 */
			if (page_node)
				goto again;
			return NULL;
		}
	}

	if (!page_node)
		return NULL;

	/*
	 * @page is itself a ksm page parked on migrate_nodes, and no
	 * duplicate content was found: link its node into this nid's
	 * tree at the insertion point we just walked to.
	 */
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	rb_link_node(&page_node->node, parent, new);
	rb_insert_color(&page_node->node, root);
	get_page(page);
	return page;

replace:
	if (page_node) {
		/* substitute our page's node for the mis-placed one */
		list_del(&page_node->list);
		DO_NUMA(page_node->nid = nid);
		rb_replace_node(&stable_node->node, &page_node->node, root);
		get_page(page);
	} else {
		rb_erase(&stable_node->node, root);
		page = NULL;
	}
	/* park the displaced node on migrate_nodes for later pruning */
	stable_node->head = &migrate_nodes;
	list_add(&stable_node->list, stable_node->head);
	return page;
}
1244
1245
1246
1247
1248
1249
1250
1251
/*
 * stable_tree_insert - insert a stable tree node pointing to the new
 * ksm page @kpage into the stable tree.
 *
 * Returns the stable_node just allocated on success, NULL otherwise.
 */
static struct stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct stable_node *stable_node;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct stable_node, node);
		/* get_ksm_page prunes the node itself if it went stale */
		tree_page = get_ksm_page(stable_node, false);
		if (!tree_page)
			return NULL;

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * An identical page appeared in the tree since
			 * the earlier stable_tree_search() (contents can
			 * change at any moment).  Don't insert a
			 * duplicate node: just fail this merge attempt.
			 * NOTE(review): rationale inferred — confirm.
			 */
			return NULL;
		}
	}

	stable_node = alloc_stable_node();
	if (!stable_node)
		return NULL;

	INIT_HLIST_HEAD(&stable_node->hlist);
	stable_node->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node);
	DO_NUMA(stable_node->nid = nid);
	rb_link_node(&stable_node->node, parent, new);
	rb_insert_color(&stable_node->node, root);

	return stable_node;
}
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
/*
 * unstable_tree_search_insert - search for an identical page, else
 * insert rmap_item into the unstable tree.
 *
 * Searches the unstable tree for a page identical to the one currently
 * being scanned; if none is found, inserts @rmap_item as a new node.
 * Searching and inserting are combined because they share the same
 * rbtree walk.
 *
 * Returns the rmap_item whose page was found identical (with a
 * reference on that page in *tree_pagep), NULL otherwise.
 */
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
{
	struct rb_node **new;
	struct rb_root *root;
	struct rb_node *parent = NULL;
	int nid;

	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_unstable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (IS_ERR_OR_NULL(tree_page))
			return NULL;

		/*
		 * Don't merge the page with itself (the same page can be
		 * reached through two rmap_items, e.g. after fork).
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else if (!ksm_merge_across_nodes &&
			   page_to_nid(tree_page) != nid) {
			/*
			 * Identical content, but tree_page now sits on a
			 * different NUMA node and cross-node merging is
			 * disabled: don't merge, and don't link our item
			 * either (each tree keeps only its own node's
			 * pages).
			 */
			put_page(tree_page);
			return NULL;
		} else {
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	/* no match: link rmap_item in, tagged with the current scan seqnr */
	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	DO_NUMA(rmap_item->nid = nid);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, root);

	ksm_pages_unshared++;
	return NULL;
}
1388
1389
1390
1391
1392
1393
/*
 * stable_tree_append - add an rmap_item to a stable node's hlist and
 * flag it STABLE.  The first item on a node counts as "shared"; every
 * further item counts as "sharing".
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct stable_node *stable_node)
{
	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;
}
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
/*
 * cmp_and_merge_page - first see if page can be merged into the stable
 * tree; if not, compare its checksum to the previous one and, if
 * unchanged, search the unstable tree for an identical page; if found,
 * merge the pair and promote them into the stable tree.
 *
 * @page: the page that we are searching an identical page for
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;

	stable_node = page_stable_node(page);
	if (stable_node) {
		/*
		 * A ksm page whose tree nid no longer matches its pfn's
		 * nid (after migration, or after merge_across_nodes was
		 * switched off) is parked on migrate_nodes; it gets
		 * re-placed by stable_tree_search() below.
		 */
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
			rb_erase(&stable_node->node,
				 root_stable_tree + NUMA(stable_node->nid));
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;	/* already attached to this stable node */
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		/* forked ksm page already in place: nothing to do */
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged: attach its
			 * rmap_item to the stable node, under the page
			 * lock (as the other stable_tree_append caller
			 * also does).
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage));
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the page's checksum changed since last time we saw it, the
	 * page is being written frequently: don't waste time inserting
	 * it into (or searching) the unstable tree — just remember the
	 * new checksum and move on.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were merged: insert a new node in the
			 * stable tree and attach both rmap_items to it.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node);
				stable_tree_append(rmap_item, stable_node);
			}
			unlock_page(kpage);

			/*
			 * If insertion failed, two addresses now point at
			 * the same page with write-protected ptes but no
			 * stable node: break COW on both so neither stays
			 * needlessly frozen.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		}
	}
}
1507
/*
 * get_next_rmap_item - find or allocate the rmap_item tracking @addr.
 *
 * The mm_slot's rmap_list is kept sorted by address; items for lower
 * addresses the scan has skipped past (their vma went away or became
 * unmergeable) are unlinked and freed on the way.  Returns NULL only on
 * allocation failure.
 */
static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct rmap_item **rmap_list,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}
1535
/*
 * scan_get_next_rmap_item - advance the global scan cursor and return
 * the rmap_item for the next anonymous page to examine, with *page
 * holding a reference on that page.  Returns NULL when a full pass over
 * every registered mm has completed (seqnr is then bumped).
 */
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;
	int nid;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		/*
		 * Starting a fresh pass: drain per-cpu pagevecs, whose
		 * raised page counts would otherwise make
		 * write_protect_page()'s count check fail for those
		 * pages.  NOTE(review): rationale inferred — confirm.
		 */
		lru_add_drain_all();

		/*
		 * Stale nodes on the stable tree itself get pruned in
		 * the course of stable tree lookups, but those parked on
		 * migrate_nodes would accumulate: prune them once per
		 * pass (get_ksm_page drops stale nodes as a side effect).
		 */
		if (!ksm_merge_across_nodes) {
			struct stable_node *stable_node;
			struct list_head *this, *next;
			struct page *page;

			list_for_each_safe(this, next, &migrate_nodes) {
				stable_node = list_entry(this,
						struct stable_node, list);
				page = get_ksm_page(stable_node, false);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		/* the unstable tree is rebuilt from scratch every pass */
		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we checked list_empty() above, the list may
		 * have emptied meanwhile: re-check under nothing at all,
		 * the head sentinel is stable.
		 */
		if (slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;	/* skip whole vma */

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (PageAnon(*page) ||
			    page_trans_compound_anon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				/* caller receives *page referenced */
				return rmap_item;
			}
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * address stayed 0: either the mm is exiting (reset just
		 * above) or it no longer has any mergeable vma at all.
		 * Either way, unregister the slot here — doing the work
		 * that ksm_exit() would otherwise do for an exiting mm.
		 */
		hash_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		spin_unlock(&ksm_mmlist_lock);
		up_read(&mm->mmap_sem);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;	/* one full pass over every mm completed */
	return NULL;
}
1689
1690
1691
1692
1693
1694static void ksm_do_scan(unsigned int scan_npages)
1695{
1696 struct rmap_item *rmap_item;
1697 struct page *uninitialized_var(page);
1698
1699 while (scan_npages-- && likely(!freezing(current))) {
1700 cond_resched();
1701 rmap_item = scan_get_next_rmap_item(&page);
1702 if (!rmap_item)
1703 return;
1704 cmp_and_merge_page(page, rmap_item);
1705 put_page(page);
1706 }
1707}
1708
1709static int ksmd_should_run(void)
1710{
1711 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
1712}
1713
/*
 * Main loop of the ksmd kernel thread: scan one batch of pages under
 * ksm_thread_mutex, then either sleep for the tunable interval or wait
 * until there is work (or until asked to stop).
 */
static int ksm_scan_thread(void *nothing)
{
	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		/* must not touch the stable tree while memory is offlining */
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			/* woken by __ksm_enter() or by a write to sysfs run */
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}
1738
/*
 * ksm_madvise - handle madvise(MADV_MERGEABLE / MADV_UNMERGEABLE) on a vma:
 * set or clear VM_MERGEABLE in *vm_flags, registering the mm with ksmd or
 * breaking COW on already-merged pages as needed.
 * Returns 0 on success (including "advice ignored"), or a negative errno.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
				 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
				 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
			return 0;		/* just ignore the advice */

#ifdef VM_SAO
		if (*vm_flags & VM_SAO)
			return 0;
#endif

		/* first MERGEABLE vma in this mm: register it with ksmd */
		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		/* break COW on any KSM pages already merged in this range */
		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
1785
/*
 * __ksm_enter - register an mm with ksmd (called on the first MERGEABLE
 * vma in the mm).  Takes a reference on the mm and wakes ksmd if it was
 * idle.  Returns 0, or -ENOMEM if the mm_slot cannot be allocated.
 */
int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP), insert just behind the
	 * scanning cursor, to let the area settle down a little; when fork
	 * is followed by immediate exec, we don't want ksmd to waste time
	 * setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: so insert at the list head, i.e. just behind ksm_mm_head.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
	else
		list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	atomic_inc(&mm->mm_count);	/* dropped by mmdrop() on unregister */

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}
1824
/*
 * __ksm_exit - unregister an exiting mm from ksmd.
 */
void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */
	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			hash_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			/* move behind the cursor so ksmd cleans it up */
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		/* serialize against a scanner holding mmap_sem for read */
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}
1862
/*
 * ksm_might_need_to_copy - called when faulting in a page (e.g. from swap)
 * that may be a KSM page or may have belonged to another anon_vma: return
 * the page itself when it can be reused directly, otherwise allocate and
 * return a private copy (locked, dirty, uptodate), or NULL if allocation
 * failed.
 */
struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (anon_vma->root == vma->anon_vma->root &&
		 page->index == linear_page_index(vma, address)) {
		return page;		/* still no need to copy it */
	}
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__set_page_locked(new_page);
	}

	return new_page;
}
1893
/*
 * rmap_walk_ksm - rmap walk for KSM pages: apply rwc->rmap_one() to every
 * vma, in every mm, in which this stable KSM page is mapped.  Returns
 * SWAP_AGAIN if the whole walk completed, or the first other value
 * returned by rwc->rmap_one().
 */
int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
	struct stable_node *stable_node;
	struct rmap_item *rmap_item;
	int ret = SWAP_AGAIN;
	int search_new_forks = 0;

	VM_BUG_ON_PAGE(!PageKsm(page), page);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	stable_node = page_stable_node(page);
	if (!stable_node)
		return ret;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		anon_vma_lock_read(anon_vma);
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			vma = vmac->vma;
			if (rmap_item->address < vma->vm_start ||
			    rmap_item->address >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			ret = rwc->rmap_one(page, vma,
					rmap_item->address, rwc->arg);
			if (ret != SWAP_AGAIN) {
				anon_vma_unlock_read(anon_vma);
				goto out;
			}
			if (rwc->done && rwc->done(page)) {
				anon_vma_unlock_read(anon_vma);
				goto out;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	/* second pass: visit the forked mms skipped above */
	if (!search_new_forks++)
		goto again;
out:
	return ret;
}
1955
1956#ifdef CONFIG_MIGRATION
/*
 * ksm_migrate_page - fix up KSM's stable tree after page migration:
 * transfer the stable_node's kpfn from oldpage to newpage.
 */
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
	struct stable_node *stable_node;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);

	stable_node = page_stable_node(newpage);
	if (stable_node) {
		VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
		stable_node->kpfn = page_to_pfn(newpage);
		/*
		 * newpage->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that oldpage->mapping
		 * has gone stale.
		 */
		smp_wmb();
		set_page_stable_node(oldpage, NULL);
	}
}
1979#endif
1980
1981#ifdef CONFIG_MEMORY_HOTREMOVE
/* wait_on_bit() action routine: nothing to do but give up the CPU */
static int just_wait(void *word)
{
	schedule();
	return 0;
}
1987
/*
 * Called with ksm_thread_mutex held: drop and retake the mutex while
 * KSM_RUN_OFFLINE is set (by ksm_memory_callback at MEM_GOING_OFFLINE),
 * so the caller cannot touch the stable tree during memory hotremove.
 */
static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
			    just_wait, TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}
1997
/*
 * Prune from every stable tree, and from the migrate_nodes list, any
 * stable_node whose kernel page lies in the pfn range being offlined.
 */
static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct stable_node *stable_node;
	struct list_head *this, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct stable_node, node);
			if (stable_node->kpfn >= start_pfn &&
			    stable_node->kpfn < end_pfn) {
				/*
				 * Don't get_ksm_page, page has already gone:
				 * which is why we keep kpfn instead of page*
				 */
				remove_node_from_stable_tree(stable_node);
				/* removal rebalances the tree: restart walk */
				node = rb_first(root_stable_tree + nid);
			} else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_safe(this, next, &migrate_nodes) {
		stable_node = list_entry(this, struct stable_node, list);
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}
2031
/*
 * Memory hotplug notifier: keep KSM's stable tree consistent across
 * a memory section going offline.
 */
static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		/* fallthrough */

	case MEM_CANCEL_OFFLINE:
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
2074#else
/* Without CONFIG_MEMORY_HOTREMOVE there is never an offlining to wait on */
static void wait_while_offlining(void)
{
}
2078#endif
2079
2080#ifdef CONFIG_SYSFS
2081
2082
2083
2084
/*
 * Helpers to declare KSM's sysfs attributes: read-only, or mode 0644
 * wired to the matching _name##_show / _name##_store handlers.
 */
#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)
2090
/* sysfs: milliseconds ksmd sleeps between scan batches */
static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}
2096
2097static ssize_t sleep_millisecs_store(struct kobject *kobj,
2098 struct kobj_attribute *attr,
2099 const char *buf, size_t count)
2100{
2101 unsigned long msecs;
2102 int err;
2103
2104 err = kstrtoul(buf, 10, &msecs);
2105 if (err || msecs > UINT_MAX)
2106 return -EINVAL;
2107
2108 ksm_thread_sleep_millisecs = msecs;
2109
2110 return count;
2111}
2112KSM_ATTR(sleep_millisecs);
2113
/* sysfs: number of pages ksmd scans per batch */
static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}
2119
2120static ssize_t pages_to_scan_store(struct kobject *kobj,
2121 struct kobj_attribute *attr,
2122 const char *buf, size_t count)
2123{
2124 int err;
2125 unsigned long nr_pages;
2126
2127 err = kstrtoul(buf, 10, &nr_pages);
2128 if (err || nr_pages > UINT_MAX)
2129 return -EINVAL;
2130
2131 ksm_thread_pages_to_scan = nr_pages;
2132
2133 return count;
2134}
2135KSM_ATTR(pages_to_scan);
2136
/* sysfs: current KSM_RUN_* state (0 stop, 1 merge, 2 unmerge) */
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%lu\n", ksm_run);
}
2142
/* sysfs: start, stop, or stop-and-unmerge the ksmd scanner. */
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = kstrtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the pages_shared (but leaves mm_slots
	 * on the list for when ksmd may be set running again).
	 */
	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			/* unmerging may need to COW, so allow OOM-killing us */
			set_current_oom_origin();
			err = unmerge_and_remove_all_rmap_items();
			clear_current_oom_origin();
			if (err) {
				/* roll back to stopped, report the error */
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
2183KSM_ATTR(run);
2184
2185#ifdef CONFIG_NUMA
/* sysfs: whether pages from different NUMA nodes may be merged */
static ssize_t merge_across_nodes_show(struct kobject *kobj,
				       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_merge_across_nodes);
}
2191
/*
 * sysfs: switch between one global pair of trees (merge_across_nodes=1)
 * and per-NUMA-node trees (merge_across_nodes=0).  Only allowed while
 * nothing is currently merged; returns -EBUSY otherwise.
 */
static ssize_t merge_across_nodes_store(struct kobject *kobj,
					struct kobj_attribute *attr,
					const char *buf, size_t count)
{
	int err;
	unsigned long knob;

	err = kstrtoul(buf, 10, &knob);
	if (err)
		return err;
	if (knob > 1)
		return -EINVAL;

	mutex_lock(&ksm_thread_mutex);
	wait_while_offlining();
	if (ksm_merge_across_nodes != knob) {
		/* cannot switch while any stable nodes remain */
		if (ksm_pages_shared || remove_all_stable_nodes())
			err = -EBUSY;
		else if (root_stable_tree == one_stable_tree) {
			struct rb_root *buf;
			/*
			 * This is the first time that we switch away from the
			 * default of merging across nodes: must now allocate
			 * a buffer to hold as many roots as may be needed.
			 * Allocate stable and unstable together.
			 */
			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
				      GFP_KERNEL);
			/* ksm_thread_mutex protects the pointer switch-over */
			if (!buf)
				err = -ENOMEM;
			else {
				root_stable_tree = buf;
				root_unstable_tree = buf + nr_node_ids;
				/* stable tree is empty but not the unstable */
				root_unstable_tree[0] = one_unstable_tree[0];
			}
		}
		if (!err) {
			ksm_merge_across_nodes = knob;
			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	return err ? err : count;
}
2240KSM_ATTR(merge_across_nodes);
2241#endif
2242
/* sysfs: number of shared KSM pages (stable tree nodes) in use */
static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
2248KSM_ATTR_RO(pages_shared);
2249
/* sysfs: number of ptes mapped into shared KSM pages (the saving) */
static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
2255KSM_ATTR_RO(pages_sharing);
2256
/* sysfs: number of unique pages checked but not currently merged */
static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
2262KSM_ATTR_RO(pages_unshared);
2263
2264static ssize_t pages_volatile_show(struct kobject *kobj,
2265 struct kobj_attribute *attr, char *buf)
2266{
2267 long ksm_pages_volatile;
2268
2269 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
2270 - ksm_pages_sharing - ksm_pages_unshared;
2271
2272
2273
2274
2275 if (ksm_pages_volatile < 0)
2276 ksm_pages_volatile = 0;
2277 return sprintf(buf, "%ld\n", ksm_pages_volatile);
2278}
2279KSM_ATTR_RO(pages_volatile);
2280
/* sysfs: number of completed full passes over all registered mms */
static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
2286KSM_ATTR_RO(full_scans);
2287
/* All KSM tunables and statistics, exported under /sys/kernel/mm/ksm/ */
static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
2307#endif
2308
/*
 * ksm_init - module initialization: set up slab caches, start the ksmd
 * thread, register the sysfs group and (if configured) the memory
 * hotplug notifier.
 */
static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		printk(KERN_ERR "ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		printk(KERN_ERR "ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* there is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
2348subsys_initcall(ksm_init);
2349