1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include <linux/errno.h>
18#include <linux/mm.h>
19#include <linux/fs.h>
20#include <linux/mman.h>
21#include <linux/sched.h>
22#include <linux/rwsem.h>
23#include <linux/pagemap.h>
24#include <linux/rmap.h>
25#include <linux/spinlock.h>
26#include <linux/jhash.h>
27#include <linux/delay.h>
28#include <linux/kthread.h>
29#include <linux/wait.h>
30#include <linux/slab.h>
31#include <linux/rbtree.h>
32#include <linux/mmu_notifier.h>
33#include <linux/swap.h>
34#include <linux/ksm.h>
35
36#include <asm/tlbflush.h>
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85struct mm_slot {
86 struct hlist_node link;
87 struct list_head mm_list;
88 struct list_head rmap_list;
89 struct mm_struct *mm;
90};
91
92
93
94
95
96
97
98
99
100
/*
 * struct ksm_scan - position of the scanner
 * @mm_slot:   slot of the mm currently being scanned (&ksm_mm_head when
 *             the cursor is parked on the list head)
 * @address:   next address to look at within that mm
 * @rmap_item: rmap_item most recently handed out for that mm; used as the
 *             cursor into the slot's rmap_list
 * @seqnr:     incremented each time the scan wraps back to the list head
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item *rmap_item;
	unsigned long seqnr;
};
107
108
109
110
111
112
113
114
115
116
117
118struct rmap_item {
119 struct list_head link;
120 struct mm_struct *mm;
121 unsigned long address;
122 union {
123 unsigned int oldchecksum;
124 struct rmap_item *next;
125 };
126 union {
127 struct rb_node node;
128 struct rmap_item *prev;
129 };
130};
131
/* State kept in the low bits of rmap_item->address. */
#define SEQNR_MASK	0x0ff	/* low byte of ksm_scan.seqnr at unstable insert */
#define NODE_FLAG	0x100	/* item's rb_node is linked into a tree */
#define STABLE_FLAG	0x200	/* item is a stable-tree node or chained to one */
135
136
137static struct rb_root root_stable_tree = RB_ROOT;
138static struct rb_root root_unstable_tree = RB_ROOT;
139
140#define MM_SLOTS_HASH_HEADS 1024
141static struct hlist_head *mm_slots_hash;
142
143static struct mm_slot ksm_mm_head = {
144 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
145};
146static struct ksm_scan ksm_scan = {
147 .mm_slot = &ksm_mm_head,
148};
149
150static struct kmem_cache *rmap_item_cache;
151static struct kmem_cache *mm_slot_cache;
152
153
/* Number of nodes in the stable tree (bumped by stable_tree_insert()). */
static unsigned long ksm_pages_shared;

/* Number of rmap_items chained onto stable-tree nodes. */
static unsigned long ksm_pages_sharing;

/* Number of nodes in the unstable tree. */
static unsigned long ksm_pages_unshared;

/* Number of rmap_items currently allocated. */
static unsigned long ksm_rmap_items;

/* Cap on ksm_pages_shared; 0 disables the check. */
static unsigned long ksm_max_kernel_pages;

/* Pages handed to ksm_do_scan() per batch. */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Sleep between batches, in milliseconds. */
static unsigned int ksm_thread_sleep_millisecs = 20;
173
/* Values accepted by the "run" attribute and stored in ksm_run. */
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
static unsigned int ksm_run = KSM_RUN_STOP;

/* ksmd sleeps here while it has nothing to do. */
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
/* Held around a scan batch and around run_store()'s state change. */
static DEFINE_MUTEX(ksm_thread_mutex);
/* Taken around updates to the mm_slot list, hash and scan cursor. */
static DEFINE_SPINLOCK(ksm_mmlist_lock);
182
/*
 * Create a slab cache named "ksm_<struct>" sized and aligned for
 * struct <struct>, with the given flags and no constructor.
 */
#define KSM_KMEM_CACHE(__struct, __flags)				\
	kmem_cache_create("ksm_"#__struct,				\
			  sizeof(struct __struct),			\
			  __alignof__(struct __struct),			\
			  (__flags), NULL)
186
187static int __init ksm_slab_init(void)
188{
189 rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
190 if (!rmap_item_cache)
191 goto out;
192
193 mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
194 if (!mm_slot_cache)
195 goto out_free;
196
197 return 0;
198
199out_free:
200 kmem_cache_destroy(rmap_item_cache);
201out:
202 return -ENOMEM;
203}
204
205static void __init ksm_slab_free(void)
206{
207 kmem_cache_destroy(mm_slot_cache);
208 kmem_cache_destroy(rmap_item_cache);
209 mm_slot_cache = NULL;
210}
211
212static inline struct rmap_item *alloc_rmap_item(void)
213{
214 struct rmap_item *rmap_item;
215
216 rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
217 if (rmap_item)
218 ksm_rmap_items++;
219 return rmap_item;
220}
221
222static inline void free_rmap_item(struct rmap_item *rmap_item)
223{
224 ksm_rmap_items--;
225 rmap_item->mm = NULL;
226 kmem_cache_free(rmap_item_cache, rmap_item);
227}
228
229static inline struct mm_slot *alloc_mm_slot(void)
230{
231 if (!mm_slot_cache)
232 return NULL;
233 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
234}
235
236static inline void free_mm_slot(struct mm_slot *mm_slot)
237{
238 kmem_cache_free(mm_slot_cache, mm_slot);
239}
240
241static int __init mm_slots_hash_init(void)
242{
243 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
244 GFP_KERNEL);
245 if (!mm_slots_hash)
246 return -ENOMEM;
247 return 0;
248}
249
250static void __init mm_slots_hash_free(void)
251{
252 kfree(mm_slots_hash);
253}
254
255static struct mm_slot *get_mm_slot(struct mm_struct *mm)
256{
257 struct mm_slot *mm_slot;
258 struct hlist_head *bucket;
259 struct hlist_node *node;
260
261 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
262 % MM_SLOTS_HASH_HEADS];
263 hlist_for_each_entry(mm_slot, node, bucket, link) {
264 if (mm == mm_slot->mm)
265 return mm_slot;
266 }
267 return NULL;
268}
269
270static void insert_to_mm_slots_hash(struct mm_struct *mm,
271 struct mm_slot *mm_slot)
272{
273 struct hlist_head *bucket;
274
275 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
276 % MM_SLOTS_HASH_HEADS];
277 mm_slot->mm = mm;
278 INIT_LIST_HEAD(&mm_slot->rmap_list);
279 hlist_add_head(&mm_slot->link, bucket);
280}
281
282static inline int in_stable_tree(struct rmap_item *rmap_item)
283{
284 return rmap_item->address & STABLE_FLAG;
285}
286
287
288
289
290
291
292
293
294
295static inline bool ksm_test_exit(struct mm_struct *mm)
296{
297 return atomic_read(&mm->mm_users) == 0;
298}
299
300
301
302
303
304
305
306
307
308
309
310
311static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
312{
313 struct page *page;
314 int ret = 0;
315
316 do {
317 cond_resched();
318 page = follow_page(vma, addr, FOLL_GET);
319 if (!page)
320 break;
321 if (PageKsm(page))
322 ret = handle_mm_fault(vma->vm_mm, vma, addr,
323 FAULT_FLAG_WRITE);
324 else
325 ret = VM_FAULT_WRITE;
326 put_page(page);
327 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
357}
358
359static void break_cow(struct mm_struct *mm, unsigned long addr)
360{
361 struct vm_area_struct *vma;
362
363 down_read(&mm->mmap_sem);
364 if (ksm_test_exit(mm))
365 goto out;
366 vma = find_vma(mm, addr);
367 if (!vma || vma->vm_start > addr)
368 goto out;
369 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
370 goto out;
371 break_ksm(vma, addr);
372out:
373 up_read(&mm->mmap_sem);
374}
375
376static struct page *get_mergeable_page(struct rmap_item *rmap_item)
377{
378 struct mm_struct *mm = rmap_item->mm;
379 unsigned long addr = rmap_item->address;
380 struct vm_area_struct *vma;
381 struct page *page;
382
383 down_read(&mm->mmap_sem);
384 if (ksm_test_exit(mm))
385 goto out;
386 vma = find_vma(mm, addr);
387 if (!vma || vma->vm_start > addr)
388 goto out;
389 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
390 goto out;
391
392 page = follow_page(vma, addr, FOLL_GET);
393 if (!page)
394 goto out;
395 if (PageAnon(page)) {
396 flush_anon_page(vma, page, addr);
397 flush_dcache_page(page);
398 } else {
399 put_page(page);
400out: page = NULL;
401 }
402 up_read(&mm->mmap_sem);
403 return page;
404}
405
406
407
408
409
410
411static struct page *get_ksm_page(struct rmap_item *rmap_item)
412{
413 struct page *page;
414
415 page = get_mergeable_page(rmap_item);
416 if (page && !PageKsm(page)) {
417 put_page(page);
418 page = NULL;
419 }
420 return page;
421}
422
423
424
425
426
427static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
428{
429 if (in_stable_tree(rmap_item)) {
430 struct rmap_item *next_item = rmap_item->next;
431
432 if (rmap_item->address & NODE_FLAG) {
433 if (next_item) {
434 rb_replace_node(&rmap_item->node,
435 &next_item->node,
436 &root_stable_tree);
437 next_item->address |= NODE_FLAG;
438 ksm_pages_sharing--;
439 } else {
440 rb_erase(&rmap_item->node, &root_stable_tree);
441 ksm_pages_shared--;
442 }
443 } else {
444 struct rmap_item *prev_item = rmap_item->prev;
445
446 BUG_ON(prev_item->next != rmap_item);
447 prev_item->next = next_item;
448 if (next_item) {
449 BUG_ON(next_item->prev != rmap_item);
450 next_item->prev = rmap_item->prev;
451 }
452 ksm_pages_sharing--;
453 }
454
455 rmap_item->next = NULL;
456
457 } else if (rmap_item->address & NODE_FLAG) {
458 unsigned char age;
459
460
461
462
463
464
465
466 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
467 BUG_ON(age > 1);
468 if (!age)
469 rb_erase(&rmap_item->node, &root_unstable_tree);
470 ksm_pages_unshared--;
471 }
472
473 rmap_item->address &= PAGE_MASK;
474
475 cond_resched();
476}
477
478static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
479 struct list_head *cur)
480{
481 struct rmap_item *rmap_item;
482
483 while (cur != &mm_slot->rmap_list) {
484 rmap_item = list_entry(cur, struct rmap_item, link);
485 cur = cur->next;
486 remove_rmap_item_from_tree(rmap_item);
487 list_del(&rmap_item->link);
488 free_rmap_item(rmap_item);
489 }
490}
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505static int unmerge_ksm_pages(struct vm_area_struct *vma,
506 unsigned long start, unsigned long end)
507{
508 unsigned long addr;
509 int err = 0;
510
511 for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
512 if (ksm_test_exit(vma->vm_mm))
513 break;
514 if (signal_pending(current))
515 err = -ERESTARTSYS;
516 else
517 err = break_ksm(vma, addr);
518 }
519 return err;
520}
521
522#ifdef CONFIG_SYSFS
523
524
525
526static int unmerge_and_remove_all_rmap_items(void)
527{
528 struct mm_slot *mm_slot;
529 struct mm_struct *mm;
530 struct vm_area_struct *vma;
531 int err = 0;
532
533 spin_lock(&ksm_mmlist_lock);
534 ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
535 struct mm_slot, mm_list);
536 spin_unlock(&ksm_mmlist_lock);
537
538 for (mm_slot = ksm_scan.mm_slot;
539 mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
540 mm = mm_slot->mm;
541 down_read(&mm->mmap_sem);
542 for (vma = mm->mmap; vma; vma = vma->vm_next) {
543 if (ksm_test_exit(mm))
544 break;
545 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
546 continue;
547 err = unmerge_ksm_pages(vma,
548 vma->vm_start, vma->vm_end);
549 if (err)
550 goto error;
551 }
552
553 remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
554
555 spin_lock(&ksm_mmlist_lock);
556 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
557 struct mm_slot, mm_list);
558 if (ksm_test_exit(mm)) {
559 hlist_del(&mm_slot->link);
560 list_del(&mm_slot->mm_list);
561 spin_unlock(&ksm_mmlist_lock);
562
563 free_mm_slot(mm_slot);
564 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
565 up_read(&mm->mmap_sem);
566 mmdrop(mm);
567 } else {
568 spin_unlock(&ksm_mmlist_lock);
569 up_read(&mm->mmap_sem);
570 }
571 }
572
573 ksm_scan.seqnr = 0;
574 return 0;
575
576error:
577 up_read(&mm->mmap_sem);
578 spin_lock(&ksm_mmlist_lock);
579 ksm_scan.mm_slot = &ksm_mm_head;
580 spin_unlock(&ksm_mmlist_lock);
581 return err;
582}
583#endif
584
585static u32 calc_checksum(struct page *page)
586{
587 u32 checksum;
588 void *addr = kmap_atomic(page, KM_USER0);
589 checksum = jhash2(addr, PAGE_SIZE / 4, 17);
590 kunmap_atomic(addr, KM_USER0);
591 return checksum;
592}
593
594static int memcmp_pages(struct page *page1, struct page *page2)
595{
596 char *addr1, *addr2;
597 int ret;
598
599 addr1 = kmap_atomic(page1, KM_USER0);
600 addr2 = kmap_atomic(page2, KM_USER1);
601 ret = memcmp(addr1, addr2, PAGE_SIZE);
602 kunmap_atomic(addr2, KM_USER1);
603 kunmap_atomic(addr1, KM_USER0);
604 return ret;
605}
606
/* Returns 1 if the two pages have byte-identical contents, else 0. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
	return memcmp_pages(page1, page2) == 0;
}
611
612static int write_protect_page(struct vm_area_struct *vma, struct page *page,
613 pte_t *orig_pte)
614{
615 struct mm_struct *mm = vma->vm_mm;
616 unsigned long addr;
617 pte_t *ptep;
618 spinlock_t *ptl;
619 int swapped;
620 int err = -EFAULT;
621
622 addr = page_address_in_vma(page, vma);
623 if (addr == -EFAULT)
624 goto out;
625
626 ptep = page_check_address(page, mm, addr, &ptl, 0);
627 if (!ptep)
628 goto out;
629
630 if (pte_write(*ptep)) {
631 pte_t entry;
632
633 swapped = PageSwapCache(page);
634 flush_cache_page(vma, addr, page_to_pfn(page));
635
636
637
638
639
640
641
642
643
644 entry = ptep_clear_flush(vma, addr, ptep);
645
646
647
648
649 if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
650 set_pte_at_notify(mm, addr, ptep, entry);
651 goto out_unlock;
652 }
653 entry = pte_wrprotect(entry);
654 set_pte_at_notify(mm, addr, ptep, entry);
655 }
656 *orig_pte = *ptep;
657 err = 0;
658
659out_unlock:
660 pte_unmap_unlock(ptep, ptl);
661out:
662 return err;
663}
664
665
666
667
668
669
670
671
672
673
674static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
675 struct page *newpage, pte_t orig_pte)
676{
677 struct mm_struct *mm = vma->vm_mm;
678 pgd_t *pgd;
679 pud_t *pud;
680 pmd_t *pmd;
681 pte_t *ptep;
682 spinlock_t *ptl;
683 unsigned long addr;
684 pgprot_t prot;
685 int err = -EFAULT;
686
687 prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
688
689 addr = page_address_in_vma(oldpage, vma);
690 if (addr == -EFAULT)
691 goto out;
692
693 pgd = pgd_offset(mm, addr);
694 if (!pgd_present(*pgd))
695 goto out;
696
697 pud = pud_offset(pgd, addr);
698 if (!pud_present(*pud))
699 goto out;
700
701 pmd = pmd_offset(pud, addr);
702 if (!pmd_present(*pmd))
703 goto out;
704
705 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
706 if (!pte_same(*ptep, orig_pte)) {
707 pte_unmap_unlock(ptep, ptl);
708 goto out;
709 }
710
711 get_page(newpage);
712 page_add_ksm_rmap(newpage);
713
714 flush_cache_page(vma, addr, pte_pfn(*ptep));
715 ptep_clear_flush(vma, addr, ptep);
716 set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
717
718 page_remove_rmap(oldpage);
719 put_page(oldpage);
720
721 pte_unmap_unlock(ptep, ptl);
722 err = 0;
723out:
724 return err;
725}
726
727
728
729
730
731
732
733
734
735
736
737
738
739static int try_to_merge_one_page(struct vm_area_struct *vma,
740 struct page *oldpage,
741 struct page *newpage)
742{
743 pte_t orig_pte = __pte(0);
744 int err = -EFAULT;
745
746 if (!(vma->vm_flags & VM_MERGEABLE))
747 goto out;
748
749 if (!PageAnon(oldpage))
750 goto out;
751
752 get_page(newpage);
753 get_page(oldpage);
754
755
756
757
758
759
760
761
762 if (!trylock_page(oldpage))
763 goto out_putpage;
764
765
766
767
768
769
770 if (write_protect_page(vma, oldpage, &orig_pte)) {
771 unlock_page(oldpage);
772 goto out_putpage;
773 }
774 unlock_page(oldpage);
775
776 if (pages_identical(oldpage, newpage))
777 err = replace_page(vma, oldpage, newpage, orig_pte);
778
779out_putpage:
780 put_page(oldpage);
781 put_page(newpage);
782out:
783 return err;
784}
785
786
787
788
789
790static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
791 unsigned long addr1,
792 struct page *page1,
793 struct page *kpage)
794{
795 struct vm_area_struct *vma;
796 int err = -EFAULT;
797
798 down_read(&mm1->mmap_sem);
799 if (ksm_test_exit(mm1))
800 goto out;
801
802 vma = find_vma(mm1, addr1);
803 if (!vma || vma->vm_start > addr1)
804 goto out;
805
806 err = try_to_merge_one_page(vma, page1, kpage);
807out:
808 up_read(&mm1->mmap_sem);
809 return err;
810}
811
812
813
814
815
816
817
818
819
820
821
822static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
823 struct page *page1, struct mm_struct *mm2,
824 unsigned long addr2, struct page *page2)
825{
826 struct vm_area_struct *vma;
827 struct page *kpage;
828 int err = -EFAULT;
829
830
831
832
833
834 if (ksm_max_kernel_pages &&
835 ksm_max_kernel_pages <= ksm_pages_shared)
836 return err;
837
838 kpage = alloc_page(GFP_HIGHUSER);
839 if (!kpage)
840 return err;
841
842 down_read(&mm1->mmap_sem);
843 if (ksm_test_exit(mm1)) {
844 up_read(&mm1->mmap_sem);
845 goto out;
846 }
847 vma = find_vma(mm1, addr1);
848 if (!vma || vma->vm_start > addr1) {
849 up_read(&mm1->mmap_sem);
850 goto out;
851 }
852
853 copy_user_highpage(kpage, page1, addr1, vma);
854 err = try_to_merge_one_page(vma, page1, kpage);
855 up_read(&mm1->mmap_sem);
856
857 if (!err) {
858 err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
859
860
861
862
863 if (err)
864 break_cow(mm1, addr1);
865 }
866out:
867 put_page(kpage);
868 return err;
869}
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884static struct rmap_item *stable_tree_search(struct page *page,
885 struct page **page2,
886 struct rmap_item *rmap_item)
887{
888 struct rb_node *node = root_stable_tree.rb_node;
889
890 while (node) {
891 struct rmap_item *tree_rmap_item, *next_rmap_item;
892 int ret;
893
894 tree_rmap_item = rb_entry(node, struct rmap_item, node);
895 while (tree_rmap_item) {
896 BUG_ON(!in_stable_tree(tree_rmap_item));
897 cond_resched();
898 page2[0] = get_ksm_page(tree_rmap_item);
899 if (page2[0])
900 break;
901 next_rmap_item = tree_rmap_item->next;
902 remove_rmap_item_from_tree(tree_rmap_item);
903 tree_rmap_item = next_rmap_item;
904 }
905 if (!tree_rmap_item)
906 return NULL;
907
908 ret = memcmp_pages(page, page2[0]);
909
910 if (ret < 0) {
911 put_page(page2[0]);
912 node = node->rb_left;
913 } else if (ret > 0) {
914 put_page(page2[0]);
915 node = node->rb_right;
916 } else {
917 return tree_rmap_item;
918 }
919 }
920
921 return NULL;
922}
923
924
925
926
927
928
929
930
931
932
933
934static struct rmap_item *stable_tree_insert(struct page *page,
935 struct rmap_item *rmap_item)
936{
937 struct rb_node **new = &root_stable_tree.rb_node;
938 struct rb_node *parent = NULL;
939
940 while (*new) {
941 struct rmap_item *tree_rmap_item, *next_rmap_item;
942 struct page *tree_page;
943 int ret;
944
945 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
946 while (tree_rmap_item) {
947 BUG_ON(!in_stable_tree(tree_rmap_item));
948 cond_resched();
949 tree_page = get_ksm_page(tree_rmap_item);
950 if (tree_page)
951 break;
952 next_rmap_item = tree_rmap_item->next;
953 remove_rmap_item_from_tree(tree_rmap_item);
954 tree_rmap_item = next_rmap_item;
955 }
956 if (!tree_rmap_item)
957 return NULL;
958
959 ret = memcmp_pages(page, tree_page);
960 put_page(tree_page);
961
962 parent = *new;
963 if (ret < 0)
964 new = &parent->rb_left;
965 else if (ret > 0)
966 new = &parent->rb_right;
967 else {
968
969
970
971
972
973 return NULL;
974 }
975 }
976
977 rmap_item->address |= NODE_FLAG | STABLE_FLAG;
978 rmap_item->next = NULL;
979 rb_link_node(&rmap_item->node, parent, new);
980 rb_insert_color(&rmap_item->node, &root_stable_tree);
981
982 ksm_pages_shared++;
983 return rmap_item;
984}
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004static struct rmap_item *unstable_tree_search_insert(struct page *page,
1005 struct page **page2,
1006 struct rmap_item *rmap_item)
1007{
1008 struct rb_node **new = &root_unstable_tree.rb_node;
1009 struct rb_node *parent = NULL;
1010
1011 while (*new) {
1012 struct rmap_item *tree_rmap_item;
1013 int ret;
1014
1015 cond_resched();
1016 tree_rmap_item = rb_entry(*new, struct rmap_item, node);
1017 page2[0] = get_mergeable_page(tree_rmap_item);
1018 if (!page2[0])
1019 return NULL;
1020
1021
1022
1023
1024
1025 if (page == page2[0]) {
1026 put_page(page2[0]);
1027 return NULL;
1028 }
1029
1030 ret = memcmp_pages(page, page2[0]);
1031
1032 parent = *new;
1033 if (ret < 0) {
1034 put_page(page2[0]);
1035 new = &parent->rb_left;
1036 } else if (ret > 0) {
1037 put_page(page2[0]);
1038 new = &parent->rb_right;
1039 } else {
1040 return tree_rmap_item;
1041 }
1042 }
1043
1044 rmap_item->address |= NODE_FLAG;
1045 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1046 rb_link_node(&rmap_item->node, parent, new);
1047 rb_insert_color(&rmap_item->node, &root_unstable_tree);
1048
1049 ksm_pages_unshared++;
1050 return NULL;
1051}
1052
1053
1054
1055
1056
1057
1058static void stable_tree_append(struct rmap_item *rmap_item,
1059 struct rmap_item *tree_rmap_item)
1060{
1061 rmap_item->next = tree_rmap_item->next;
1062 rmap_item->prev = tree_rmap_item;
1063
1064 if (tree_rmap_item->next)
1065 tree_rmap_item->next->prev = rmap_item;
1066
1067 tree_rmap_item->next = rmap_item;
1068 rmap_item->address |= STABLE_FLAG;
1069
1070 ksm_pages_sharing++;
1071}
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1083{
1084 struct page *page2[1];
1085 struct rmap_item *tree_rmap_item;
1086 unsigned int checksum;
1087 int err;
1088
1089 if (in_stable_tree(rmap_item))
1090 remove_rmap_item_from_tree(rmap_item);
1091
1092
1093 tree_rmap_item = stable_tree_search(page, page2, rmap_item);
1094 if (tree_rmap_item) {
1095 if (page == page2[0])
1096 err = 0;
1097 else
1098 err = try_to_merge_with_ksm_page(rmap_item->mm,
1099 rmap_item->address,
1100 page, page2[0]);
1101 put_page(page2[0]);
1102
1103 if (!err) {
1104
1105
1106
1107
1108 stable_tree_append(rmap_item, tree_rmap_item);
1109 }
1110 return;
1111 }
1112
1113
1114
1115
1116
1117
1118
1119 if (PageKsm(page))
1120 break_cow(rmap_item->mm, rmap_item->address);
1121
1122
1123
1124
1125
1126
1127
1128 checksum = calc_checksum(page);
1129 if (rmap_item->oldchecksum != checksum) {
1130 rmap_item->oldchecksum = checksum;
1131 return;
1132 }
1133
1134 tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
1135 if (tree_rmap_item) {
1136 err = try_to_merge_two_pages(rmap_item->mm,
1137 rmap_item->address, page,
1138 tree_rmap_item->mm,
1139 tree_rmap_item->address, page2[0]);
1140
1141
1142
1143
1144
1145 if (!err) {
1146 rb_erase(&tree_rmap_item->node, &root_unstable_tree);
1147 tree_rmap_item->address &= ~NODE_FLAG;
1148 ksm_pages_unshared--;
1149
1150
1151
1152
1153
1154
1155
1156 if (stable_tree_insert(page2[0], tree_rmap_item))
1157 stable_tree_append(rmap_item, tree_rmap_item);
1158 else {
1159 break_cow(tree_rmap_item->mm,
1160 tree_rmap_item->address);
1161 break_cow(rmap_item->mm, rmap_item->address);
1162 }
1163 }
1164
1165 put_page(page2[0]);
1166 }
1167}
1168
1169static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
1170 struct list_head *cur,
1171 unsigned long addr)
1172{
1173 struct rmap_item *rmap_item;
1174
1175 while (cur != &mm_slot->rmap_list) {
1176 rmap_item = list_entry(cur, struct rmap_item, link);
1177 if ((rmap_item->address & PAGE_MASK) == addr) {
1178 if (!in_stable_tree(rmap_item))
1179 remove_rmap_item_from_tree(rmap_item);
1180 return rmap_item;
1181 }
1182 if (rmap_item->address > addr)
1183 break;
1184 cur = cur->next;
1185 remove_rmap_item_from_tree(rmap_item);
1186 list_del(&rmap_item->link);
1187 free_rmap_item(rmap_item);
1188 }
1189
1190 rmap_item = alloc_rmap_item();
1191 if (rmap_item) {
1192
1193 rmap_item->mm = mm_slot->mm;
1194 rmap_item->address = addr;
1195 list_add_tail(&rmap_item->link, cur);
1196 }
1197 return rmap_item;
1198}
1199
1200static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1201{
1202 struct mm_struct *mm;
1203 struct mm_slot *slot;
1204 struct vm_area_struct *vma;
1205 struct rmap_item *rmap_item;
1206
1207 if (list_empty(&ksm_mm_head.mm_list))
1208 return NULL;
1209
1210 slot = ksm_scan.mm_slot;
1211 if (slot == &ksm_mm_head) {
1212 root_unstable_tree = RB_ROOT;
1213
1214 spin_lock(&ksm_mmlist_lock);
1215 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1216 ksm_scan.mm_slot = slot;
1217 spin_unlock(&ksm_mmlist_lock);
1218next_mm:
1219 ksm_scan.address = 0;
1220 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1221 struct rmap_item, link);
1222 }
1223
1224 mm = slot->mm;
1225 down_read(&mm->mmap_sem);
1226 if (ksm_test_exit(mm))
1227 vma = NULL;
1228 else
1229 vma = find_vma(mm, ksm_scan.address);
1230
1231 for (; vma; vma = vma->vm_next) {
1232 if (!(vma->vm_flags & VM_MERGEABLE))
1233 continue;
1234 if (ksm_scan.address < vma->vm_start)
1235 ksm_scan.address = vma->vm_start;
1236 if (!vma->anon_vma)
1237 ksm_scan.address = vma->vm_end;
1238
1239 while (ksm_scan.address < vma->vm_end) {
1240 if (ksm_test_exit(mm))
1241 break;
1242 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1243 if (*page && PageAnon(*page)) {
1244 flush_anon_page(vma, *page, ksm_scan.address);
1245 flush_dcache_page(*page);
1246 rmap_item = get_next_rmap_item(slot,
1247 ksm_scan.rmap_item->link.next,
1248 ksm_scan.address);
1249 if (rmap_item) {
1250 ksm_scan.rmap_item = rmap_item;
1251 ksm_scan.address += PAGE_SIZE;
1252 } else
1253 put_page(*page);
1254 up_read(&mm->mmap_sem);
1255 return rmap_item;
1256 }
1257 if (*page)
1258 put_page(*page);
1259 ksm_scan.address += PAGE_SIZE;
1260 cond_resched();
1261 }
1262 }
1263
1264 if (ksm_test_exit(mm)) {
1265 ksm_scan.address = 0;
1266 ksm_scan.rmap_item = list_entry(&slot->rmap_list,
1267 struct rmap_item, link);
1268 }
1269
1270
1271
1272
1273 remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
1274
1275 spin_lock(&ksm_mmlist_lock);
1276 ksm_scan.mm_slot = list_entry(slot->mm_list.next,
1277 struct mm_slot, mm_list);
1278 if (ksm_scan.address == 0) {
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288 hlist_del(&slot->link);
1289 list_del(&slot->mm_list);
1290 spin_unlock(&ksm_mmlist_lock);
1291
1292 free_mm_slot(slot);
1293 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1294 up_read(&mm->mmap_sem);
1295 mmdrop(mm);
1296 } else {
1297 spin_unlock(&ksm_mmlist_lock);
1298 up_read(&mm->mmap_sem);
1299 }
1300
1301
1302 slot = ksm_scan.mm_slot;
1303 if (slot != &ksm_mm_head)
1304 goto next_mm;
1305
1306 ksm_scan.seqnr++;
1307 return NULL;
1308}
1309
1310
1311
1312
1313
1314static void ksm_do_scan(unsigned int scan_npages)
1315{
1316 struct rmap_item *rmap_item;
1317 struct page *page;
1318
1319 while (scan_npages--) {
1320 cond_resched();
1321 rmap_item = scan_get_next_rmap_item(&page);
1322 if (!rmap_item)
1323 return;
1324 if (!PageKsm(page) || !in_stable_tree(rmap_item))
1325 cmp_and_merge_page(page, rmap_item);
1326 else if (page_mapcount(page) == 1) {
1327
1328
1329
1330 break_cow(rmap_item->mm, rmap_item->address);
1331 remove_rmap_item_from_tree(rmap_item);
1332 rmap_item->oldchecksum = calc_checksum(page);
1333 }
1334 put_page(page);
1335 }
1336}
1337
1338static int ksmd_should_run(void)
1339{
1340 return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
1341}
1342
1343static int ksm_scan_thread(void *nothing)
1344{
1345 set_user_nice(current, 5);
1346
1347 while (!kthread_should_stop()) {
1348 mutex_lock(&ksm_thread_mutex);
1349 if (ksmd_should_run())
1350 ksm_do_scan(ksm_thread_pages_to_scan);
1351 mutex_unlock(&ksm_thread_mutex);
1352
1353 if (ksmd_should_run()) {
1354 schedule_timeout_interruptible(
1355 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1356 } else {
1357 wait_event_interruptible(ksm_thread_wait,
1358 ksmd_should_run() || kthread_should_stop());
1359 }
1360 }
1361 return 0;
1362}
1363
1364int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1365 unsigned long end, int advice, unsigned long *vm_flags)
1366{
1367 struct mm_struct *mm = vma->vm_mm;
1368 int err;
1369
1370 switch (advice) {
1371 case MADV_MERGEABLE:
1372
1373
1374
1375 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1376 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1377 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1378 VM_MIXEDMAP | VM_SAO))
1379 return 0;
1380
1381 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1382 err = __ksm_enter(mm);
1383 if (err)
1384 return err;
1385 }
1386
1387 *vm_flags |= VM_MERGEABLE;
1388 break;
1389
1390 case MADV_UNMERGEABLE:
1391 if (!(*vm_flags & VM_MERGEABLE))
1392 return 0;
1393
1394 if (vma->anon_vma) {
1395 err = unmerge_ksm_pages(vma, start, end);
1396 if (err)
1397 return err;
1398 }
1399
1400 *vm_flags &= ~VM_MERGEABLE;
1401 break;
1402 }
1403
1404 return 0;
1405}
1406
1407int __ksm_enter(struct mm_struct *mm)
1408{
1409 struct mm_slot *mm_slot;
1410 int needs_wakeup;
1411
1412 mm_slot = alloc_mm_slot();
1413 if (!mm_slot)
1414 return -ENOMEM;
1415
1416
1417 needs_wakeup = list_empty(&ksm_mm_head.mm_list);
1418
1419 spin_lock(&ksm_mmlist_lock);
1420 insert_to_mm_slots_hash(mm, mm_slot);
1421
1422
1423
1424
1425
1426 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1427 spin_unlock(&ksm_mmlist_lock);
1428
1429 set_bit(MMF_VM_MERGEABLE, &mm->flags);
1430 atomic_inc(&mm->mm_count);
1431
1432 if (needs_wakeup)
1433 wake_up_interruptible(&ksm_thread_wait);
1434
1435 return 0;
1436}
1437
1438void __ksm_exit(struct mm_struct *mm)
1439{
1440 struct mm_slot *mm_slot;
1441 int easy_to_free = 0;
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452 spin_lock(&ksm_mmlist_lock);
1453 mm_slot = get_mm_slot(mm);
1454 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1455 if (list_empty(&mm_slot->rmap_list)) {
1456 hlist_del(&mm_slot->link);
1457 list_del(&mm_slot->mm_list);
1458 easy_to_free = 1;
1459 } else {
1460 list_move(&mm_slot->mm_list,
1461 &ksm_scan.mm_slot->mm_list);
1462 }
1463 }
1464 spin_unlock(&ksm_mmlist_lock);
1465
1466 if (easy_to_free) {
1467 free_mm_slot(mm_slot);
1468 clear_bit(MMF_VM_MERGEABLE, &mm->flags);
1469 mmdrop(mm);
1470 } else if (mm_slot) {
1471 down_write(&mm->mmap_sem);
1472 up_write(&mm->mmap_sem);
1473 }
1474}
1475
1476#ifdef CONFIG_SYSFS
1477
1478
1479
1480
/* Declare <name>_attr as a read-only attribute backed by <name>_show(). */
#define KSM_ATTR_RO(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/* Declare <name>_attr with mode 0644, <name>_show() and <name>_store(). */
#define KSM_ATTR(_name)							\
	static struct kobj_attribute _name##_attr =			\
		__ATTR(_name, 0644, _name##_show, _name##_store)
1486
1487static ssize_t sleep_millisecs_show(struct kobject *kobj,
1488 struct kobj_attribute *attr, char *buf)
1489{
1490 return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
1491}
1492
1493static ssize_t sleep_millisecs_store(struct kobject *kobj,
1494 struct kobj_attribute *attr,
1495 const char *buf, size_t count)
1496{
1497 unsigned long msecs;
1498 int err;
1499
1500 err = strict_strtoul(buf, 10, &msecs);
1501 if (err || msecs > UINT_MAX)
1502 return -EINVAL;
1503
1504 ksm_thread_sleep_millisecs = msecs;
1505
1506 return count;
1507}
1508KSM_ATTR(sleep_millisecs);
1509
1510static ssize_t pages_to_scan_show(struct kobject *kobj,
1511 struct kobj_attribute *attr, char *buf)
1512{
1513 return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
1514}
1515
1516static ssize_t pages_to_scan_store(struct kobject *kobj,
1517 struct kobj_attribute *attr,
1518 const char *buf, size_t count)
1519{
1520 int err;
1521 unsigned long nr_pages;
1522
1523 err = strict_strtoul(buf, 10, &nr_pages);
1524 if (err || nr_pages > UINT_MAX)
1525 return -EINVAL;
1526
1527 ksm_thread_pages_to_scan = nr_pages;
1528
1529 return count;
1530}
1531KSM_ATTR(pages_to_scan);
1532
1533static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1534 char *buf)
1535{
1536 return sprintf(buf, "%u\n", ksm_run);
1537}
1538
1539static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1540 const char *buf, size_t count)
1541{
1542 int err;
1543 unsigned long flags;
1544
1545 err = strict_strtoul(buf, 10, &flags);
1546 if (err || flags > UINT_MAX)
1547 return -EINVAL;
1548 if (flags > KSM_RUN_UNMERGE)
1549 return -EINVAL;
1550
1551
1552
1553
1554
1555
1556
1557
1558 mutex_lock(&ksm_thread_mutex);
1559 if (ksm_run != flags) {
1560 ksm_run = flags;
1561 if (flags & KSM_RUN_UNMERGE) {
1562 current->flags |= PF_OOM_ORIGIN;
1563 err = unmerge_and_remove_all_rmap_items();
1564 current->flags &= ~PF_OOM_ORIGIN;
1565 if (err) {
1566 ksm_run = KSM_RUN_STOP;
1567 count = err;
1568 }
1569 }
1570 }
1571 mutex_unlock(&ksm_thread_mutex);
1572
1573 if (flags & KSM_RUN_MERGE)
1574 wake_up_interruptible(&ksm_thread_wait);
1575
1576 return count;
1577}
1578KSM_ATTR(run);
1579
1580static ssize_t max_kernel_pages_store(struct kobject *kobj,
1581 struct kobj_attribute *attr,
1582 const char *buf, size_t count)
1583{
1584 int err;
1585 unsigned long nr_pages;
1586
1587 err = strict_strtoul(buf, 10, &nr_pages);
1588 if (err)
1589 return -EINVAL;
1590
1591 ksm_max_kernel_pages = nr_pages;
1592
1593 return count;
1594}
1595
1596static ssize_t max_kernel_pages_show(struct kobject *kobj,
1597 struct kobj_attribute *attr, char *buf)
1598{
1599 return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
1600}
1601KSM_ATTR(max_kernel_pages);
1602
1603static ssize_t pages_shared_show(struct kobject *kobj,
1604 struct kobj_attribute *attr, char *buf)
1605{
1606 return sprintf(buf, "%lu\n", ksm_pages_shared);
1607}
1608KSM_ATTR_RO(pages_shared);
1609
1610static ssize_t pages_sharing_show(struct kobject *kobj,
1611 struct kobj_attribute *attr, char *buf)
1612{
1613 return sprintf(buf, "%lu\n", ksm_pages_sharing);
1614}
1615KSM_ATTR_RO(pages_sharing);
1616
1617static ssize_t pages_unshared_show(struct kobject *kobj,
1618 struct kobj_attribute *attr, char *buf)
1619{
1620 return sprintf(buf, "%lu\n", ksm_pages_unshared);
1621}
1622KSM_ATTR_RO(pages_unshared);
1623
1624static ssize_t pages_volatile_show(struct kobject *kobj,
1625 struct kobj_attribute *attr, char *buf)
1626{
1627 long ksm_pages_volatile;
1628
1629 ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
1630 - ksm_pages_sharing - ksm_pages_unshared;
1631
1632
1633
1634
1635 if (ksm_pages_volatile < 0)
1636 ksm_pages_volatile = 0;
1637 return sprintf(buf, "%ld\n", ksm_pages_volatile);
1638}
1639KSM_ATTR_RO(pages_volatile);
1640
1641static ssize_t full_scans_show(struct kobject *kobj,
1642 struct kobj_attribute *attr, char *buf)
1643{
1644 return sprintf(buf, "%lu\n", ksm_scan.seqnr);
1645}
1646KSM_ATTR_RO(full_scans);
1647
1648static struct attribute *ksm_attrs[] = {
1649 &sleep_millisecs_attr.attr,
1650 &pages_to_scan_attr.attr,
1651 &run_attr.attr,
1652 &max_kernel_pages_attr.attr,
1653 &pages_shared_attr.attr,
1654 &pages_sharing_attr.attr,
1655 &pages_unshared_attr.attr,
1656 &pages_volatile_attr.attr,
1657 &full_scans_attr.attr,
1658 NULL,
1659};
1660
1661static struct attribute_group ksm_attr_group = {
1662 .attrs = ksm_attrs,
1663 .name = "ksm",
1664};
1665#endif
1666
1667static int __init ksm_init(void)
1668{
1669 struct task_struct *ksm_thread;
1670 int err;
1671
1672 ksm_max_kernel_pages = totalram_pages / 4;
1673
1674 err = ksm_slab_init();
1675 if (err)
1676 goto out;
1677
1678 err = mm_slots_hash_init();
1679 if (err)
1680 goto out_free1;
1681
1682 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1683 if (IS_ERR(ksm_thread)) {
1684 printk(KERN_ERR "ksm: creating kthread failed\n");
1685 err = PTR_ERR(ksm_thread);
1686 goto out_free2;
1687 }
1688
1689#ifdef CONFIG_SYSFS
1690 err = sysfs_create_group(mm_kobj, &ksm_attr_group);
1691 if (err) {
1692 printk(KERN_ERR "ksm: register sysfs failed\n");
1693 kthread_stop(ksm_thread);
1694 goto out_free2;
1695 }
1696#else
1697 ksm_run = KSM_RUN_MERGE;
1698
1699#endif
1700
1701 return 0;
1702
1703out_free2:
1704 mm_slots_hash_free();
1705out_free1:
1706 ksm_slab_free();
1707out:
1708 return err;
1709}
1710module_init(ksm_init)
1711