/*
 *  linux/mm/vmscan.c
 *
 *  Page reclaim: LRU scanning, shrinking of the page cache and of
 *  registered slab shrinkers, swap-out of anonymous memory, and the
 *  kswapd background reclaim daemon.
 */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

/*
 * reclaim_mode determines how the inactive list is shrunk:
 *
 * RECLAIM_MODE_SINGLE:		reclaim only order-0 pages
 * RECLAIM_MODE_ASYNC:		do not block on page writeback
 * RECLAIM_MODE_SYNC:		may wait for pages under writeback
 * RECLAIM_MODE_LUMPYRECLAIM:	also take contiguous pages around the tag page
 * RECLAIM_MODE_COMPACTION:	reclaim enough order-0 pages for compaction
 */
typedef unsigned __bitwise__ reclaim_mode_t;
#define RECLAIM_MODE_SINGLE		((__force reclaim_mode_t)0x01u)
#define RECLAIM_MODE_ASYNC		((__force reclaim_mode_t)0x02u)
#define RECLAIM_MODE_SYNC		((__force reclaim_mode_t)0x04u)
#define RECLAIM_MODE_LUMPYRECLAIM	((__force reclaim_mode_t)0x08u)
#define RECLAIM_MODE_COMPACTION		((__force reclaim_mode_t)0x10u)

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	unsigned long hibernation_mode;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can mapped pages be reclaimed? */
	int may_unmap;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	int swappiness;

	int order;

	/*
	 * Intend to reclaim enough contiguous memory rather than just
	 * enough memory, i.e. the mode used for high-order allocations.
	 */
	reclaim_mode_t reclaim_mode;

	/* Which cgroup do we reclaim from */
	struct mem_cgroup *mem_cgroup;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scanning_global_lru(sc)	(!(sc)->mem_cgroup)
#else
#define scanning_global_lru(sc)	(1)
#endif

static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
						  struct scan_control *sc)
{
	if (!scanning_global_lru(sc))
		return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);

	return &zone->reclaim_stat;
}

static unsigned long zone_nr_lru_pages(struct zone *zone,
				struct scan_control *sc, enum lru_list lru)
{
	if (!scanning_global_lru(sc))
		return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);

	return zone_page_state(zone, NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm.
 */
void register_shrinker(struct shrinker *shrinker)
{
	shrinker->nr = 0;
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove a shrinker callback from the list.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches.
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid having to scan the cache for pages to free.
 *
 * 'lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
223unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
224 unsigned long lru_pages)
225{
226 struct shrinker *shrinker;
227 unsigned long ret = 0;
228
229 if (scanned == 0)
230 scanned = SWAP_CLUSTER_MAX;
231
232 if (!down_read_trylock(&shrinker_rwsem))
233 return 1;
234
235 list_for_each_entry(shrinker, &shrinker_list, list) {
236 unsigned long long delta;
237 unsigned long total_scan;
238 unsigned long max_pass;
239
240 max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
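		/*
		 * Scan slab objects in proportion to the page-reclaim rate:
		 * delta ~= (4 * scanned / seeks) * max_pass / lru_pages.
		 */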
241 delta = (4 * scanned) / shrinker->seeks;
242 delta *= max_pass;
243 do_div(delta, lru_pages + 1);
244 shrinker->nr += delta;
245 if (shrinker->nr < 0) {
246 printk(KERN_ERR "shrink_slab: %pF negative objects to "
247 "delete nr=%ld\n",
248 shrinker->shrink, shrinker->nr);
249 shrinker->nr = max_pass;
250 }
251
		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimated number of
		 * freeable entries.
		 */
		if (shrinker->nr > max_pass * 2)
			shrinker->nr = max_pass * 2;
259
260 total_scan = shrinker->nr;
261 shrinker->nr = 0;
262
263 while (total_scan >= SHRINK_BATCH) {
264 long this_scan = SHRINK_BATCH;
265 int shrink_ret;
266 int nr_before;
267
268 nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
269 shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
270 gfp_mask);
271 if (shrink_ret == -1)
272 break;
273 if (shrink_ret < nr_before)
274 ret += nr_before - shrink_ret;
275 count_vm_events(SLABS_SCANNED, this_scan);
276 total_scan -= this_scan;
277
278 cond_resched();
279 }
280
281 shrinker->nr += total_scan;
282 }
283 up_read(&shrinker_rwsem);
284 return ret;
285}
286
static void set_reclaim_mode(int priority, struct scan_control *sc,
				   bool sync)
{
	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;

	/*
	 * Initially assume we are entering either lumpy reclaim or
	 * reclaim/compaction. Depending on the order, we will either set the
	 * sync mode or just reclaim order-0 pages later.
	 */
	if (COMPACTION_BUILD)
		sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
	else
		sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;

	/*
	 * Avoid using lumpy reclaim or reclaim/compaction if possible by
	 * restricting it to costly allocations or to reclaim under
	 * sustained memory pressure.
	 */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		sc->reclaim_mode |= syncmode;
	else if (sc->order && priority < DEF_PRIORITY - 2)
		sc->reclaim_mode |= syncmode;
	else
		sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

static void reset_reclaim_mode(struct scan_control *sc)
{
	sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache radix tree and
	 * optional buffer heads at page->private.
	 */
	return page_count(page) - page_has_private(page) == 2;
}

330static int may_write_to_queue(struct backing_dev_info *bdi,
331 struct scan_control *sc)
332{
333 if (current->flags & PF_SWAPWRITE)
334 return 1;
335 if (!bdi_write_congested(bdi))
336 return 1;
337 if (bdi == current->backing_dev_info)
338 return 1;
339
340
341 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
342 return 1;
343 return 0;
344}
345
/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * retry or failure report.
 */
358static void handle_write_error(struct address_space *mapping,
359 struct page *page, int error)
360{
361 lock_page_nosync(page);
362 if (page_mapping(page) == mapping)
363 mapping_set_error(mapping, error);
364 unlock_page(page);
365}
366
/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to disk, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
378
/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
383static pageout_t pageout(struct page *page, struct address_space *mapping,
384 struct scan_control *sc)
385{
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402 if (!is_page_cache_freeable(page))
403 return PAGE_KEEP;
404 if (!mapping) {
405
406
407
408
409 if (page_has_private(page)) {
410 if (try_to_free_buffers(page)) {
411 ClearPageDirty(page);
412 printk("%s: orphaned page\n", __func__);
413 return PAGE_CLEAN;
414 }
415 }
416 return PAGE_KEEP;
417 }
418 if (mapping->a_ops->writepage == NULL)
419 return PAGE_ACTIVATE;
420 if (!may_write_to_queue(mapping->backing_dev_info, sc))
421 return PAGE_KEEP;
422
423 if (clear_page_dirty_for_io(page)) {
424 int res;
425 struct writeback_control wbc = {
426 .sync_mode = WB_SYNC_NONE,
427 .nr_to_write = SWAP_CLUSTER_MAX,
428 .range_start = 0,
429 .range_end = LLONG_MAX,
430 .for_reclaim = 1,
431 };
432
433 SetPageReclaim(page);
434 res = mapping->a_ops->writepage(page, &wbc);
435 if (res < 0)
436 handle_write_error(mapping, page, res);
437 if (res == AOP_WRITEPAGE_ACTIVATE) {
438 ClearPageReclaim(page);
439 return PAGE_ACTIVATE;
440 }
441
442
443
444
445
446
447 if (PageWriteback(page) &&
448 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
449 wait_on_page_writeback(page);
450
451 if (!PageWriteback(page)) {
452
453 ClearPageReclaim(page);
454 }
455 trace_mm_vmscan_writepage(page,
456 trace_reclaim_flags(page, sc->reclaim_mode));
457 inc_zone_page_state(page, NR_VMSCAN_WRITE);
458 return PAGE_SUCCESS;
459 }
460
461 return PAGE_CLEAN;
462}
463
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
468static int __remove_mapping(struct address_space *mapping, struct page *page)
469{
470 BUG_ON(!PageLocked(page));
471 BUG_ON(mapping != page_mapping(page));
472
473 spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
499 if (!page_freeze_refs(page, 2))
500 goto cannot_free;
501
502 if (unlikely(PageDirty(page))) {
503 page_unfreeze_refs(page, 2);
504 goto cannot_free;
505 }
506
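	/*
	 * Swap cache pages are released via swapcache_free(); other page
	 * cache pages go through __remove_from_page_cache() and the
	 * mapping's optional ->freepage() callback.
	 */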
507 if (PageSwapCache(page)) {
508 swp_entry_t swap = { .val = page_private(page) };
509 __delete_from_swap_cache(page);
510 spin_unlock_irq(&mapping->tree_lock);
511 swapcache_free(swap, page);
512 } else {
513 void (*freepage)(struct page *);
514
515 freepage = mapping->a_ops->freepage;
516
517 __remove_from_page_cache(page);
518 spin_unlock_irq(&mapping->tree_lock);
519 mem_cgroup_uncharge_cache_page(page);
520
521 if (freepage != NULL)
522 freepage(page);
523 }
524
525 return 1;
526
527cannot_free:
528 spin_unlock_irq(&mapping->tree_lock);
529 return 0;
530}
531
/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
538int remove_mapping(struct address_space *mapping, struct page *page)
539{
540 if (__remove_mapping(mapping, page)) {
		/*
		 * __remove_mapping() froze the refcount at 0; the caller
		 * still holds one reference, so unfreeze back to 1 and let
		 * the caller drop it.
		 */
546 page_unfreeze_refs(page, 1);
547 return 1;
548 }
549 return 0;
550}
551
/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
561void putback_lru_page(struct page *page)
562{
563 int lru;
564 int active = !!TestClearPageActive(page);
565 int was_unevictable = PageUnevictable(page);
566
567 VM_BUG_ON(PageLRU(page));
568
569redo:
570 ClearPageUnevictable(page);
571
572 if (page_evictable(page, NULL)) {
573
574
575
576
577
578
579 lru = active + page_lru_base_type(page);
580 lru_cache_add_lru(page, lru);
581 } else {
582
583
584
585
586 lru = LRU_UNEVICTABLE;
587 add_page_to_unevictable_list(page);
588
589
590
591
592
593
594
595
596
597 smp_mb();
598 }
599
600
601
602
603
604
605 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
606 if (!isolate_lru_page(page)) {
607 put_page(page);
608 goto redo;
609 }
610
611
612
613
614 }
615
616 if (was_unevictable && lru != LRU_UNEVICTABLE)
617 count_vm_event(UNEVICTABLE_PGRESCUED);
618 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
619 count_vm_event(UNEVICTABLE_PGCULLED);
620
621 put_page(page);
622}
623
624enum page_references {
625 PAGEREF_RECLAIM,
626 PAGEREF_RECLAIM_CLEAN,
627 PAGEREF_KEEP,
628 PAGEREF_ACTIVATE,
629};
630
631static enum page_references page_check_references(struct page *page,
632 struct scan_control *sc)
633{
634 int referenced_ptes, referenced_page;
635 unsigned long vm_flags;
636
637 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
638 referenced_page = TestClearPageReferenced(page);
639
640
641 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
642 return PAGEREF_RECLAIM;
643
644
645
646
647
648 if (vm_flags & VM_LOCKED)
649 return PAGEREF_RECLAIM;
650
651 if (referenced_ptes) {
652 if (PageAnon(page))
653 return PAGEREF_ACTIVATE;
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
668 SetPageReferenced(page);
669
670 if (referenced_page)
671 return PAGEREF_ACTIVATE;
672
673 return PAGEREF_KEEP;
674 }
675
676
677 if (referenced_page && !PageSwapBacked(page))
678 return PAGEREF_RECLAIM_CLEAN;
679
680 return PAGEREF_RECLAIM;
681}
682
683static noinline_for_stack void free_page_list(struct list_head *free_pages)
684{
685 struct pagevec freed_pvec;
686 struct page *page, *tmp;
687
688 pagevec_init(&freed_pvec, 1);
689
690 list_for_each_entry_safe(page, tmp, free_pages, lru) {
691 list_del(&page->lru);
692 if (!pagevec_add(&freed_pvec, page)) {
693 __pagevec_free(&freed_pvec);
694 pagevec_reinit(&freed_pvec);
695 }
696 }
697
698 pagevec_free(&freed_pvec);
699}
/*
 * shrink_page_list() returns the number of reclaimed pages
 */
704static unsigned long shrink_page_list(struct list_head *page_list,
705 struct zone *zone,
706 struct scan_control *sc)
707{
708 LIST_HEAD(ret_pages);
709 LIST_HEAD(free_pages);
710 int pgactivate = 0;
711 unsigned long nr_dirty = 0;
712 unsigned long nr_congested = 0;
713 unsigned long nr_reclaimed = 0;
714
715 cond_resched();
716
717 while (!list_empty(page_list)) {
718 enum page_references references;
719 struct address_space *mapping;
720 struct page *page;
721 int may_enter_fs;
722
723 cond_resched();
724
725 page = lru_to_page(page_list);
726 list_del(&page->lru);
727
728 if (!trylock_page(page))
729 goto keep;
730
731 VM_BUG_ON(PageActive(page));
732 VM_BUG_ON(page_zone(page) != zone);
733
734 sc->nr_scanned++;
735
736 if (unlikely(!page_evictable(page, NULL)))
737 goto cull_mlocked;
738
739 if (!sc->may_unmap && page_mapped(page))
740 goto keep_locked;
741
		/* Double the slab pressure for mapped and swapcache pages */
743 if (page_mapped(page) || PageSwapCache(page))
744 sc->nr_scanned++;
745
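		/*
		 * The page may only be written back, or synchronously waited
		 * on, if the gfp mask allows FS I/O (or swap I/O for
		 * swap-cache pages).
		 */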
746 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
747 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
748
749 if (PageWriteback(page)) {
750
751
752
753
754
755
756
757
758 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
759 may_enter_fs)
760 wait_on_page_writeback(page);
761 else {
762 unlock_page(page);
763 goto keep_lumpy;
764 }
765 }
766
767 references = page_check_references(page, sc);
768 switch (references) {
769 case PAGEREF_ACTIVATE:
770 goto activate_locked;
771 case PAGEREF_KEEP:
772 goto keep_locked;
773 case PAGEREF_RECLAIM:
774 case PAGEREF_RECLAIM_CLEAN:
775 ;
776 }
777
778
779
780
781
782 if (PageAnon(page) && !PageSwapCache(page)) {
783 if (!(sc->gfp_mask & __GFP_IO))
784 goto keep_locked;
785 if (!add_to_swap(page))
786 goto activate_locked;
787 may_enter_fs = 1;
788 }
789
790 mapping = page_mapping(page);
791
792
793
794
795
796 if (page_mapped(page) && mapping) {
797 switch (try_to_unmap(page, TTU_UNMAP)) {
798 case SWAP_FAIL:
799 goto activate_locked;
800 case SWAP_AGAIN:
801 goto keep_locked;
802 case SWAP_MLOCK:
803 goto cull_mlocked;
804 case SWAP_SUCCESS:
805 ;
806 }
807 }
808
809 if (PageDirty(page)) {
810 nr_dirty++;
811
812 if (references == PAGEREF_RECLAIM_CLEAN)
813 goto keep_locked;
814 if (!may_enter_fs)
815 goto keep_locked;
816 if (!sc->may_writepage)
817 goto keep_locked;
818
819
820 switch (pageout(page, mapping, sc)) {
821 case PAGE_KEEP:
822 nr_congested++;
823 goto keep_locked;
824 case PAGE_ACTIVATE:
825 goto activate_locked;
826 case PAGE_SUCCESS:
827 if (PageWriteback(page))
828 goto keep_lumpy;
829 if (PageDirty(page))
830 goto keep;
831
832
833
834
835
836 if (!trylock_page(page))
837 goto keep;
838 if (PageDirty(page) || PageWriteback(page))
839 goto keep_locked;
840 mapping = page_mapping(page);
841 case PAGE_CLEAN:
842 ;
843 }
844 }
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867 if (page_has_private(page)) {
868 if (!try_to_release_page(page, sc->gfp_mask))
869 goto activate_locked;
870 if (!mapping && page_count(page) == 1) {
871 unlock_page(page);
872 if (put_page_testzero(page))
873 goto free_it;
874 else {
875
876
877
878
879
880
881
882 nr_reclaimed++;
883 continue;
884 }
885 }
886 }
887
888 if (!mapping || !__remove_mapping(mapping, page))
889 goto keep_locked;
890
891
892
893
894
895
896
897
898 __clear_page_locked(page);
899free_it:
900 nr_reclaimed++;
901
902
903
904
905
906 list_add(&page->lru, &free_pages);
907 continue;
908
909cull_mlocked:
910 if (PageSwapCache(page))
911 try_to_free_swap(page);
912 unlock_page(page);
913 putback_lru_page(page);
914 reset_reclaim_mode(sc);
915 continue;
916
917activate_locked:
918
919 if (PageSwapCache(page) && vm_swap_full())
920 try_to_free_swap(page);
921 VM_BUG_ON(PageActive(page));
922 SetPageActive(page);
923 pgactivate++;
924keep_locked:
925 unlock_page(page);
926keep:
927 reset_reclaim_mode(sc);
928keep_lumpy:
929 list_add(&page->lru, &ret_pages);
930 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
931 }
932
	/*
	 * Tag a zone as congested if all the dirty pages encountered were
	 * backed by a congested BDI. In that case, reclaimers should just
	 * back off and wait for congestion to clear because further reclaim
	 * will encounter the same problem.
	 */
939 if (nr_dirty == nr_congested && nr_dirty != 0)
940 zone_set_flag(zone, ZONE_CONGESTED);
941
942 free_page_list(&free_pages);
943
944 list_splice(&ret_pages, page_list);
945 count_vm_events(PGACTIVATE, pgactivate);
946 return nr_reclaimed;
947}
948
/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes
 *
 * returns 0 on success, -ve errno on failure.
 */
959int __isolate_lru_page(struct page *page, int mode, int file)
960{
961 int ret = -EINVAL;
962
963
964 if (!PageLRU(page))
965 return ret;
966
967
968
969
970
971
972 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
973 return ret;
974
975 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
976 return ret;
977
978
979
980
981
982
983 if (PageUnevictable(page))
984 return ret;
985
986 ret = -EBUSY;
987
988 if (likely(get_page_unless_zero(page))) {
989
990
991
992
993
994 ClearPageLRU(page);
995 ret = 0;
996 }
997
998 return ret;
999}
1000
/*
 * zone->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of pages to look through on the list.
 * @src:	The LRU list to pull pages off.
 * @dst:	The temp list to put pages on to.
 * @scanned:	The number of pages that were scanned.
 * @order:	The caller's attempted allocation order.
 * @mode:	One of the LRU isolation modes.
 * @file:	True [1] if isolating file [!anon] pages.
 *
 * returns how many pages were moved onto *@dst.
 */
1021static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1022 struct list_head *src, struct list_head *dst,
1023 unsigned long *scanned, int order, int mode, int file)
1024{
1025 unsigned long nr_taken = 0;
1026 unsigned long nr_lumpy_taken = 0;
1027 unsigned long nr_lumpy_dirty = 0;
1028 unsigned long nr_lumpy_failed = 0;
1029 unsigned long scan;
1030
1031 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1032 struct page *page;
1033 unsigned long pfn;
1034 unsigned long end_pfn;
1035 unsigned long page_pfn;
1036 int zone_id;
1037
1038 page = lru_to_page(src);
1039 prefetchw_prev_lru_page(page, src, flags);
1040
1041 VM_BUG_ON(!PageLRU(page));
1042
1043 switch (__isolate_lru_page(page, mode, file)) {
1044 case 0:
1045 list_move(&page->lru, dst);
1046 mem_cgroup_del_lru(page);
1047 nr_taken += hpage_nr_pages(page);
1048 break;
1049
1050 case -EBUSY:
1051
1052 list_move(&page->lru, src);
1053 mem_cgroup_rotate_lru_list(page, page_lru(page));
1054 continue;
1055
1056 default:
1057 BUG();
1058 }
1059
1060 if (!order)
1061 continue;
		/*
		 * Attempt to take all pages in the order-aligned region
		 * surrounding the tag page.  Only take those pages of
		 * the same active state as the tag page.  We may safely
		 * round the target page pfn down to the requested order,
		 * as the mem_map is guaranteed valid out to MAX_ORDER;
		 * if a page is in a different zone we will detect it in
		 * the page_zone_id() check below and break the loop.
		 */
1072 zone_id = page_zone_id(page);
1073 page_pfn = page_to_pfn(page);
1074 pfn = page_pfn & ~((1 << order) - 1);
1075 end_pfn = pfn + (1 << order);
1076 for (; pfn < end_pfn; pfn++) {
1077 struct page *cursor_page;
1078
1079
1080 if (unlikely(pfn == page_pfn))
1081 continue;
1082
1083
1084 if (unlikely(!pfn_valid_within(pfn)))
1085 break;
1086
1087 cursor_page = pfn_to_page(pfn);
1088
1089
1090 if (unlikely(page_zone_id(cursor_page) != zone_id))
1091 break;
1092
1093
1094
1095
1096
1097
1098 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1099 !PageSwapCache(cursor_page))
1100 break;
1101
1102 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1103 list_move(&cursor_page->lru, dst);
1104 mem_cgroup_del_lru(cursor_page);
1105 nr_taken += hpage_nr_pages(page);
1106 nr_lumpy_taken++;
1107 if (PageDirty(cursor_page))
1108 nr_lumpy_dirty++;
1109 scan++;
1110 } else {
1111
1112 if (!page_count(cursor_page))
1113 continue;
1114 break;
1115 }
1116 }
1117
1118
1119 if (pfn < end_pfn)
1120 nr_lumpy_failed++;
1121 }
1122
1123 *scanned = scan;
1124
1125 trace_mm_vmscan_lru_isolate(order,
1126 nr_to_scan, scan,
1127 nr_taken,
1128 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1129 mode);
1130 return nr_taken;
1131}
1132
1133static unsigned long isolate_pages_global(unsigned long nr,
1134 struct list_head *dst,
1135 unsigned long *scanned, int order,
1136 int mode, struct zone *z,
1137 int active, int file)
1138{
1139 int lru = LRU_BASE;
1140 if (active)
1141 lru += LRU_ACTIVE;
1142 if (file)
1143 lru += LRU_FILE;
1144 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
1145 mode, file);
1146}
1147
1148
1149
1150
1151
1152static unsigned long clear_active_flags(struct list_head *page_list,
1153 unsigned int *count)
1154{
1155 int nr_active = 0;
1156 int lru;
1157 struct page *page;
1158
1159 list_for_each_entry(page, page_list, lru) {
1160 int numpages = hpage_nr_pages(page);
1161 lru = page_lru_base_type(page);
1162 if (PageActive(page)) {
1163 lru += LRU_ACTIVE;
1164 ClearPageActive(page);
1165 nr_active += numpages;
1166 }
1167 if (count)
1168 count[lru] += numpages;
1169 }
1170
1171 return nr_active;
1172}
1173
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set.
 * That flag may need to be cleared by the caller before letting the
 * page go.
 *
 * Restrictions:
 * (1) Must be called with an elevated refcount on the page.
 * (2) The lru_lock must not be held.
 * (3) Interrupts must be enabled.
 */
1199int isolate_lru_page(struct page *page)
1200{
1201 int ret = -EBUSY;
1202
1203 if (PageLRU(page)) {
1204 struct zone *zone = page_zone(page);
1205
1206 spin_lock_irq(&zone->lru_lock);
1207 if (PageLRU(page) && get_page_unless_zero(page)) {
1208 int lru = page_lru(page);
1209 ret = 0;
1210 ClearPageLRU(page);
1211
1212 del_page_from_lru_list(zone, page, lru);
1213 }
1214 spin_unlock_irq(&zone->lru_lock);
1215 }
1216 return ret;
1217}
1218
/*
 * Are there way too many processes in the direct reclaim path already?
 */
1222static int too_many_isolated(struct zone *zone, int file,
1223 struct scan_control *sc)
1224{
1225 unsigned long inactive, isolated;
1226
1227 if (current_is_kswapd())
1228 return 0;
1229
1230 if (!scanning_global_lru(sc))
1231 return 0;
1232
1233 if (file) {
1234 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1235 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1236 } else {
1237 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1238 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1239 }
1240
1241 return isolated > inactive;
1242}
1243
1244
1245
1246
1247static noinline_for_stack void
1248putback_lru_pages(struct zone *zone, struct scan_control *sc,
1249 unsigned long nr_anon, unsigned long nr_file,
1250 struct list_head *page_list)
1251{
1252 struct page *page;
1253 struct pagevec pvec;
1254 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1255
1256 pagevec_init(&pvec, 1);
1257
1258
1259
1260
1261 spin_lock(&zone->lru_lock);
1262 while (!list_empty(page_list)) {
1263 int lru;
1264 page = lru_to_page(page_list);
1265 VM_BUG_ON(PageLRU(page));
1266 list_del(&page->lru);
1267 if (unlikely(!page_evictable(page, NULL))) {
1268 spin_unlock_irq(&zone->lru_lock);
1269 putback_lru_page(page);
1270 spin_lock_irq(&zone->lru_lock);
1271 continue;
1272 }
1273 SetPageLRU(page);
1274 lru = page_lru(page);
1275 add_page_to_lru_list(zone, page, lru);
1276 if (is_active_lru(lru)) {
1277 int file = is_file_lru(lru);
1278 int numpages = hpage_nr_pages(page);
1279 reclaim_stat->recent_rotated[file] += numpages;
1280 }
1281 if (!pagevec_add(&pvec, page)) {
1282 spin_unlock_irq(&zone->lru_lock);
1283 __pagevec_release(&pvec);
1284 spin_lock_irq(&zone->lru_lock);
1285 }
1286 }
1287 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1288 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1289
1290 spin_unlock_irq(&zone->lru_lock);
1291 pagevec_release(&pvec);
1292}
1293
1294static noinline_for_stack void update_isolated_counts(struct zone *zone,
1295 struct scan_control *sc,
1296 unsigned long *nr_anon,
1297 unsigned long *nr_file,
1298 struct list_head *isolated_list)
1299{
1300 unsigned long nr_active;
1301 unsigned int count[NR_LRU_LISTS] = { 0, };
1302 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1303
1304 nr_active = clear_active_flags(isolated_list, count);
1305 __count_vm_events(PGDEACTIVATE, nr_active);
1306
1307 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1308 -count[LRU_ACTIVE_FILE]);
1309 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1310 -count[LRU_INACTIVE_FILE]);
1311 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1312 -count[LRU_ACTIVE_ANON]);
1313 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1314 -count[LRU_INACTIVE_ANON]);
1315
1316 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1317 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1318 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1319 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1320
1321 reclaim_stat->recent_scanned[0] += *nr_anon;
1322 reclaim_stat->recent_scanned[1] += *nr_file;
1323}
1324
1325
/*
 * Returns true if the caller should wait to clean dirty/writeback pages.
 *
 * If we are direct reclaiming for contiguous pages and we do not reclaim
 * everything in the list, try again and wait for writeback IO to complete.
 * This will stall high-order allocations noticeably. Only do that when really
 * needing to free the pages under high memory pressure.
 */
1333static inline bool should_reclaim_stall(unsigned long nr_taken,
1334 unsigned long nr_freed,
1335 int priority,
1336 struct scan_control *sc)
1337{
1338 int lumpy_stall_priority;
1339
1340
1341 if (current_is_kswapd())
1342 return false;
1343
1344
1345 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1346 return false;
1347
1348
1349 if (nr_freed == nr_taken)
1350 return false;
1351
1352
1353
1354
1355
1356
1357
1358 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1359 lumpy_stall_priority = DEF_PRIORITY;
1360 else
1361 lumpy_stall_priority = DEF_PRIORITY / 3;
1362
1363 return priority <= lumpy_stall_priority;
1364}
1365
/*
 * shrink_inactive_list() is a helper for shrink_zone().  It returns the
 * number of reclaimed pages.
 */
1370static noinline_for_stack unsigned long
1371shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1372 struct scan_control *sc, int priority, int file)
1373{
1374 LIST_HEAD(page_list);
1375 unsigned long nr_scanned;
1376 unsigned long nr_reclaimed = 0;
1377 unsigned long nr_taken;
1378 unsigned long nr_anon;
1379 unsigned long nr_file;
1380
1381 while (unlikely(too_many_isolated(zone, file, sc))) {
1382 congestion_wait(BLK_RW_ASYNC, HZ/10);
1383
1384
1385 if (fatal_signal_pending(current))
1386 return SWAP_CLUSTER_MAX;
1387 }
1388
1389 set_reclaim_mode(priority, sc, false);
1390 lru_add_drain();
1391 spin_lock_irq(&zone->lru_lock);
1392
1393 if (scanning_global_lru(sc)) {
1394 nr_taken = isolate_pages_global(nr_to_scan,
1395 &page_list, &nr_scanned, sc->order,
1396 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1397 ISOLATE_BOTH : ISOLATE_INACTIVE,
1398 zone, 0, file);
1399 zone->pages_scanned += nr_scanned;
1400 if (current_is_kswapd())
1401 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1402 nr_scanned);
1403 else
1404 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1405 nr_scanned);
1406 } else {
1407 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1408 &page_list, &nr_scanned, sc->order,
1409 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1410 ISOLATE_BOTH : ISOLATE_INACTIVE,
1411 zone, sc->mem_cgroup,
1412 0, file);
1413
1414
1415
1416
1417 }
1418
1419 if (nr_taken == 0) {
1420 spin_unlock_irq(&zone->lru_lock);
1421 return 0;
1422 }
1423
1424 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1425
1426 spin_unlock_irq(&zone->lru_lock);
1427
1428 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1429
1430
1431 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1432 set_reclaim_mode(priority, sc, true);
1433 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1434 }
1435
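	/*
	 * IRQs are disabled here because putback_lru_pages() takes
	 * zone->lru_lock without disabling interrupts itself and drops
	 * it with spin_unlock_irq().
	 */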
1436 local_irq_disable();
1437 if (current_is_kswapd())
1438 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1439 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1440
1441 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1442
1443 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1444 zone_idx(zone),
1445 nr_scanned, nr_reclaimed,
1446 priority,
1447 trace_shrink_flags(file, sc->reclaim_mode));
1448 return nr_reclaimed;
1449}
1450
/*
 * This moves pages from the active list to the inactive list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold zone->lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop zone->lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
1469static void move_active_pages_to_lru(struct zone *zone,
1470 struct list_head *list,
1471 enum lru_list lru)
1472{
1473 unsigned long pgmoved = 0;
1474 struct pagevec pvec;
1475 struct page *page;
1476
1477 pagevec_init(&pvec, 1);
1478
1479 while (!list_empty(list)) {
1480 page = lru_to_page(list);
1481
1482 VM_BUG_ON(PageLRU(page));
1483 SetPageLRU(page);
1484
1485 list_move(&page->lru, &zone->lru[lru].list);
1486 mem_cgroup_add_lru_list(page, lru);
1487 pgmoved += hpage_nr_pages(page);
1488
1489 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1490 spin_unlock_irq(&zone->lru_lock);
1491 if (buffer_heads_over_limit)
1492 pagevec_strip(&pvec);
1493 __pagevec_release(&pvec);
1494 spin_lock_irq(&zone->lru_lock);
1495 }
1496 }
1497 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1498 if (!is_active_lru(lru))
1499 __count_vm_events(PGDEACTIVATE, pgmoved);
1500}
1501
1502static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1503 struct scan_control *sc, int priority, int file)
1504{
1505 unsigned long nr_taken;
1506 unsigned long pgscanned;
1507 unsigned long vm_flags;
1508 LIST_HEAD(l_hold);
1509 LIST_HEAD(l_active);
1510 LIST_HEAD(l_inactive);
1511 struct page *page;
1512 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1513 unsigned long nr_rotated = 0;
1514
1515 lru_add_drain();
1516 spin_lock_irq(&zone->lru_lock);
1517 if (scanning_global_lru(sc)) {
1518 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1519 &pgscanned, sc->order,
1520 ISOLATE_ACTIVE, zone,
1521 1, file);
1522 zone->pages_scanned += pgscanned;
1523 } else {
1524 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1525 &pgscanned, sc->order,
1526 ISOLATE_ACTIVE, zone,
1527 sc->mem_cgroup, 1, file);
1528
1529
1530
1531
1532 }
1533
1534 reclaim_stat->recent_scanned[file] += nr_taken;
1535
1536 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1537 if (file)
1538 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1539 else
1540 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1541 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1542 spin_unlock_irq(&zone->lru_lock);
1543
1544 while (!list_empty(&l_hold)) {
1545 cond_resched();
1546 page = lru_to_page(&l_hold);
1547 list_del(&page->lru);
1548
1549 if (unlikely(!page_evictable(page, NULL))) {
1550 putback_lru_page(page);
1551 continue;
1552 }
1553
1554 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1555 nr_rotated += hpage_nr_pages(page);
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1566 list_add(&page->lru, &l_active);
1567 continue;
1568 }
1569 }
1570
1571 ClearPageActive(page);
1572 list_add(&page->lru, &l_inactive);
1573 }
1574
1575
1576
1577
1578 spin_lock_irq(&zone->lru_lock);
1579
1580
1581
1582
1583
1584
1585 reclaim_stat->recent_rotated[file] += nr_rotated;
1586
1587 move_active_pages_to_lru(zone, &l_active,
1588 LRU_ACTIVE + file * LRU_FILE);
1589 move_active_pages_to_lru(zone, &l_inactive,
1590 LRU_BASE + file * LRU_FILE);
1591 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1592 spin_unlock_irq(&zone->lru_lock);
1593}
1594
1595#ifdef CONFIG_SWAP
1596static int inactive_anon_is_low_global(struct zone *zone)
1597{
1598 unsigned long active, inactive;
1599
1600 active = zone_page_state(zone, NR_ACTIVE_ANON);
1601 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1602
1603 if (inactive * zone->inactive_ratio < active)
1604 return 1;
1605
1606 return 0;
1607}
1608
/**
 * inactive_anon_is_low - check if anonymous pages need to be deactivated
 * @zone: zone to check
 * @sc:   scan control of this context
 *
 * Returns true if the zone does not have enough inactive anon pages,
 * meaning some active anon pages need to be deactivated.
 */
1617static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1618{
1619 int low;
1620
1621
1622
1623
1624
1625 if (!total_swap_pages)
1626 return 0;
1627
1628 if (scanning_global_lru(sc))
1629 low = inactive_anon_is_low_global(zone);
1630 else
1631 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1632 return low;
1633}
1634#else
1635static inline int inactive_anon_is_low(struct zone *zone,
1636 struct scan_control *sc)
1637{
1638 return 0;
1639}
1640#endif
1641
1642static int inactive_file_is_low_global(struct zone *zone)
1643{
1644 unsigned long active, inactive;
1645
1646 active = zone_page_state(zone, NR_ACTIVE_FILE);
1647 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1648
1649 return (active > inactive);
1650}
1651
/**
 * inactive_file_is_low - check if enough inactive file pages are around
 * @zone: zone to check
 * @sc:   scan control of this context
 *
 * When the system is doing streaming IO, memory pressure here
 * ensures that active file pages get deactivated, until more
 * than half of the file pages are on the inactive list.
 *
 * Once we get to that situation, protection of the active file
 * list starts to apply.  Use-once pages on the inactive list are
 * cheaper to evict than frequently used pages on the active list,
 * so the active list is protected from streaming IO.
 */
1667static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1668{
1669 int low;
1670
1671 if (scanning_global_lru(sc))
1672 low = inactive_file_is_low_global(zone);
1673 else
1674 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1675 return low;
1676}
1677
1678static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
1679 int file)
1680{
1681 if (file)
1682 return inactive_file_is_low(zone, sc);
1683 else
1684 return inactive_anon_is_low(zone, sc);
1685}
1686
1687static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1688 struct zone *zone, struct scan_control *sc, int priority)
1689{
1690 int file = is_file_lru(lru);
1691
1692 if (is_active_lru(lru)) {
1693 if (inactive_list_is_low(zone, sc, file))
1694 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1695 return 0;
1696 }
1697
1698 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1699}
1700
/*
 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
 * until we collect SWAP_CLUSTER_MAX pages to scan.
 */
1705static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1706 unsigned long *nr_saved_scan)
1707{
1708 unsigned long nr;
1709
1710 *nr_saved_scan += nr_to_scan;
1711 nr = *nr_saved_scan;
1712
1713 if (nr >= SWAP_CLUSTER_MAX)
1714 *nr_saved_scan = 0;
1715 else
1716 nr = 0;
1717
1718 return nr;
1719}
1720
/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned that we rotated back
 * onto the active list instead of evicting.
 *
 * nr[lru] is filled with the number of pages to scan from each
 * evictable LRU list.
 */
1729static void get_scan_count(struct zone *zone, struct scan_control *sc,
1730 unsigned long *nr, int priority)
1731{
1732 unsigned long anon, file, free;
1733 unsigned long anon_prio, file_prio;
1734 unsigned long ap, fp;
1735 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1736 u64 fraction[2], denominator;
1737 enum lru_list l;
1738 int noswap = 0;
1739
1740
1741 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1742 noswap = 1;
1743 fraction[0] = 0;
1744 fraction[1] = 1;
1745 denominator = 1;
1746 goto out;
1747 }
1748
1749 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1750 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1751 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1752 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1753
1754 if (scanning_global_lru(sc)) {
1755 free = zone_page_state(zone, NR_FREE_PAGES);
1756
1757
1758 if (unlikely(file + free <= high_wmark_pages(zone))) {
1759 fraction[0] = 1;
1760 fraction[1] = 0;
1761 denominator = 1;
1762 goto out;
1763 }
1764 }
1765
1766
1767
1768
1769
1770 anon_prio = sc->swappiness;
1771 file_prio = 200 - sc->swappiness;
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784 spin_lock_irq(&zone->lru_lock);
1785 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1786 reclaim_stat->recent_scanned[0] /= 2;
1787 reclaim_stat->recent_rotated[0] /= 2;
1788 }
1789
1790 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1791 reclaim_stat->recent_scanned[1] /= 2;
1792 reclaim_stat->recent_rotated[1] /= 2;
1793 }
1794
1795
1796
1797
1798
1799
1800 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1801 ap /= reclaim_stat->recent_rotated[0] + 1;
1802
1803 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1804 fp /= reclaim_stat->recent_rotated[1] + 1;
1805 spin_unlock_irq(&zone->lru_lock);
1806
1807 fraction[0] = ap;
1808 fraction[1] = fp;
1809 denominator = ap + fp + 1;
1810out:
1811 for_each_evictable_lru(l) {
1812 int file = is_file_lru(l);
1813 unsigned long scan;
1814
1815 scan = zone_nr_lru_pages(zone, sc, l);
1816 if (priority || noswap) {
1817 scan >>= priority;
1818 scan = div64_u64(scan * fraction[file], denominator);
1819 }
1820 nr[l] = nr_scan_try_batch(scan,
1821 &reclaim_stat->nr_saved_scan[l]);
1822 }
1823}
1824
/*
 * Reclaim/compaction depends on sufficient pages being freed from the LRU
 * until compaction is able to run. should_continue_reclaim() returns true if
 * more pages should be reclaimed such that when the page allocator calls
 * try_to_compact_zone() it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
1832static inline bool should_continue_reclaim(struct zone *zone,
1833 unsigned long nr_reclaimed,
1834 unsigned long nr_scanned,
1835 struct scan_control *sc)
1836{
1837 unsigned long pages_for_compaction;
1838 unsigned long inactive_lru_pages;
1839
1840
1841 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1842 return false;
1843
1844
1845 if (sc->gfp_mask & __GFP_REPEAT) {
1846
1847
1848
1849
1850
1851
1852 if (!nr_reclaimed && !nr_scanned)
1853 return false;
1854 } else {
1855
1856
1857
1858
1859
1860
1861
1862
1863 if (!nr_reclaimed)
1864 return false;
1865 }
1866
1867
1868
1869
1870
1871 pages_for_compaction = (2UL << sc->order);
1872 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1873 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1874 if (sc->nr_reclaimed < pages_for_compaction &&
1875 inactive_lru_pages > pages_for_compaction)
1876 return true;
1877
1878
1879 switch (compaction_suitable(zone, sc->order)) {
1880 case COMPACT_PARTIAL:
1881 case COMPACT_CONTINUE:
1882 return false;
1883 default:
1884 return true;
1885 }
1886}
1887
/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
1891static void shrink_zone(int priority, struct zone *zone,
1892 struct scan_control *sc)
1893{
1894 unsigned long nr[NR_LRU_LISTS];
1895 unsigned long nr_to_scan;
1896 enum lru_list l;
1897 unsigned long nr_reclaimed, nr_scanned;
1898 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1899
1900restart:
1901 nr_reclaimed = 0;
1902 nr_scanned = sc->nr_scanned;
1903 get_scan_count(zone, sc, nr, priority);
1904
1905 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1906 nr[LRU_INACTIVE_FILE]) {
1907 for_each_evictable_lru(l) {
1908 if (nr[l]) {
1909 nr_to_scan = min_t(unsigned long,
1910 nr[l], SWAP_CLUSTER_MAX);
1911 nr[l] -= nr_to_scan;
1912
1913 nr_reclaimed += shrink_list(l, nr_to_scan,
1914 zone, sc, priority);
1915 }
1916 }
1917
1918
1919
1920
1921
1922
1923
1924
1925 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1926 break;
1927 }
1928 sc->nr_reclaimed += nr_reclaimed;
1929
1930
1931
1932
1933
1934 if (inactive_anon_is_low(zone, sc))
1935 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1936
1937
1938 if (should_continue_reclaim(zone, nr_reclaimed,
1939 sc->nr_scanned - nr_scanned, sc))
1940 goto restart;
1941
1942 throttle_vm_writeout(sc->gfp_mask);
1943}
1944
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone),
 * because the caller may be trying to free *extra* pages to satisfy a
 * higher-order allocation.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
1961static void shrink_zones(int priority, struct zonelist *zonelist,
1962 struct scan_control *sc)
1963{
1964 struct zoneref *z;
1965 struct zone *zone;
1966
1967 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1968 gfp_zone(sc->gfp_mask), sc->nodemask) {
1969 if (!populated_zone(zone))
1970 continue;
1971
1972
1973
1974
1975 if (scanning_global_lru(sc)) {
1976 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1977 continue;
1978 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1979 continue;
1980 }
1981
1982 shrink_zone(priority, zone, sc);
1983 }
1984}
1985
1986static bool zone_reclaimable(struct zone *zone)
1987{
1988 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1989}
1990
1991
1992
1993
1994
1995
1996static bool all_unreclaimable(struct zonelist *zonelist,
1997 struct scan_control *sc)
1998{
1999 struct zoneref *z;
2000 struct zone *zone;
2001 bool all_unreclaimable = true;
2002
2003 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2004 gfp_zone(sc->gfp_mask), sc->nodemask) {
2005 if (!populated_zone(zone))
2006 continue;
2007 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2008 continue;
2009 if (zone_reclaimable(zone)) {
2010 all_unreclaimable = false;
2011 break;
2012 }
2013 }
2014
2015 return all_unreclaimable;
2016}
2017
/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
2034static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2035 struct scan_control *sc)
2036{
2037 int priority;
2038 unsigned long total_scanned = 0;
2039 struct reclaim_state *reclaim_state = current->reclaim_state;
2040 struct zoneref *z;
2041 struct zone *zone;
2042 unsigned long writeback_threshold;
2043
2044 get_mems_allowed();
2045 delayacct_freepages_start();
2046
2047 if (scanning_global_lru(sc))
2048 count_vm_event(ALLOCSTALL);
2049
2050 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2051 sc->nr_scanned = 0;
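		/*
		 * At the most aggressive priority, stop honouring the swap
		 * token so that its holder is no longer exempt from reclaim.
		 */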
2052 if (!priority)
2053 disable_swap_token();
2054 shrink_zones(priority, zonelist, sc);
2055
2056
2057
2058
2059 if (scanning_global_lru(sc)) {
2060 unsigned long lru_pages = 0;
2061 for_each_zone_zonelist(zone, z, zonelist,
2062 gfp_zone(sc->gfp_mask)) {
2063 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2064 continue;
2065
2066 lru_pages += zone_reclaimable_pages(zone);
2067 }
2068
2069 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
2070 if (reclaim_state) {
2071 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2072 reclaim_state->reclaimed_slab = 0;
2073 }
2074 }
2075 total_scanned += sc->nr_scanned;
2076 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2077 goto out;
2078
2079
2080
2081
2082
2083
2084
2085
2086 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2087 if (total_scanned > writeback_threshold) {
2088 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
2089 sc->may_writepage = 1;
2090 }
2091
2092
2093 if (!sc->hibernation_mode && sc->nr_scanned &&
2094 priority < DEF_PRIORITY - 2) {
2095 struct zone *preferred_zone;
2096
2097 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2098 &cpuset_current_mems_allowed,
2099 &preferred_zone);
2100 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2101 }
2102 }
2103
2104out:
2105 delayacct_freepages_end();
2106 put_mems_allowed();
2107
2108 if (sc->nr_reclaimed)
2109 return sc->nr_reclaimed;
2110
2111
2112 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2113 return 1;
2114
2115 return 0;
2116}
2117
2118unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2119 gfp_t gfp_mask, nodemask_t *nodemask)
2120{
2121 unsigned long nr_reclaimed;
2122 struct scan_control sc = {
2123 .gfp_mask = gfp_mask,
2124 .may_writepage = !laptop_mode,
2125 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2126 .may_unmap = 1,
2127 .may_swap = 1,
2128 .swappiness = vm_swappiness,
2129 .order = order,
2130 .mem_cgroup = NULL,
2131 .nodemask = nodemask,
2132 };
2133
2134 trace_mm_vmscan_direct_reclaim_begin(order,
2135 sc.may_writepage,
2136 gfp_mask);
2137
2138 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2139
2140 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2141
2142 return nr_reclaimed;
2143}
2144
2145#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2146
2147unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2148 gfp_t gfp_mask, bool noswap,
2149 unsigned int swappiness,
2150 struct zone *zone)
2151{
2152 struct scan_control sc = {
2153 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2154 .may_writepage = !laptop_mode,
2155 .may_unmap = 1,
2156 .may_swap = !noswap,
2157 .swappiness = swappiness,
2158 .order = 0,
2159 .mem_cgroup = mem,
2160 };
2161 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2162 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2163
2164 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
2165 sc.may_writepage,
2166 sc.gfp_mask);
2167
2168
2169
2170
2171
2172
2173
2174
2175 shrink_zone(0, zone, &sc);
2176
2177 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2178
2179 return sc.nr_reclaimed;
2180}
2181
2182unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2183 gfp_t gfp_mask,
2184 bool noswap,
2185 unsigned int swappiness)
2186{
2187 struct zonelist *zonelist;
2188 unsigned long nr_reclaimed;
2189 struct scan_control sc = {
2190 .may_writepage = !laptop_mode,
2191 .may_unmap = 1,
2192 .may_swap = !noswap,
2193 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2194 .swappiness = swappiness,
2195 .order = 0,
2196 .mem_cgroup = mem_cont,
2197 .nodemask = NULL,
2198 };
2199
2200 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2201 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2202 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
2203
2204 trace_mm_vmscan_memcg_reclaim_begin(0,
2205 sc.may_writepage,
2206 sc.gfp_mask);
2207
2208 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2209
2210 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2211
2212 return nr_reclaimed;
2213}
2214#endif
2215
/*
 * pgdat_balanced is used when checking if a node is balanced for high-order
 * allocations.  Only zones that meet watermarks and are in a zone allowed
 * by the caller's classzone_idx are added to balanced_pages.  The total of
 * balanced pages must be more than 25% of the zones allowed by classzone_idx
 * for the node to be considered balanced.  Forcing all zones to be balanced
 * for high orders can cause excessive reclaim when there are imbalanced
 * zones.
 */
2232static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2233 int classzone_idx)
2234{
2235 unsigned long present_pages = 0;
2236 int i;
2237
2238 for (i = 0; i <= classzone_idx; i++)
2239 present_pages += pgdat->node_zones[i].present_pages;
2240
2241 return balanced_pages > (present_pages >> 2);
2242}
2243
/* is kswapd sleeping prematurely? */
2245static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2246 int classzone_idx)
2247{
2248 int i;
2249 unsigned long balanced = 0;
2250 bool all_zones_ok = true;
2251
2252
2253 if (remaining)
2254 return true;
2255
2256
2257 for (i = 0; i < pgdat->nr_zones; i++) {
2258 struct zone *zone = pgdat->node_zones + i;
2259
2260 if (!populated_zone(zone))
2261 continue;
2262
2263
2264
2265
2266
2267
2268
2269 if (zone->all_unreclaimable) {
2270 balanced += zone->present_pages;
2271 continue;
2272 }
2273
2274 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2275 classzone_idx, 0))
2276 all_zones_ok = false;
2277 else
2278 balanced += zone->present_pages;
2279 }
2280
	/*
	 * For high-order requests, the node counts as balanced once more
	 * than 25% of its pages (see pgdat_balanced()) meet the high
	 * watermark; for order-0 every zone must be balanced.  Return true
	 * (sleeping would be premature) while that is not yet the case.
	 */
	if (order)
		return !pgdat_balanced(pgdat, balanced, classzone_idx);
	else
		return !all_zones_ok;
2290}
2291
/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at.
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  What we do is to detect the case where
 * all pages in the zone have been recently referenced and attempt to
 * short-circuit the whole scanning process.
 *
 * kswapd first finds, scanning from the highest zone downwards, the highest
 * zone that is below its high watermark (end_zone), and then reclaims from
 * every zone from DMA up to and including end_zone.
 */
2313static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2314 int *classzone_idx)
2315{
2316 int all_zones_ok;
2317 unsigned long balanced;
2318 int priority;
2319 int i;
2320 int end_zone = 0;
2321 unsigned long total_scanned;
2322 struct reclaim_state *reclaim_state = current->reclaim_state;
2323 struct scan_control sc = {
2324 .gfp_mask = GFP_KERNEL,
2325 .may_unmap = 1,
2326 .may_swap = 1,
2327
2328
2329
2330
2331 .nr_to_reclaim = ULONG_MAX,
2332 .swappiness = vm_swappiness,
2333 .order = order,
2334 .mem_cgroup = NULL,
2335 };
2336loop_again:
2337 total_scanned = 0;
2338 sc.nr_reclaimed = 0;
2339 sc.may_writepage = !laptop_mode;
2340 count_vm_event(PAGEOUTRUN);
2341
2342 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2343 unsigned long lru_pages = 0;
2344 int has_under_min_watermark_zone = 0;
2345
2346
2347 if (!priority)
2348 disable_swap_token();
2349
2350 all_zones_ok = 1;
2351 balanced = 0;
2352
2353
2354
2355
2356
2357 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
2358 struct zone *zone = pgdat->node_zones + i;
2359
2360 if (!populated_zone(zone))
2361 continue;
2362
2363 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2364 continue;
2365
2366
2367
2368
2369
2370 if (inactive_anon_is_low(zone, &sc))
2371 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2372 &sc, priority, 0);
2373
2374 if (!zone_watermark_ok_safe(zone, order,
2375 high_wmark_pages(zone), 0, 0)) {
2376 end_zone = i;
2377 *classzone_idx = i;
2378 break;
2379 }
2380 }
2381 if (i < 0)
2382 goto out;
2383
2384 for (i = 0; i <= end_zone; i++) {
2385 struct zone *zone = pgdat->node_zones + i;
2386
2387 lru_pages += zone_reclaimable_pages(zone);
2388 }
2389
		/*
		 * Now scan the zone in the dma->highmem direction, stopping
		 * at the last zone which needs scanning.
		 *
		 * We do this because the page allocator works in the opposite
		 * direction.  This prevents the page allocator from allocating
		 * pages behind kswapd's direction of progress, which would
		 * cause too much scanning of the lower zones.
		 */
2399 for (i = 0; i <= end_zone; i++) {
2400 int compaction;
2401 struct zone *zone = pgdat->node_zones + i;
2402 int nr_slab;
2403
2404 if (!populated_zone(zone))
2405 continue;
2406
2407 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2408 continue;
2409
2410 sc.nr_scanned = 0;
2411
2412
2413
2414
2415
2416 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2417
2418
2419
2420
2421
2422 if (!zone_watermark_ok_safe(zone, order,
2423 8*high_wmark_pages(zone), end_zone, 0))
2424 shrink_zone(priority, zone, &sc);
2425 reclaim_state->reclaimed_slab = 0;
2426 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
2427 lru_pages);
2428 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2429 total_scanned += sc.nr_scanned;
2430
2431 compaction = 0;
2432 if (order &&
2433 zone_watermark_ok(zone, 0,
2434 high_wmark_pages(zone),
2435 end_zone, 0) &&
2436 !zone_watermark_ok(zone, order,
2437 high_wmark_pages(zone),
2438 end_zone, 0)) {
2439 compact_zone_order(zone,
2440 order,
2441 sc.gfp_mask, false,
2442 COMPACT_MODE_KSWAPD);
2443 compaction = 1;
2444 }
2445
2446 if (zone->all_unreclaimable)
2447 continue;
2448 if (!compaction && nr_slab == 0 &&
2449 !zone_reclaimable(zone))
2450 zone->all_unreclaimable = 1;
2451
2452
2453
2454
2455
2456 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
2457 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2458 sc.may_writepage = 1;
2459
2460 if (!zone_watermark_ok_safe(zone, order,
2461 high_wmark_pages(zone), end_zone, 0)) {
2462 all_zones_ok = 0;
2463
2464
2465
2466
2467
2468 if (!zone_watermark_ok_safe(zone, order,
2469 min_wmark_pages(zone), end_zone, 0))
2470 has_under_min_watermark_zone = 1;
2471 } else {
2472
2473
2474
2475
2476
2477
2478
2479 zone_clear_flag(zone, ZONE_CONGESTED);
2480 if (i <= *classzone_idx)
2481 balanced += zone->present_pages;
2482 }
2483
2484 }
2485 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2486 break;
2487
2488
2489
2490
2491 if (total_scanned && (priority < DEF_PRIORITY - 2)) {
2492 if (has_under_min_watermark_zone)
2493 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2494 else
2495 congestion_wait(BLK_RW_ASYNC, HZ/10);
2496 }
2497
2498
2499
2500
2501
2502
2503
2504 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2505 break;
2506 }
2507out:
2508
2509
2510
2511
2512
2513
2514 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2515 cond_resched();
2516
2517 try_to_freeze();
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
2534 order = sc.order = 0;
2535
2536 goto loop_again;
2537 }
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547 if (order) {
2548 for (i = 0; i <= end_zone; i++) {
2549 struct zone *zone = pgdat->node_zones + i;
2550
2551 if (!populated_zone(zone))
2552 continue;
2553
2554 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2555 continue;
2556
2557
2558 if (!zone_watermark_ok(zone, 0,
2559 high_wmark_pages(zone), 0, 0)) {
2560 order = sc.order = 0;
2561 goto loop_again;
2562 }
2563
2564
2565 zone_clear_flag(zone, ZONE_CONGESTED);
2566 }
2567 }
2568
2569
2570
2571
2572
2573
2574
2575 *classzone_idx = end_zone;
2576 return order;
2577}
2578
2579static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2580{
2581 long remaining = 0;
2582 DEFINE_WAIT(wait);
2583
2584 if (freezing(current) || kthread_should_stop())
2585 return;
2586
2587 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2588
2589
2590 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2591 remaining = schedule_timeout(HZ/10);
2592 finish_wait(&pgdat->kswapd_wait, &wait);
2593 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2594 }
2595
2596
2597
2598
2599
2600 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2601 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2612 schedule();
2613 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2614 } else {
2615 if (remaining)
2616 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2617 else
2618 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2619 }
2620 finish_wait(&pgdat->kswapd_wait, &wait);
2621}
2622
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
2636static int kswapd(void *p)
2637{
2638 unsigned long order;
2639 int classzone_idx;
2640 pg_data_t *pgdat = (pg_data_t*)p;
2641 struct task_struct *tsk = current;
2642
2643 struct reclaim_state reclaim_state = {
2644 .reclaimed_slab = 0,
2645 };
2646 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2647
2648 lockdep_set_current_reclaim_state(GFP_KERNEL);
2649
2650 if (!cpumask_empty(cpumask))
2651 set_cpus_allowed_ptr(tsk, cpumask);
2652 current->reclaim_state = &reclaim_state;
2653
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should never get
	 * caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
2666 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2667 set_freezable();
2668
2669 order = 0;
2670 classzone_idx = MAX_NR_ZONES - 1;
2671 for ( ; ; ) {
2672 unsigned long new_order;
2673 int new_classzone_idx;
2674 int ret;
2675
2676 new_order = pgdat->kswapd_max_order;
2677 new_classzone_idx = pgdat->classzone_idx;
2678 pgdat->kswapd_max_order = 0;
2679 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2680 if (order < new_order || classzone_idx > new_classzone_idx) {
			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation or has tighter zone constraints.
			 */
2685 order = new_order;
2686 classzone_idx = new_classzone_idx;
2687 } else {
2688 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2689 order = pgdat->kswapd_max_order;
2690 classzone_idx = pgdat->classzone_idx;
2691 pgdat->kswapd_max_order = 0;
2692 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2693 }
2694
2695 ret = try_to_freeze();
2696 if (kthread_should_stop())
2697 break;
2698
2699
2700
2701
2702
2703 if (!ret) {
2704 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2705 order = balance_pgdat(pgdat, order, &classzone_idx);
2706 }
2707 }
2708 return 0;
2709}
2710
/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */
2714void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2715{
2716 pg_data_t *pgdat;
2717
2718 if (!populated_zone(zone))
2719 return;
2720
2721 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2722 return;
2723 pgdat = zone->zone_pgdat;
2724 if (pgdat->kswapd_max_order < order) {
2725 pgdat->kswapd_max_order = order;
2726 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2727 }
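	/* Only wake kswapd if it is asleep and the zone is actually low. */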
2728 if (!waitqueue_active(&pgdat->kswapd_wait))
2729 return;
2730 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2731 return;
2732
2733 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2734 wake_up_interruptible(&pgdat->kswapd_wait);
2735}
2736
/*
 * The reclaimable count would be mostly accurate.
 * The less reclaimable pages may be
 * - mlocked pages, which will be moved to unevictable list when encountered
 * - mapped pages, which may require several travels to be reclaimed
 * - dirty pages, which are not "instantly" reclaimable
 */
2744unsigned long global_reclaimable_pages(void)
2745{
2746 int nr;
2747
2748 nr = global_page_state(NR_ACTIVE_FILE) +
2749 global_page_state(NR_INACTIVE_FILE);
2750
2751 if (nr_swap_pages > 0)
2752 nr += global_page_state(NR_ACTIVE_ANON) +
2753 global_page_state(NR_INACTIVE_ANON);
2754
2755 return nr;
2756}
2757
2758unsigned long zone_reclaimable_pages(struct zone *zone)
2759{
2760 int nr;
2761
2762 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2763 zone_page_state(zone, NR_INACTIVE_FILE);
2764
2765 if (nr_swap_pages > 0)
2766 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2767 zone_page_state(zone, NR_INACTIVE_ANON);
2768
2769 return nr;
2770}
2771
2772#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number
 * of reclaimed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
2781unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2782{
2783 struct reclaim_state reclaim_state;
2784 struct scan_control sc = {
2785 .gfp_mask = GFP_HIGHUSER_MOVABLE,
2786 .may_swap = 1,
2787 .may_unmap = 1,
2788 .may_writepage = 1,
2789 .nr_to_reclaim = nr_to_reclaim,
2790 .hibernation_mode = 1,
2791 .swappiness = vm_swappiness,
2792 .order = 0,
2793 };
2794 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2795 struct task_struct *p = current;
2796 unsigned long nr_reclaimed;
2797
2798 p->flags |= PF_MEMALLOC;
2799 lockdep_set_current_reclaim_state(sc.gfp_mask);
2800 reclaim_state.reclaimed_slab = 0;
2801 p->reclaim_state = &reclaim_state;
2802
2803 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2804
2805 p->reclaim_state = NULL;
2806 lockdep_clear_current_reclaim_state();
2807 p->flags &= ~PF_MEMALLOC;
2808
2809 return nr_reclaimed;
2810}
2811#endif
2812
/*
 * It's optimal to keep kswapds on the same CPUs as their memory, but
 * not required for correctness.  So if the last cpu in a node goes
 * away, we get changed to run anywhere: as the first one comes back,
 * restore their cpu bindings.
 */
2817static int __devinit cpu_callback(struct notifier_block *nfb,
2818 unsigned long action, void *hcpu)
2819{
2820 int nid;
2821
2822 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2823 for_each_node_state(nid, N_HIGH_MEMORY) {
2824 pg_data_t *pgdat = NODE_DATA(nid);
2825 const struct cpumask *mask;
2826
2827 mask = cpumask_of_node(pgdat->node_id);
2828
2829 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2830
2831 set_cpus_allowed_ptr(pgdat->kswapd, mask);
2832 }
2833 }
2834 return NOTIFY_OK;
2835}
2836
/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are
 * hot-added.
 */
2841int kswapd_run(int nid)
2842{
2843 pg_data_t *pgdat = NODE_DATA(nid);
2844 int ret = 0;
2845
2846 if (pgdat->kswapd)
2847 return 0;
2848
2849 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
2850 if (IS_ERR(pgdat->kswapd)) {
2851
2852 BUG_ON(system_state == SYSTEM_BOOTING);
2853 printk("Failed to start kswapd on node %d\n",nid);
2854 ret = -1;
2855 }
2856 return ret;
2857}
2858
/*
 * Called by memory hotplug when all memory in a node is offlined.
 */
2862void kswapd_stop(int nid)
2863{
2864 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2865
2866 if (kswapd)
2867 kthread_stop(kswapd);
2868}
2869
2870static int __init kswapd_init(void)
2871{
2872 int nid;
2873
2874 swap_setup();
2875 for_each_node_state(nid, N_HIGH_MEMORY)
2876 kswapd_run(nid);
2877 hotcpu_notifier(cpu_callback, 0);
2878 return 0;
2879}
2880
2881module_init(kswapd_init)
2882
2883#ifdef CONFIG_NUMA
/*
 * Zone reclaim mode
 *
 * If non-zero call zone_reclaim when the number of free pages falls below
 * the watermarks.
 */
2890int zone_reclaim_mode __read_mostly;
2891
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0)	/* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
#define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
2896
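/*
 * Priority for ZONE_RECLAIM.  This determines the fraction of a zone
 * scanned per attempt; 4 scans 1/16th of the zone.
 */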
2902#define ZONE_RECLAIM_PRIORITY 4
2903
2904
2905
2906
2907
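/*
 * Percentage of a zone's pages that must be unmapped file pages for
 * zone reclaim to occur.
 */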
2908int sysctl_min_unmapped_ratio = 1;
2909
2910
2911
2912
2913
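/*
 * If the number of reclaimable slab pages in a zone grows beyond this
 * percentage, slab reclaim needs to occur.
 */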
2914int sysctl_min_slab_ratio = 5;
2915
2916static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
2917{
2918 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
2919 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
2920 zone_page_state(zone, NR_ACTIVE_FILE);
2921
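	/*
	 * NR_FILE_MAPPED can exceed the size of the file LRUs because tmpfs
	 * pages accounted as ANON may also be FILE_MAPPED, so clamp at zero.
	 */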
2927 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
2928}
2929
2930
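/* Work out how many page cache pages we can reclaim in zone_reclaim() */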
2931static long zone_pagecache_reclaimable(struct zone *zone)
2932{
2933 long nr_pagecache_reclaimable;
2934 long delta = 0;
2935
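	/*
	 * With RECLAIM_SWAP all file pages are potentially reclaimable;
	 * otherwise only unmapped file pages count, since mapped pages
	 * cannot be unmapped in this mode.
	 */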
2942 if (zone_reclaim_mode & RECLAIM_SWAP)
2943 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
2944 else
2945 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
2946
2947
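	/* If we cannot write out pages, dirty page cache is not reclaimable */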
2948 if (!(zone_reclaim_mode & RECLAIM_WRITE))
2949 delta += zone_page_state(zone, NR_FILE_DIRTY);
2950
2951
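	/* Watch for any possible underflow due to delta */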
2952 if (unlikely(delta > nr_pagecache_reclaimable))
2953 delta = nr_pagecache_reclaimable;
2954
2955 return nr_pagecache_reclaimable - delta;
2956}
2957
2958
2959
2960
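/*
 * Try to free up some pages from this zone through reclaim.
 */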
2961static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2962{
2963
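	/* Minimum number of pages needed to satisfy the triggering allocation */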
2964 const unsigned long nr_pages = 1 << order;
2965 struct task_struct *p = current;
2966 struct reclaim_state reclaim_state;
2967 int priority;
2968 struct scan_control sc = {
2969 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2970 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2971 .may_swap = 1,
2972 .nr_to_reclaim = max_t(unsigned long, nr_pages,
2973 SWAP_CLUSTER_MAX),
2974 .gfp_mask = gfp_mask,
2975 .swappiness = vm_swappiness,
2976 .order = order,
2977 };
2978 unsigned long nr_slab_pages0, nr_slab_pages1;
2979
2980 cond_resched();
2981
2982
2983
2984
2985
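	/*
	 * Zone reclaim runs in the allocating task's context: mark it as a
	 * memory allocator that may write to swap, and hook up slab reclaim
	 * accounting and lockdep's reclaim annotation.
	 */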
2986 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2987 lockdep_set_current_reclaim_state(gfp_mask);
2988 reclaim_state.reclaimed_slab = 0;
2989 p->reclaim_state = &reclaim_state;
2990
2991 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
2992
2993
2994
2995
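		/*
		 * Free memory by calling shrink_zone() with increasing
		 * priorities until enough pages have been reclaimed.
		 */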
2996 priority = ZONE_RECLAIM_PRIORITY;
2997 do {
2998 shrink_zone(priority, zone, &sc);
2999 priority--;
3000 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3001 }
3002
3003 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3004 if (nr_slab_pages0 > zone->min_slab_pages) {
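		/*
		 * shrink_slab() does not report how many pages it freed in
		 * this zone, so sample NR_SLAB_RECLAIMABLE before and after
		 * and keep shaking the slab caches until they have shrunk by
		 * nr_pages or nothing more can be freed.  Note that
		 * shrink_slab() frees memory from all zones and may take a
		 * long time.
		 */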
3015 for (;;) {
3016 unsigned long lru_pages = zone_reclaimable_pages(zone);
3017
3018
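			/* No reclaimable slab objects or very low memory pressure */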
3019 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
3020 break;
3021
3022
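			/* Freed enough slab to cover the request? */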
3023 nr_slab_pages1 = zone_page_state(zone,
3024 NR_SLAB_RECLAIMABLE);
3025 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3026 break;
3027 }
3028
3029
3030
3031
3032
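		/*
		 * Credit the slab pages actually freed in this zone towards
		 * nr_reclaimed.
		 */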
3033 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3034 if (nr_slab_pages1 < nr_slab_pages0)
3035 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3036 }
3037
3038 p->reclaim_state = NULL;
3039 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3040 lockdep_clear_current_reclaim_state();
3041 return sc.nr_reclaimed >= nr_pages;
3042}
3043
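/*
 * zone_reclaim() - entry point used by the page allocator when a zone is
 * short of free pages; decides whether reclaiming this zone is worthwhile
 * before handing off to __zone_reclaim().
 */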
3044int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3045{
3046 int node_id;
3047 int ret;
3048
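	/*
	 * Only reclaim when the zone is over its unmapped page-cache and
	 * reclaimable-slab thresholds.  Some unmapped page cache is kept so
	 * that file I/O on an otherwise full zone is not thrown away again
	 * immediately.
	 */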
3059 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3060 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3061 return ZONE_RECLAIM_FULL;
3062
3063 if (zone->all_unreclaimable)
3064 return ZONE_RECLAIM_FULL;
3065
3066
3067
3068
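	/*
	 * Do not scan if the allocation should not be delayed, or if we are
	 * already in a reclaim context.
	 */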
3069 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3070 return ZONE_RECLAIM_NOSCAN;
3071
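	/*
	 * Only run zone reclaim on the local node or on nodes that have no
	 * CPUs; this favours the local processor and spreads off-node
	 * allocations as widely as possible.
	 */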
3078 node_id = zone_to_nid(zone);
3079 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3080 return ZONE_RECLAIM_NOSCAN;
3081
3082 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3083 return ZONE_RECLAIM_NOSCAN;
3084
3085 ret = __zone_reclaim(zone, gfp_mask, order);
3086 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3087
3088 if (!ret)
3089 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3090
3091 return ret;
3092}
3093#endif
3094
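/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 * @vma: the VMA in which the page is or will be mapped; may be NULL when
 *       called from the reclaim path
 *
 * A page is unevictable if its mapping is marked unevictable or if it
 * belongs to an mlocked VMA.
 */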
3109int page_evictable(struct page *page, struct vm_area_struct *vma)
3110{
3111
3112 if (mapping_unevictable(page_mapping(page)))
3113 return 0;
3114
3115 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
3116 return 0;
3117
3118 return 1;
3119}
3120
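/*
 * check_move_unevictable_page - check a page for evictability and move it to
 * the appropriate zone LRU list
 * @page: page to check
 * @zone: zone the page belongs to
 *
 * Called with zone->lru_lock held, for a page that is on the LRU and has
 * PG_unevictable set.
 */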
3132static void check_move_unevictable_page(struct page *page, struct zone *zone)
3133{
3134 VM_BUG_ON(PageActive(page));
3135
3136retry:
3137 ClearPageUnevictable(page);
3138 if (page_evictable(page, NULL)) {
3139 enum lru_list l = page_lru_base_type(page);
3140
3141 __dec_zone_state(zone, NR_UNEVICTABLE);
3142 list_move(&page->lru, &zone->lru[l].list);
3143 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
3144 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
3145 __count_vm_event(UNEVICTABLE_PGRESCUED);
3146 } else {
3147
3148
3149
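		/*
		 * Rotate the page back onto the unevictable list; recheck in
		 * case it became evictable while we were moving it.
		 */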
3150 SetPageUnevictable(page);
3151 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
3152 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
3153 if (page_evictable(page, NULL))
3154 goto retry;
3155 }
3156}
3157
3158
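/*
 * scan_mapping_unevictable_pages - scan an address space for evictable pages
 * @mapping: struct address_space to scan
 *
 * Re-check the unevictable pages of @mapping and move any that have become
 * evictable back onto the appropriate LRU list.
 */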
3165void scan_mapping_unevictable_pages(struct address_space *mapping)
3166{
3167 pgoff_t next = 0;
3168 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
3169 PAGE_CACHE_SHIFT;
3170 struct zone *zone;
3171 struct pagevec pvec;
3172
3173 if (mapping->nrpages == 0)
3174 return;
3175
3176 pagevec_init(&pvec, 0);
3177 while (next < end &&
3178 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
3179 int i;
3180 int pg_scanned = 0;
3181
3182 zone = NULL;
3183
3184 for (i = 0; i < pagevec_count(&pvec); i++) {
3185 struct page *page = pvec.pages[i];
3186 pgoff_t page_index = page->index;
3187 struct zone *pagezone = page_zone(page);
3188
3189 pg_scanned++;
3190 if (page_index > next)
3191 next = page_index;
3192 next++;
3193
3194 if (pagezone != zone) {
3195 if (zone)
3196 spin_unlock_irq(&zone->lru_lock);
3197 zone = pagezone;
3198 spin_lock_irq(&zone->lru_lock);
3199 }
3200
3201 if (PageLRU(page) && PageUnevictable(page))
3202 check_move_unevictable_page(page, zone);
3203 }
3204 if (zone)
3205 spin_unlock_irq(&zone->lru_lock);
3206 pagevec_release(&pvec);
3207
3208 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
3209 }
3210
3211}
3212
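/*
 * scan_zone_unevictable_pages - check a zone's unevictable list for pages
 * that have become evictable, a lock-hold batch at a time, and move them to
 * the appropriate LRU list.
 */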
#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL	/* arbitrary lock hold batch size */
3224static void scan_zone_unevictable_pages(struct zone *zone)
3225{
3226 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
3227 unsigned long scan;
3228 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
3229
3230 while (nr_to_scan > 0) {
3231 unsigned long batch_size = min(nr_to_scan,
3232 SCAN_UNEVICTABLE_BATCH_SIZE);
3233
3234 spin_lock_irq(&zone->lru_lock);
3235 for (scan = 0; scan < batch_size; scan++) {
3236 struct page *page = lru_to_page(l_unevictable);
3237
3238 if (!trylock_page(page))
3239 continue;
3240
3241 prefetchw_prev_lru_page(page, l_unevictable, flags);
3242
3243 if (likely(PageLRU(page) && PageUnevictable(page)))
3244 check_move_unevictable_page(page, zone);
3245
3246 unlock_page(page);
3247 }
3248 spin_unlock_irq(&zone->lru_lock);
3249
3250 nr_to_scan -= batch_size;
3251 }
3252}
3253
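/*
 * scan_all_zones_unevictable_pages - scan every zone's unevictable list for
 * evictable pages.  A big hammer, used e.g. after swap is added to a
 * previously swapless system, when formerly unevictable anon pages may have
 * become evictable.
 */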
3266static void scan_all_zones_unevictable_pages(void)
3267{
3268 struct zone *zone;
3269
3270 for_each_zone(zone) {
3271 scan_zone_unevictable_pages(zone);
3272 }
3273}
3274
3275
3276
3277
3278
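/*
 * scan_unevictable_pages sysctl: writing a non-zero value triggers an
 * on-demand re-scan of all zones' unevictable lists.
 */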
3279unsigned long scan_unevictable_pages;
3280
3281int scan_unevictable_handler(struct ctl_table *table, int write,
3282 void __user *buffer,
3283 size_t *length, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
	if (ret)
		return ret;

3287 if (write && *(unsigned long *)table->data)
3288 scan_all_zones_unevictable_pages();
3289
3290 scan_unevictable_pages = 0;
3291 return 0;
3292}
3293
3294#ifdef CONFIG_NUMA
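/*
 * Per-node 'scan_unevictable_pages' sysfs attribute: writing a non-zero
 * value re-scans the node's unevictable lists.
 */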
3300static ssize_t read_scan_unevictable_node(struct sys_device *dev,
3301 struct sysdev_attribute *attr,
3302 char *buf)
3303{
3304 return sprintf(buf, "0\n");
3305}
3306
3307static ssize_t write_scan_unevictable_node(struct sys_device *dev,
3308 struct sysdev_attribute *attr,
3309 const char *buf, size_t count)
3310{
3311 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
3312 struct zone *zone;
	unsigned long res;
	int err = strict_strtoul(buf, 10, &res);

	if (err)
		return err;
	if (!res)
		return 1;	/* zero is a no-op */
3318
3319 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
3320 if (!populated_zone(zone))
3321 continue;
3322 scan_zone_unevictable_pages(zone);
3323 }
3324 return 1;
3325}
3326
3327
3328static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3329 read_scan_unevictable_node,
3330 write_scan_unevictable_node);
3331
3332int scan_unevictable_register_node(struct node *node)
3333{
3334 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
3335}
3336
3337void scan_unevictable_unregister_node(struct node *node)
3338{
3339 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3340}
3341#endif
3342