1
2
3
4
5
6
7
8
9
10
11
12
13
14#include <linux/mm.h>
15#include <linux/module.h>
16#include <linux/gfp.h>
17#include <linux/kernel_stat.h>
18#include <linux/swap.h>
19#include <linux/pagemap.h>
20#include <linux/init.h>
21#include <linux/highmem.h>
22#include <linux/vmstat.h>
23#include <linux/file.h>
24#include <linux/writeback.h>
25#include <linux/blkdev.h>
26#include <linux/buffer_head.h>
27
28#include <linux/mm_inline.h>
29#include <linux/pagevec.h>
30#include <linux/backing-dev.h>
31#include <linux/rmap.h>
32#include <linux/topology.h>
33#include <linux/cpu.h>
34#include <linux/cpuset.h>
35#include <linux/compaction.h>
36#include <linux/notifier.h>
37#include <linux/rwsem.h>
38#include <linux/delay.h>
39#include <linux/kthread.h>
40#include <linux/freezer.h>
41#include <linux/memcontrol.h>
42#include <linux/delayacct.h>
43#include <linux/sysctl.h>
44#include <linux/oom.h>
45
46#include <asm/tlbflush.h>
47#include <asm/div64.h>
48
49#include <linux/swapops.h>
50
51#include "internal.h"
52
53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h>
55
56
57
58
59
60
61
62
63
64
65
66
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73
74struct scan_control {
75
76 unsigned long nr_scanned;
77
78
79 unsigned long nr_reclaimed;
80
81
82 unsigned long nr_to_reclaim;
83
84 unsigned long hibernation_mode;
85
86
87 gfp_t gfp_mask;
88
89 int may_writepage;
90
91
92 int may_unmap;
93
94
95 int may_swap;
96
97 int swappiness;
98
99 int order;
100
101
102
103
104
105 reclaim_mode_t reclaim_mode;
106
107
108 struct mem_cgroup *mem_cgroup;
109
110
111
112
113
114 nodemask_t *nodemask;
115};
116
117#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
118
119#ifdef ARCH_HAS_PREFETCH
120#define prefetch_prev_lru_page(_page, _base, _field) \
121 do { \
122 if ((_page)->lru.prev != _base) { \
123 struct page *prev; \
124 \
125 prev = lru_to_page(&(_page->lru)); \
126 prefetch(&prev->_field); \
127 } \
128 } while (0)
129#else
130#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
131#endif
132
133#ifdef ARCH_HAS_PREFETCHW
134#define prefetchw_prev_lru_page(_page, _base, _field) \
135 do { \
136 if ((_page)->lru.prev != _base) { \
137 struct page *prev; \
138 \
139 prev = lru_to_page(&(_page->lru)); \
140 prefetchw(&prev->_field); \
141 } \
142 } while (0)
143#else
144#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
145#endif
146
147
148
149
150int vm_swappiness = 60;
151long vm_total_pages;
152
153static LIST_HEAD(shrinker_list);
154static DECLARE_RWSEM(shrinker_rwsem);
155
156#ifdef CONFIG_CGROUP_MEM_RES_CTLR
157#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
158#else
159#define scanning_global_lru(sc) (1)
160#endif
161
162static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
163 struct scan_control *sc)
164{
165 if (!scanning_global_lru(sc))
166 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
167
168 return &zone->reclaim_stat;
169}
170
171static unsigned long zone_nr_lru_pages(struct zone *zone,
172 struct scan_control *sc, enum lru_list lru)
173{
174 if (!scanning_global_lru(sc))
175 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
176
177 return zone_page_state(zone, NR_LRU_BASE + lru);
178}
179
180
181
182
183
184void register_shrinker(struct shrinker *shrinker)
185{
186 shrinker->nr = 0;
187 down_write(&shrinker_rwsem);
188 list_add_tail(&shrinker->list, &shrinker_list);
189 up_write(&shrinker_rwsem);
190}
191EXPORT_SYMBOL(register_shrinker);
192
193
194
195
196void unregister_shrinker(struct shrinker *shrinker)
197{
198 down_write(&shrinker_rwsem);
199 list_del(&shrinker->list);
200 up_write(&shrinker_rwsem);
201}
202EXPORT_SYMBOL(unregister_shrinker);
203
204#define SHRINK_BATCH 128
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
225 unsigned long lru_pages)
226{
227 struct shrinker *shrinker;
228 unsigned long ret = 0;
229
230 if (scanned == 0)
231 scanned = SWAP_CLUSTER_MAX;
232
233 if (!down_read_trylock(&shrinker_rwsem))
234 return 1;
235
236 list_for_each_entry(shrinker, &shrinker_list, list) {
237 unsigned long long delta;
238 unsigned long total_scan;
239 unsigned long max_pass;
240
241 max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
242 delta = (4 * scanned) / shrinker->seeks;
243 delta *= max_pass;
244 do_div(delta, lru_pages + 1);
245 shrinker->nr += delta;
246 if (shrinker->nr < 0) {
247 printk(KERN_ERR "shrink_slab: %pF negative objects to "
248 "delete nr=%ld\n",
249 shrinker->shrink, shrinker->nr);
250 shrinker->nr = max_pass;
251 }
252
253
254
255
256
257
258 if (shrinker->nr > max_pass * 2)
259 shrinker->nr = max_pass * 2;
260
261 total_scan = shrinker->nr;
262 shrinker->nr = 0;
263
264 while (total_scan >= SHRINK_BATCH) {
265 long this_scan = SHRINK_BATCH;
266 int shrink_ret;
267 int nr_before;
268
269 nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
270 shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
271 gfp_mask);
272 if (shrink_ret == -1)
273 break;
274 if (shrink_ret < nr_before)
275 ret += nr_before - shrink_ret;
276 count_vm_events(SLABS_SCANNED, this_scan);
277 total_scan -= this_scan;
278
279 cond_resched();
280 }
281
282 shrinker->nr += total_scan;
283 }
284 up_read(&shrinker_rwsem);
285 return ret;
286}
287
288static void set_reclaim_mode(int priority, struct scan_control *sc,
289 bool sync)
290{
291 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
292
293
294
295
296
297
298 if (COMPACTION_BUILD)
299 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
300 else
301 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
302
303
304
305
306
307
308 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
309 sc->reclaim_mode |= syncmode;
310 else if (sc->order && priority < DEF_PRIORITY - 2)
311 sc->reclaim_mode |= syncmode;
312 else
313 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
314}
315
316static void reset_reclaim_mode(struct scan_control *sc)
317{
318 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
319}
320
321static inline int is_page_cache_freeable(struct page *page)
322{
323
324
325
326
327
328 return page_count(page) - page_has_private(page) == 2;
329}
330
331static int may_write_to_queue(struct backing_dev_info *bdi,
332 struct scan_control *sc)
333{
334 if (current->flags & PF_SWAPWRITE)
335 return 1;
336 if (!bdi_write_congested(bdi))
337 return 1;
338 if (bdi == current->backing_dev_info)
339 return 1;
340
341
342 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
343 return 1;
344 return 0;
345}
346
347
348
349
350
351
352
353
354
355
356
357
358
359static void handle_write_error(struct address_space *mapping,
360 struct page *page, int error)
361{
362 lock_page(page);
363 if (page_mapping(page) == mapping)
364 mapping_set_error(mapping, error);
365 unlock_page(page);
366}
367
368
369typedef enum {
370
371 PAGE_KEEP,
372
373 PAGE_ACTIVATE,
374
375 PAGE_SUCCESS,
376
377 PAGE_CLEAN,
378} pageout_t;
379
380
381
382
383
384static pageout_t pageout(struct page *page, struct address_space *mapping,
385 struct scan_control *sc)
386{
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403 if (!is_page_cache_freeable(page))
404 return PAGE_KEEP;
405 if (!mapping) {
406
407
408
409
410 if (page_has_private(page)) {
411 if (try_to_free_buffers(page)) {
412 ClearPageDirty(page);
413 printk("%s: orphaned page\n", __func__);
414 return PAGE_CLEAN;
415 }
416 }
417 return PAGE_KEEP;
418 }
419 if (mapping->a_ops->writepage == NULL)
420 return PAGE_ACTIVATE;
421 if (!may_write_to_queue(mapping->backing_dev_info, sc))
422 return PAGE_KEEP;
423
424 if (clear_page_dirty_for_io(page)) {
425 int res;
426 struct writeback_control wbc = {
427 .sync_mode = WB_SYNC_NONE,
428 .nr_to_write = SWAP_CLUSTER_MAX,
429 .range_start = 0,
430 .range_end = LLONG_MAX,
431 .for_reclaim = 1,
432 };
433
434 SetPageReclaim(page);
435 res = mapping->a_ops->writepage(page, &wbc);
436 if (res < 0)
437 handle_write_error(mapping, page, res);
438 if (res == AOP_WRITEPAGE_ACTIVATE) {
439 ClearPageReclaim(page);
440 return PAGE_ACTIVATE;
441 }
442
443
444
445
446
447
448 if (PageWriteback(page) &&
449 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
450 wait_on_page_writeback(page);
451
452 if (!PageWriteback(page)) {
453
454 ClearPageReclaim(page);
455 }
456 trace_mm_vmscan_writepage(page,
457 trace_reclaim_flags(page, sc->reclaim_mode));
458 inc_zone_page_state(page, NR_VMSCAN_WRITE);
459 return PAGE_SUCCESS;
460 }
461
462 return PAGE_CLEAN;
463}
464
465
466
467
468
469static int __remove_mapping(struct address_space *mapping, struct page *page)
470{
471 BUG_ON(!PageLocked(page));
472 BUG_ON(mapping != page_mapping(page));
473
474 spin_lock_irq(&mapping->tree_lock);
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500 if (!page_freeze_refs(page, 2))
501 goto cannot_free;
502
503 if (unlikely(PageDirty(page))) {
504 page_unfreeze_refs(page, 2);
505 goto cannot_free;
506 }
507
508 if (PageSwapCache(page)) {
509 swp_entry_t swap = { .val = page_private(page) };
510 __delete_from_swap_cache(page);
511 spin_unlock_irq(&mapping->tree_lock);
512 swapcache_free(swap, page);
513 } else {
514 void (*freepage)(struct page *);
515
516 freepage = mapping->a_ops->freepage;
517
518 __delete_from_page_cache(page);
519 spin_unlock_irq(&mapping->tree_lock);
520 mem_cgroup_uncharge_cache_page(page);
521
522 if (freepage != NULL)
523 freepage(page);
524 }
525
526 return 1;
527
528cannot_free:
529 spin_unlock_irq(&mapping->tree_lock);
530 return 0;
531}
532
533
534
535
536
537
538
539int remove_mapping(struct address_space *mapping, struct page *page)
540{
541 if (__remove_mapping(mapping, page)) {
542
543
544
545
546
547 page_unfreeze_refs(page, 1);
548 return 1;
549 }
550 return 0;
551}
552
553
554
555
556
557
558
559
560
561
562void putback_lru_page(struct page *page)
563{
564 int lru;
565 int active = !!TestClearPageActive(page);
566 int was_unevictable = PageUnevictable(page);
567
568 VM_BUG_ON(PageLRU(page));
569
570redo:
571 ClearPageUnevictable(page);
572
573 if (page_evictable(page, NULL)) {
574
575
576
577
578
579
580 lru = active + page_lru_base_type(page);
581 lru_cache_add_lru(page, lru);
582 } else {
583
584
585
586
587 lru = LRU_UNEVICTABLE;
588 add_page_to_unevictable_list(page);
589
590
591
592
593
594
595
596
597
598 smp_mb();
599 }
600
601
602
603
604
605
606 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
607 if (!isolate_lru_page(page)) {
608 put_page(page);
609 goto redo;
610 }
611
612
613
614
615 }
616
617 if (was_unevictable && lru != LRU_UNEVICTABLE)
618 count_vm_event(UNEVICTABLE_PGRESCUED);
619 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
620 count_vm_event(UNEVICTABLE_PGCULLED);
621
622 put_page(page);
623}
624
625enum page_references {
626 PAGEREF_RECLAIM,
627 PAGEREF_RECLAIM_CLEAN,
628 PAGEREF_KEEP,
629 PAGEREF_ACTIVATE,
630};
631
632static enum page_references page_check_references(struct page *page,
633 struct scan_control *sc)
634{
635 int referenced_ptes, referenced_page;
636 unsigned long vm_flags;
637
638 referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
639 referenced_page = TestClearPageReferenced(page);
640
641
642 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
643 return PAGEREF_RECLAIM;
644
645
646
647
648
649 if (vm_flags & VM_LOCKED)
650 return PAGEREF_RECLAIM;
651
652 if (referenced_ptes) {
653 if (PageAnon(page))
654 return PAGEREF_ACTIVATE;
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669 SetPageReferenced(page);
670
671 if (referenced_page)
672 return PAGEREF_ACTIVATE;
673
674 return PAGEREF_KEEP;
675 }
676
677
678 if (referenced_page && !PageSwapBacked(page))
679 return PAGEREF_RECLAIM_CLEAN;
680
681 return PAGEREF_RECLAIM;
682}
683
684static noinline_for_stack void free_page_list(struct list_head *free_pages)
685{
686 struct pagevec freed_pvec;
687 struct page *page, *tmp;
688
689 pagevec_init(&freed_pvec, 1);
690
691 list_for_each_entry_safe(page, tmp, free_pages, lru) {
692 list_del(&page->lru);
693 if (!pagevec_add(&freed_pvec, page)) {
694 __pagevec_free(&freed_pvec);
695 pagevec_reinit(&freed_pvec);
696 }
697 }
698
699 pagevec_free(&freed_pvec);
700}
701
702
703
704
705static unsigned long shrink_page_list(struct list_head *page_list,
706 struct zone *zone,
707 struct scan_control *sc)
708{
709 LIST_HEAD(ret_pages);
710 LIST_HEAD(free_pages);
711 int pgactivate = 0;
712 unsigned long nr_dirty = 0;
713 unsigned long nr_congested = 0;
714 unsigned long nr_reclaimed = 0;
715
716 cond_resched();
717
718 while (!list_empty(page_list)) {
719 enum page_references references;
720 struct address_space *mapping;
721 struct page *page;
722 int may_enter_fs;
723
724 cond_resched();
725
726 page = lru_to_page(page_list);
727 list_del(&page->lru);
728
729 if (!trylock_page(page))
730 goto keep;
731
732 VM_BUG_ON(PageActive(page));
733 VM_BUG_ON(page_zone(page) != zone);
734
735 sc->nr_scanned++;
736
737 if (unlikely(!page_evictable(page, NULL)))
738 goto cull_mlocked;
739
740 if (!sc->may_unmap && page_mapped(page))
741 goto keep_locked;
742
743
744 if (page_mapped(page) || PageSwapCache(page))
745 sc->nr_scanned++;
746
747 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
748 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
749
750 if (PageWriteback(page)) {
751
752
753
754
755
756
757
758
759 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
760 may_enter_fs)
761 wait_on_page_writeback(page);
762 else {
763 unlock_page(page);
764 goto keep_lumpy;
765 }
766 }
767
768 references = page_check_references(page, sc);
769 switch (references) {
770 case PAGEREF_ACTIVATE:
771 goto activate_locked;
772 case PAGEREF_KEEP:
773 goto keep_locked;
774 case PAGEREF_RECLAIM:
775 case PAGEREF_RECLAIM_CLEAN:
776 ;
777 }
778
779
780
781
782
783 if (PageAnon(page) && !PageSwapCache(page)) {
784 if (!(sc->gfp_mask & __GFP_IO))
785 goto keep_locked;
786 if (!add_to_swap(page))
787 goto activate_locked;
788 may_enter_fs = 1;
789 }
790
791 mapping = page_mapping(page);
792
793
794
795
796
797 if (page_mapped(page) && mapping) {
798 switch (try_to_unmap(page, TTU_UNMAP)) {
799 case SWAP_FAIL:
800 goto activate_locked;
801 case SWAP_AGAIN:
802 goto keep_locked;
803 case SWAP_MLOCK:
804 goto cull_mlocked;
805 case SWAP_SUCCESS:
806 ;
807 }
808 }
809
810 if (PageDirty(page)) {
811 nr_dirty++;
812
813 if (references == PAGEREF_RECLAIM_CLEAN)
814 goto keep_locked;
815 if (!may_enter_fs)
816 goto keep_locked;
817 if (!sc->may_writepage)
818 goto keep_locked;
819
820
821 switch (pageout(page, mapping, sc)) {
822 case PAGE_KEEP:
823 nr_congested++;
824 goto keep_locked;
825 case PAGE_ACTIVATE:
826 goto activate_locked;
827 case PAGE_SUCCESS:
828 if (PageWriteback(page))
829 goto keep_lumpy;
830 if (PageDirty(page))
831 goto keep;
832
833
834
835
836
837 if (!trylock_page(page))
838 goto keep;
839 if (PageDirty(page) || PageWriteback(page))
840 goto keep_locked;
841 mapping = page_mapping(page);
842 case PAGE_CLEAN:
843 ;
844 }
845 }
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868 if (page_has_private(page)) {
869 if (!try_to_release_page(page, sc->gfp_mask))
870 goto activate_locked;
871 if (!mapping && page_count(page) == 1) {
872 unlock_page(page);
873 if (put_page_testzero(page))
874 goto free_it;
875 else {
876
877
878
879
880
881
882
883 nr_reclaimed++;
884 continue;
885 }
886 }
887 }
888
889 if (!mapping || !__remove_mapping(mapping, page))
890 goto keep_locked;
891
892
893
894
895
896
897
898
899 __clear_page_locked(page);
900free_it:
901 nr_reclaimed++;
902
903
904
905
906
907 list_add(&page->lru, &free_pages);
908 continue;
909
910cull_mlocked:
911 if (PageSwapCache(page))
912 try_to_free_swap(page);
913 unlock_page(page);
914 putback_lru_page(page);
915 reset_reclaim_mode(sc);
916 continue;
917
918activate_locked:
919
920 if (PageSwapCache(page) && vm_swap_full())
921 try_to_free_swap(page);
922 VM_BUG_ON(PageActive(page));
923 SetPageActive(page);
924 pgactivate++;
925keep_locked:
926 unlock_page(page);
927keep:
928 reset_reclaim_mode(sc);
929keep_lumpy:
930 list_add(&page->lru, &ret_pages);
931 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
932 }
933
934
935
936
937
938
939
940 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
941 zone_set_flag(zone, ZONE_CONGESTED);
942
943 free_page_list(&free_pages);
944
945 list_splice(&ret_pages, page_list);
946 count_vm_events(PGACTIVATE, pgactivate);
947 return nr_reclaimed;
948}
949
950
951
952
953
954
955
956
957
958
959
960int __isolate_lru_page(struct page *page, int mode, int file)
961{
962 int ret = -EINVAL;
963
964
965 if (!PageLRU(page))
966 return ret;
967
968
969
970
971
972
973 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
974 return ret;
975
976 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
977 return ret;
978
979
980
981
982
983
984 if (PageUnevictable(page))
985 return ret;
986
987 ret = -EBUSY;
988
989 if (likely(get_page_unless_zero(page))) {
990
991
992
993
994
995 ClearPageLRU(page);
996 ret = 0;
997 }
998
999 return ret;
1000}
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1023 struct list_head *src, struct list_head *dst,
1024 unsigned long *scanned, int order, int mode, int file)
1025{
1026 unsigned long nr_taken = 0;
1027 unsigned long nr_lumpy_taken = 0;
1028 unsigned long nr_lumpy_dirty = 0;
1029 unsigned long nr_lumpy_failed = 0;
1030 unsigned long scan;
1031
1032 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1033 struct page *page;
1034 unsigned long pfn;
1035 unsigned long end_pfn;
1036 unsigned long page_pfn;
1037 int zone_id;
1038
1039 page = lru_to_page(src);
1040 prefetchw_prev_lru_page(page, src, flags);
1041
1042 VM_BUG_ON(!PageLRU(page));
1043
1044 switch (__isolate_lru_page(page, mode, file)) {
1045 case 0:
1046 list_move(&page->lru, dst);
1047 mem_cgroup_del_lru(page);
1048 nr_taken += hpage_nr_pages(page);
1049 break;
1050
1051 case -EBUSY:
1052
1053 list_move(&page->lru, src);
1054 mem_cgroup_rotate_lru_list(page, page_lru(page));
1055 continue;
1056
1057 default:
1058 BUG();
1059 }
1060
1061 if (!order)
1062 continue;
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073 zone_id = page_zone_id(page);
1074 page_pfn = page_to_pfn(page);
1075 pfn = page_pfn & ~((1 << order) - 1);
1076 end_pfn = pfn + (1 << order);
1077 for (; pfn < end_pfn; pfn++) {
1078 struct page *cursor_page;
1079
1080
1081 if (unlikely(pfn == page_pfn))
1082 continue;
1083
1084
1085 if (unlikely(!pfn_valid_within(pfn)))
1086 break;
1087
1088 cursor_page = pfn_to_page(pfn);
1089
1090
1091 if (unlikely(page_zone_id(cursor_page) != zone_id))
1092 break;
1093
1094
1095
1096
1097
1098
1099 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1100 !PageSwapCache(cursor_page))
1101 break;
1102
1103 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1104 list_move(&cursor_page->lru, dst);
1105 mem_cgroup_del_lru(cursor_page);
1106 nr_taken += hpage_nr_pages(page);
1107 nr_lumpy_taken++;
1108 if (PageDirty(cursor_page))
1109 nr_lumpy_dirty++;
1110 scan++;
1111 } else {
1112
1113 if (!page_count(cursor_page))
1114 continue;
1115 break;
1116 }
1117 }
1118
1119
1120 if (pfn < end_pfn)
1121 nr_lumpy_failed++;
1122 }
1123
1124 *scanned = scan;
1125
1126 trace_mm_vmscan_lru_isolate(order,
1127 nr_to_scan, scan,
1128 nr_taken,
1129 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1130 mode);
1131 return nr_taken;
1132}
1133
1134static unsigned long isolate_pages_global(unsigned long nr,
1135 struct list_head *dst,
1136 unsigned long *scanned, int order,
1137 int mode, struct zone *z,
1138 int active, int file)
1139{
1140 int lru = LRU_BASE;
1141 if (active)
1142 lru += LRU_ACTIVE;
1143 if (file)
1144 lru += LRU_FILE;
1145 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
1146 mode, file);
1147}
1148
1149
1150
1151
1152
1153static unsigned long clear_active_flags(struct list_head *page_list,
1154 unsigned int *count)
1155{
1156 int nr_active = 0;
1157 int lru;
1158 struct page *page;
1159
1160 list_for_each_entry(page, page_list, lru) {
1161 int numpages = hpage_nr_pages(page);
1162 lru = page_lru_base_type(page);
1163 if (PageActive(page)) {
1164 lru += LRU_ACTIVE;
1165 ClearPageActive(page);
1166 nr_active += numpages;
1167 }
1168 if (count)
1169 count[lru] += numpages;
1170 }
1171
1172 return nr_active;
1173}
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200int isolate_lru_page(struct page *page)
1201{
1202 int ret = -EBUSY;
1203
1204 if (PageLRU(page)) {
1205 struct zone *zone = page_zone(page);
1206
1207 spin_lock_irq(&zone->lru_lock);
1208 if (PageLRU(page) && get_page_unless_zero(page)) {
1209 int lru = page_lru(page);
1210 ret = 0;
1211 ClearPageLRU(page);
1212
1213 del_page_from_lru_list(zone, page, lru);
1214 }
1215 spin_unlock_irq(&zone->lru_lock);
1216 }
1217 return ret;
1218}
1219
1220
1221
1222
1223static int too_many_isolated(struct zone *zone, int file,
1224 struct scan_control *sc)
1225{
1226 unsigned long inactive, isolated;
1227
1228 if (current_is_kswapd())
1229 return 0;
1230
1231 if (!scanning_global_lru(sc))
1232 return 0;
1233
1234 if (file) {
1235 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1236 isolated = zone_page_state(zone, NR_ISOLATED_FILE);
1237 } else {
1238 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1239 isolated = zone_page_state(zone, NR_ISOLATED_ANON);
1240 }
1241
1242 return isolated > inactive;
1243}
1244
1245
1246
1247
1248static noinline_for_stack void
1249putback_lru_pages(struct zone *zone, struct scan_control *sc,
1250 unsigned long nr_anon, unsigned long nr_file,
1251 struct list_head *page_list)
1252{
1253 struct page *page;
1254 struct pagevec pvec;
1255 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1256
1257 pagevec_init(&pvec, 1);
1258
1259
1260
1261
1262 spin_lock(&zone->lru_lock);
1263 while (!list_empty(page_list)) {
1264 int lru;
1265 page = lru_to_page(page_list);
1266 VM_BUG_ON(PageLRU(page));
1267 list_del(&page->lru);
1268 if (unlikely(!page_evictable(page, NULL))) {
1269 spin_unlock_irq(&zone->lru_lock);
1270 putback_lru_page(page);
1271 spin_lock_irq(&zone->lru_lock);
1272 continue;
1273 }
1274 SetPageLRU(page);
1275 lru = page_lru(page);
1276 add_page_to_lru_list(zone, page, lru);
1277 if (is_active_lru(lru)) {
1278 int file = is_file_lru(lru);
1279 int numpages = hpage_nr_pages(page);
1280 reclaim_stat->recent_rotated[file] += numpages;
1281 }
1282 if (!pagevec_add(&pvec, page)) {
1283 spin_unlock_irq(&zone->lru_lock);
1284 __pagevec_release(&pvec);
1285 spin_lock_irq(&zone->lru_lock);
1286 }
1287 }
1288 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1289 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1290
1291 spin_unlock_irq(&zone->lru_lock);
1292 pagevec_release(&pvec);
1293}
1294
1295static noinline_for_stack void update_isolated_counts(struct zone *zone,
1296 struct scan_control *sc,
1297 unsigned long *nr_anon,
1298 unsigned long *nr_file,
1299 struct list_head *isolated_list)
1300{
1301 unsigned long nr_active;
1302 unsigned int count[NR_LRU_LISTS] = { 0, };
1303 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1304
1305 nr_active = clear_active_flags(isolated_list, count);
1306 __count_vm_events(PGDEACTIVATE, nr_active);
1307
1308 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1309 -count[LRU_ACTIVE_FILE]);
1310 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1311 -count[LRU_INACTIVE_FILE]);
1312 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1313 -count[LRU_ACTIVE_ANON]);
1314 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1315 -count[LRU_INACTIVE_ANON]);
1316
1317 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1318 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1319 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1320 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1321
1322 reclaim_stat->recent_scanned[0] += *nr_anon;
1323 reclaim_stat->recent_scanned[1] += *nr_file;
1324}
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334static inline bool should_reclaim_stall(unsigned long nr_taken,
1335 unsigned long nr_freed,
1336 int priority,
1337 struct scan_control *sc)
1338{
1339 int lumpy_stall_priority;
1340
1341
1342 if (current_is_kswapd())
1343 return false;
1344
1345
1346 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1347 return false;
1348
1349
1350 if (nr_freed == nr_taken)
1351 return false;
1352
1353
1354
1355
1356
1357
1358
1359 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1360 lumpy_stall_priority = DEF_PRIORITY;
1361 else
1362 lumpy_stall_priority = DEF_PRIORITY / 3;
1363
1364 return priority <= lumpy_stall_priority;
1365}
1366
1367
1368
1369
1370
1371static noinline_for_stack unsigned long
1372shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1373 struct scan_control *sc, int priority, int file)
1374{
1375 LIST_HEAD(page_list);
1376 unsigned long nr_scanned;
1377 unsigned long nr_reclaimed = 0;
1378 unsigned long nr_taken;
1379 unsigned long nr_anon;
1380 unsigned long nr_file;
1381
1382 while (unlikely(too_many_isolated(zone, file, sc))) {
1383 congestion_wait(BLK_RW_ASYNC, HZ/10);
1384
1385
1386 if (fatal_signal_pending(current))
1387 return SWAP_CLUSTER_MAX;
1388 }
1389
1390 set_reclaim_mode(priority, sc, false);
1391 lru_add_drain();
1392 spin_lock_irq(&zone->lru_lock);
1393
1394 if (scanning_global_lru(sc)) {
1395 nr_taken = isolate_pages_global(nr_to_scan,
1396 &page_list, &nr_scanned, sc->order,
1397 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1398 ISOLATE_BOTH : ISOLATE_INACTIVE,
1399 zone, 0, file);
1400 zone->pages_scanned += nr_scanned;
1401 if (current_is_kswapd())
1402 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1403 nr_scanned);
1404 else
1405 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1406 nr_scanned);
1407 } else {
1408 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1409 &page_list, &nr_scanned, sc->order,
1410 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1411 ISOLATE_BOTH : ISOLATE_INACTIVE,
1412 zone, sc->mem_cgroup,
1413 0, file);
1414
1415
1416
1417
1418 }
1419
1420 if (nr_taken == 0) {
1421 spin_unlock_irq(&zone->lru_lock);
1422 return 0;
1423 }
1424
1425 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1426
1427 spin_unlock_irq(&zone->lru_lock);
1428
1429 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1430
1431
1432 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1433 set_reclaim_mode(priority, sc, true);
1434 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1435 }
1436
1437 local_irq_disable();
1438 if (current_is_kswapd())
1439 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1440 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1441
1442 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1443
1444 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1445 zone_idx(zone),
1446 nr_scanned, nr_reclaimed,
1447 priority,
1448 trace_shrink_flags(file, sc->reclaim_mode));
1449 return nr_reclaimed;
1450}
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470static void move_active_pages_to_lru(struct zone *zone,
1471 struct list_head *list,
1472 enum lru_list lru)
1473{
1474 unsigned long pgmoved = 0;
1475 struct pagevec pvec;
1476 struct page *page;
1477
1478 pagevec_init(&pvec, 1);
1479
1480 while (!list_empty(list)) {
1481 page = lru_to_page(list);
1482
1483 VM_BUG_ON(PageLRU(page));
1484 SetPageLRU(page);
1485
1486 list_move(&page->lru, &zone->lru[lru].list);
1487 mem_cgroup_add_lru_list(page, lru);
1488 pgmoved += hpage_nr_pages(page);
1489
1490 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1491 spin_unlock_irq(&zone->lru_lock);
1492 if (buffer_heads_over_limit)
1493 pagevec_strip(&pvec);
1494 __pagevec_release(&pvec);
1495 spin_lock_irq(&zone->lru_lock);
1496 }
1497 }
1498 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1499 if (!is_active_lru(lru))
1500 __count_vm_events(PGDEACTIVATE, pgmoved);
1501}
1502
1503static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1504 struct scan_control *sc, int priority, int file)
1505{
1506 unsigned long nr_taken;
1507 unsigned long pgscanned;
1508 unsigned long vm_flags;
1509 LIST_HEAD(l_hold);
1510 LIST_HEAD(l_active);
1511 LIST_HEAD(l_inactive);
1512 struct page *page;
1513 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1514 unsigned long nr_rotated = 0;
1515
1516 lru_add_drain();
1517 spin_lock_irq(&zone->lru_lock);
1518 if (scanning_global_lru(sc)) {
1519 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1520 &pgscanned, sc->order,
1521 ISOLATE_ACTIVE, zone,
1522 1, file);
1523 zone->pages_scanned += pgscanned;
1524 } else {
1525 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1526 &pgscanned, sc->order,
1527 ISOLATE_ACTIVE, zone,
1528 sc->mem_cgroup, 1, file);
1529
1530
1531
1532
1533 }
1534
1535 reclaim_stat->recent_scanned[file] += nr_taken;
1536
1537 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1538 if (file)
1539 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1540 else
1541 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1542 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1543 spin_unlock_irq(&zone->lru_lock);
1544
1545 while (!list_empty(&l_hold)) {
1546 cond_resched();
1547 page = lru_to_page(&l_hold);
1548 list_del(&page->lru);
1549
1550 if (unlikely(!page_evictable(page, NULL))) {
1551 putback_lru_page(page);
1552 continue;
1553 }
1554
1555 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1556 nr_rotated += hpage_nr_pages(page);
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1567 list_add(&page->lru, &l_active);
1568 continue;
1569 }
1570 }
1571
1572 ClearPageActive(page);
1573 list_add(&page->lru, &l_inactive);
1574 }
1575
1576
1577
1578
1579 spin_lock_irq(&zone->lru_lock);
1580
1581
1582
1583
1584
1585
1586 reclaim_stat->recent_rotated[file] += nr_rotated;
1587
1588 move_active_pages_to_lru(zone, &l_active,
1589 LRU_ACTIVE + file * LRU_FILE);
1590 move_active_pages_to_lru(zone, &l_inactive,
1591 LRU_BASE + file * LRU_FILE);
1592 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1593 spin_unlock_irq(&zone->lru_lock);
1594}
1595
1596#ifdef CONFIG_SWAP
1597static int inactive_anon_is_low_global(struct zone *zone)
1598{
1599 unsigned long active, inactive;
1600
1601 active = zone_page_state(zone, NR_ACTIVE_ANON);
1602 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1603
1604 if (inactive * zone->inactive_ratio < active)
1605 return 1;
1606
1607 return 0;
1608}
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1619{
1620 int low;
1621
1622
1623
1624
1625
1626 if (!total_swap_pages)
1627 return 0;
1628
1629 if (scanning_global_lru(sc))
1630 low = inactive_anon_is_low_global(zone);
1631 else
1632 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1633 return low;
1634}
1635#else
1636static inline int inactive_anon_is_low(struct zone *zone,
1637 struct scan_control *sc)
1638{
1639 return 0;
1640}
1641#endif
1642
1643static int inactive_file_is_low_global(struct zone *zone)
1644{
1645 unsigned long active, inactive;
1646
1647 active = zone_page_state(zone, NR_ACTIVE_FILE);
1648 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1649
1650 return (active > inactive);
1651}
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1669{
1670 int low;
1671
1672 if (scanning_global_lru(sc))
1673 low = inactive_file_is_low_global(zone);
1674 else
1675 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
1676 return low;
1677}
1678
1679static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
1680 int file)
1681{
1682 if (file)
1683 return inactive_file_is_low(zone, sc);
1684 else
1685 return inactive_anon_is_low(zone, sc);
1686}
1687
1688static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1689 struct zone *zone, struct scan_control *sc, int priority)
1690{
1691 int file = is_file_lru(lru);
1692
1693 if (is_active_lru(lru)) {
1694 if (inactive_list_is_low(zone, sc, file))
1695 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1696 return 0;
1697 }
1698
1699 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1700}
1701
1702
1703
1704
1705
1706static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1707 unsigned long *nr_saved_scan)
1708{
1709 unsigned long nr;
1710
1711 *nr_saved_scan += nr_to_scan;
1712 nr = *nr_saved_scan;
1713
1714 if (nr >= SWAP_CLUSTER_MAX)
1715 *nr_saved_scan = 0;
1716 else
1717 nr = 0;
1718
1719 return nr;
1720}
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730static void get_scan_count(struct zone *zone, struct scan_control *sc,
1731 unsigned long *nr, int priority)
1732{
1733 unsigned long anon, file, free;
1734 unsigned long anon_prio, file_prio;
1735 unsigned long ap, fp;
1736 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1737 u64 fraction[2], denominator;
1738 enum lru_list l;
1739 int noswap = 0;
1740
1741
1742 if (!sc->may_swap || (nr_swap_pages <= 0)) {
1743 noswap = 1;
1744 fraction[0] = 0;
1745 fraction[1] = 1;
1746 denominator = 1;
1747 goto out;
1748 }
1749
1750 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1751 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1752 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1753 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1754
1755 if (scanning_global_lru(sc)) {
1756 free = zone_page_state(zone, NR_FREE_PAGES);
1757
1758
1759 if (unlikely(file + free <= high_wmark_pages(zone))) {
1760 fraction[0] = 1;
1761 fraction[1] = 0;
1762 denominator = 1;
1763 goto out;
1764 }
1765 }
1766
1767
1768
1769
1770
1771 anon_prio = sc->swappiness;
1772 file_prio = 200 - sc->swappiness;
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785 spin_lock_irq(&zone->lru_lock);
1786 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1787 reclaim_stat->recent_scanned[0] /= 2;
1788 reclaim_stat->recent_rotated[0] /= 2;
1789 }
1790
1791 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1792 reclaim_stat->recent_scanned[1] /= 2;
1793 reclaim_stat->recent_rotated[1] /= 2;
1794 }
1795
1796
1797
1798
1799
1800
1801 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1802 ap /= reclaim_stat->recent_rotated[0] + 1;
1803
1804 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1805 fp /= reclaim_stat->recent_rotated[1] + 1;
1806 spin_unlock_irq(&zone->lru_lock);
1807
1808 fraction[0] = ap;
1809 fraction[1] = fp;
1810 denominator = ap + fp + 1;
1811out:
1812 for_each_evictable_lru(l) {
1813 int file = is_file_lru(l);
1814 unsigned long scan;
1815
1816 scan = zone_nr_lru_pages(zone, sc, l);
1817 if (priority || noswap) {
1818 scan >>= priority;
1819 scan = div64_u64(scan * fraction[file], denominator);
1820 }
1821 nr[l] = nr_scan_try_batch(scan,
1822 &reclaim_stat->nr_saved_scan[l]);
1823 }
1824}
1825
1826
1827
1828
1829
1830
1831
1832
1833static inline bool should_continue_reclaim(struct zone *zone,
1834 unsigned long nr_reclaimed,
1835 unsigned long nr_scanned,
1836 struct scan_control *sc)
1837{
1838 unsigned long pages_for_compaction;
1839 unsigned long inactive_lru_pages;
1840
1841
1842 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1843 return false;
1844
1845
1846 if (sc->gfp_mask & __GFP_REPEAT) {
1847
1848
1849
1850
1851
1852
1853 if (!nr_reclaimed && !nr_scanned)
1854 return false;
1855 } else {
1856
1857
1858
1859
1860
1861
1862
1863
1864 if (!nr_reclaimed)
1865 return false;
1866 }
1867
1868
1869
1870
1871
1872 pages_for_compaction = (2UL << sc->order);
1873 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1874 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1875 if (sc->nr_reclaimed < pages_for_compaction &&
1876 inactive_lru_pages > pages_for_compaction)
1877 return true;
1878
1879
1880 switch (compaction_suitable(zone, sc->order)) {
1881 case COMPACT_PARTIAL:
1882 case COMPACT_CONTINUE:
1883 return false;
1884 default:
1885 return true;
1886 }
1887}
1888
1889
1890
1891
1892static void shrink_zone(int priority, struct zone *zone,
1893 struct scan_control *sc)
1894{
1895 unsigned long nr[NR_LRU_LISTS];
1896 unsigned long nr_to_scan;
1897 enum lru_list l;
1898 unsigned long nr_reclaimed, nr_scanned;
1899 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1900
1901restart:
1902 nr_reclaimed = 0;
1903 nr_scanned = sc->nr_scanned;
1904 get_scan_count(zone, sc, nr, priority);
1905
1906 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1907 nr[LRU_INACTIVE_FILE]) {
1908 for_each_evictable_lru(l) {
1909 if (nr[l]) {
1910 nr_to_scan = min_t(unsigned long,
1911 nr[l], SWAP_CLUSTER_MAX);
1912 nr[l] -= nr_to_scan;
1913
1914 nr_reclaimed += shrink_list(l, nr_to_scan,
1915 zone, sc, priority);
1916 }
1917 }
1918
1919
1920
1921
1922
1923
1924
1925
1926 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1927 break;
1928 }
1929 sc->nr_reclaimed += nr_reclaimed;
1930
1931
1932
1933
1934
1935 if (inactive_anon_is_low(zone, sc))
1936 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1937
1938
1939 if (should_continue_reclaim(zone, nr_reclaimed,
1940 sc->nr_scanned - nr_scanned, sc))
1941 goto restart;
1942
1943 throttle_vm_writeout(sc->gfp_mask);
1944}
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962static void shrink_zones(int priority, struct zonelist *zonelist,
1963 struct scan_control *sc)
1964{
1965 struct zoneref *z;
1966 struct zone *zone;
1967
1968 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1969 gfp_zone(sc->gfp_mask), sc->nodemask) {
1970 if (!populated_zone(zone))
1971 continue;
1972
1973
1974
1975
1976 if (scanning_global_lru(sc)) {
1977 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1978 continue;
1979 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1980 continue;
1981 }
1982
1983 shrink_zone(priority, zone, sc);
1984 }
1985}
1986
1987static bool zone_reclaimable(struct zone *zone)
1988{
1989 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1990}
1991
1992
1993static bool all_unreclaimable(struct zonelist *zonelist,
1994 struct scan_control *sc)
1995{
1996 struct zoneref *z;
1997 struct zone *zone;
1998
1999 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2000 gfp_zone(sc->gfp_mask), sc->nodemask) {
2001 if (!populated_zone(zone))
2002 continue;
2003 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2004 continue;
2005 if (!zone->all_unreclaimable)
2006 return false;
2007 }
2008
2009 return true;
2010}
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2029 struct scan_control *sc)
2030{
2031 int priority;
2032 unsigned long total_scanned = 0;
2033 struct reclaim_state *reclaim_state = current->reclaim_state;
2034 struct zoneref *z;
2035 struct zone *zone;
2036 unsigned long writeback_threshold;
2037
2038 get_mems_allowed();
2039 delayacct_freepages_start();
2040
2041 if (scanning_global_lru(sc))
2042 count_vm_event(ALLOCSTALL);
2043
2044 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2045 sc->nr_scanned = 0;
2046 if (!priority)
2047 disable_swap_token();
2048 shrink_zones(priority, zonelist, sc);
2049
2050
2051
2052
2053 if (scanning_global_lru(sc)) {
2054 unsigned long lru_pages = 0;
2055 for_each_zone_zonelist(zone, z, zonelist,
2056 gfp_zone(sc->gfp_mask)) {
2057 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2058 continue;
2059
2060 lru_pages += zone_reclaimable_pages(zone);
2061 }
2062
2063 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
2064 if (reclaim_state) {
2065 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2066 reclaim_state->reclaimed_slab = 0;
2067 }
2068 }
2069 total_scanned += sc->nr_scanned;
2070 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2071 goto out;
2072
2073
2074
2075
2076
2077
2078
2079
2080 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2081 if (total_scanned > writeback_threshold) {
2082 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
2083 sc->may_writepage = 1;
2084 }
2085
2086
2087 if (!sc->hibernation_mode && sc->nr_scanned &&
2088 priority < DEF_PRIORITY - 2) {
2089 struct zone *preferred_zone;
2090
2091 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2092 &cpuset_current_mems_allowed,
2093 &preferred_zone);
2094 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2095 }
2096 }
2097
2098out:
2099 delayacct_freepages_end();
2100 put_mems_allowed();
2101
2102 if (sc->nr_reclaimed)
2103 return sc->nr_reclaimed;
2104
2105
2106
2107
2108
2109
2110 if (oom_killer_disabled)
2111 return 0;
2112
2113
2114 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2115 return 1;
2116
2117 return 0;
2118}
2119
2120unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2121 gfp_t gfp_mask, nodemask_t *nodemask)
2122{
2123 unsigned long nr_reclaimed;
2124 struct scan_control sc = {
2125 .gfp_mask = gfp_mask,
2126 .may_writepage = !laptop_mode,
2127 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2128 .may_unmap = 1,
2129 .may_swap = 1,
2130 .swappiness = vm_swappiness,
2131 .order = order,
2132 .mem_cgroup = NULL,
2133 .nodemask = nodemask,
2134 };
2135
2136 trace_mm_vmscan_direct_reclaim_begin(order,
2137 sc.may_writepage,
2138 gfp_mask);
2139
2140 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2141
2142 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2143
2144 return nr_reclaimed;
2145}
2146
2147#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2148
2149unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2150 gfp_t gfp_mask, bool noswap,
2151 unsigned int swappiness,
2152 struct zone *zone)
2153{
2154 struct scan_control sc = {
2155 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2156 .may_writepage = !laptop_mode,
2157 .may_unmap = 1,
2158 .may_swap = !noswap,
2159 .swappiness = swappiness,
2160 .order = 0,
2161 .mem_cgroup = mem,
2162 };
2163 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2164 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2165
2166 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
2167 sc.may_writepage,
2168 sc.gfp_mask);
2169
2170
2171
2172
2173
2174
2175
2176
2177 shrink_zone(0, zone, &sc);
2178
2179 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2180
2181 return sc.nr_reclaimed;
2182}
2183
2184unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2185 gfp_t gfp_mask,
2186 bool noswap,
2187 unsigned int swappiness)
2188{
2189 struct zonelist *zonelist;
2190 unsigned long nr_reclaimed;
2191 struct scan_control sc = {
2192 .may_writepage = !laptop_mode,
2193 .may_unmap = 1,
2194 .may_swap = !noswap,
2195 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2196 .swappiness = swappiness,
2197 .order = 0,
2198 .mem_cgroup = mem_cont,
2199 .nodemask = NULL,
2200 };
2201
2202 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2203 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2204 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
2205
2206 trace_mm_vmscan_memcg_reclaim_begin(0,
2207 sc.may_writepage,
2208 sc.gfp_mask);
2209
2210 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2211
2212 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2213
2214 return nr_reclaimed;
2215}
2216#endif
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2235 int classzone_idx)
2236{
2237 unsigned long present_pages = 0;
2238 int i;
2239
2240 for (i = 0; i <= classzone_idx; i++)
2241 present_pages += pgdat->node_zones[i].present_pages;
2242
2243 return balanced_pages > (present_pages >> 2);
2244}
2245
2246
2247static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2248 int classzone_idx)
2249{
2250 int i;
2251 unsigned long balanced = 0;
2252 bool all_zones_ok = true;
2253
2254
2255 if (remaining)
2256 return true;
2257
2258
2259 for (i = 0; i < pgdat->nr_zones; i++) {
2260 struct zone *zone = pgdat->node_zones + i;
2261
2262 if (!populated_zone(zone))
2263 continue;
2264
2265
2266
2267
2268
2269
2270
2271 if (zone->all_unreclaimable) {
2272 balanced += zone->present_pages;
2273 continue;
2274 }
2275
2276 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2277 classzone_idx, 0))
2278 all_zones_ok = false;
2279 else
2280 balanced += zone->present_pages;
2281 }
2282
2283
2284
2285
2286
2287
2288 if (order)
2289 return pgdat_balanced(pgdat, balanced, classzone_idx);
2290 else
2291 return !all_zones_ok;
2292}
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2316 int *classzone_idx)
2317{
2318 int all_zones_ok;
2319 unsigned long balanced;
2320 int priority;
2321 int i;
2322 int end_zone = 0;
2323 unsigned long total_scanned;
2324 struct reclaim_state *reclaim_state = current->reclaim_state;
2325 struct scan_control sc = {
2326 .gfp_mask = GFP_KERNEL,
2327 .may_unmap = 1,
2328 .may_swap = 1,
2329
2330
2331
2332
2333 .nr_to_reclaim = ULONG_MAX,
2334 .swappiness = vm_swappiness,
2335 .order = order,
2336 .mem_cgroup = NULL,
2337 };
2338loop_again:
2339 total_scanned = 0;
2340 sc.nr_reclaimed = 0;
2341 sc.may_writepage = !laptop_mode;
2342 count_vm_event(PAGEOUTRUN);
2343
2344 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2345 unsigned long lru_pages = 0;
2346 int has_under_min_watermark_zone = 0;
2347
2348
2349 if (!priority)
2350 disable_swap_token();
2351
2352 all_zones_ok = 1;
2353 balanced = 0;
2354
2355
2356
2357
2358
2359 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
2360 struct zone *zone = pgdat->node_zones + i;
2361
2362 if (!populated_zone(zone))
2363 continue;
2364
2365 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2366 continue;
2367
2368
2369
2370
2371
2372 if (inactive_anon_is_low(zone, &sc))
2373 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2374 &sc, priority, 0);
2375
2376 if (!zone_watermark_ok_safe(zone, order,
2377 high_wmark_pages(zone), 0, 0)) {
2378 end_zone = i;
2379 *classzone_idx = i;
2380 break;
2381 }
2382 }
2383 if (i < 0)
2384 goto out;
2385
2386 for (i = 0; i <= end_zone; i++) {
2387 struct zone *zone = pgdat->node_zones + i;
2388
2389 lru_pages += zone_reclaimable_pages(zone);
2390 }
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401 for (i = 0; i <= end_zone; i++) {
2402 struct zone *zone = pgdat->node_zones + i;
2403 int nr_slab;
2404 unsigned long balance_gap;
2405
2406 if (!populated_zone(zone))
2407 continue;
2408
2409 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2410 continue;
2411
2412 sc.nr_scanned = 0;
2413
2414
2415
2416
2417
2418 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428 balance_gap = min(low_wmark_pages(zone),
2429 (zone->present_pages +
2430 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2431 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2432 if (!zone_watermark_ok_safe(zone, order,
2433 high_wmark_pages(zone) + balance_gap,
2434 end_zone, 0))
2435 shrink_zone(priority, zone, &sc);
2436 reclaim_state->reclaimed_slab = 0;
2437 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
2438 lru_pages);
2439 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2440 total_scanned += sc.nr_scanned;
2441
2442 if (zone->all_unreclaimable)
2443 continue;
2444 if (nr_slab == 0 &&
2445 !zone_reclaimable(zone))
2446 zone->all_unreclaimable = 1;
2447
2448
2449
2450
2451
2452 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
2453 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2454 sc.may_writepage = 1;
2455
2456 if (!zone_watermark_ok_safe(zone, order,
2457 high_wmark_pages(zone), end_zone, 0)) {
2458 all_zones_ok = 0;
2459
2460
2461
2462
2463
2464 if (!zone_watermark_ok_safe(zone, order,
2465 min_wmark_pages(zone), end_zone, 0))
2466 has_under_min_watermark_zone = 1;
2467 } else {
2468
2469
2470
2471
2472
2473
2474
2475 zone_clear_flag(zone, ZONE_CONGESTED);
2476 if (i <= *classzone_idx)
2477 balanced += zone->present_pages;
2478 }
2479
2480 }
2481 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2482 break;
2483
2484
2485
2486
2487 if (total_scanned && (priority < DEF_PRIORITY - 2)) {
2488 if (has_under_min_watermark_zone)
2489 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2490 else
2491 congestion_wait(BLK_RW_ASYNC, HZ/10);
2492 }
2493
2494
2495
2496
2497
2498
2499
2500 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2501 break;
2502 }
2503out:
2504
2505
2506
2507
2508
2509
2510 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2511 cond_resched();
2512
2513 try_to_freeze();
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
2530 order = sc.order = 0;
2531
2532 goto loop_again;
2533 }
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543 if (order) {
2544 for (i = 0; i <= end_zone; i++) {
2545 struct zone *zone = pgdat->node_zones + i;
2546
2547 if (!populated_zone(zone))
2548 continue;
2549
2550 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2551 continue;
2552
2553
2554 if (!zone_watermark_ok(zone, 0,
2555 high_wmark_pages(zone), 0, 0)) {
2556 order = sc.order = 0;
2557 goto loop_again;
2558 }
2559
2560
2561 zone_clear_flag(zone, ZONE_CONGESTED);
2562 }
2563 }
2564
2565
2566
2567
2568
2569
2570
2571 *classzone_idx = end_zone;
2572 return order;
2573}
2574
2575static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2576{
2577 long remaining = 0;
2578 DEFINE_WAIT(wait);
2579
2580 if (freezing(current) || kthread_should_stop())
2581 return;
2582
2583 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2584
2585
2586 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2587 remaining = schedule_timeout(HZ/10);
2588 finish_wait(&pgdat->kswapd_wait, &wait);
2589 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2590 }
2591
2592
2593
2594
2595
2596 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2597 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2608 schedule();
2609 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2610 } else {
2611 if (remaining)
2612 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2613 else
2614 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2615 }
2616 finish_wait(&pgdat->kswapd_wait, &wait);
2617}
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632static int kswapd(void *p)
2633{
2634 unsigned long order;
2635 int classzone_idx;
2636 pg_data_t *pgdat = (pg_data_t*)p;
2637 struct task_struct *tsk = current;
2638
2639 struct reclaim_state reclaim_state = {
2640 .reclaimed_slab = 0,
2641 };
2642 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2643
2644 lockdep_set_current_reclaim_state(GFP_KERNEL);
2645
2646 if (!cpumask_empty(cpumask))
2647 set_cpus_allowed_ptr(tsk, cpumask);
2648 current->reclaim_state = &reclaim_state;
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2663 set_freezable();
2664
2665 order = 0;
2666 classzone_idx = MAX_NR_ZONES - 1;
2667 for ( ; ; ) {
2668 unsigned long new_order;
2669 int new_classzone_idx;
2670 int ret;
2671
2672 new_order = pgdat->kswapd_max_order;
2673 new_classzone_idx = pgdat->classzone_idx;
2674 pgdat->kswapd_max_order = 0;
2675 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2676 if (order < new_order || classzone_idx > new_classzone_idx) {
2677
2678
2679
2680
2681 order = new_order;
2682 classzone_idx = new_classzone_idx;
2683 } else {
2684 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2685 order = pgdat->kswapd_max_order;
2686 classzone_idx = pgdat->classzone_idx;
2687 pgdat->kswapd_max_order = 0;
2688 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2689 }
2690
2691 ret = try_to_freeze();
2692 if (kthread_should_stop())
2693 break;
2694
2695
2696
2697
2698
2699 if (!ret) {
2700 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2701 order = balance_pgdat(pgdat, order, &classzone_idx);
2702 }
2703 }
2704 return 0;
2705}
2706
2707
2708
2709
2710void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2711{
2712 pg_data_t *pgdat;
2713
2714 if (!populated_zone(zone))
2715 return;
2716
2717 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2718 return;
2719 pgdat = zone->zone_pgdat;
2720 if (pgdat->kswapd_max_order < order) {
2721 pgdat->kswapd_max_order = order;
2722 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2723 }
2724 if (!waitqueue_active(&pgdat->kswapd_wait))
2725 return;
2726 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2727 return;
2728
2729 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2730 wake_up_interruptible(&pgdat->kswapd_wait);
2731}
2732
2733
2734
2735
2736
2737
2738
2739
2740unsigned long global_reclaimable_pages(void)
2741{
2742 int nr;
2743
2744 nr = global_page_state(NR_ACTIVE_FILE) +
2745 global_page_state(NR_INACTIVE_FILE);
2746
2747 if (nr_swap_pages > 0)
2748 nr += global_page_state(NR_ACTIVE_ANON) +
2749 global_page_state(NR_INACTIVE_ANON);
2750
2751 return nr;
2752}
2753
2754unsigned long zone_reclaimable_pages(struct zone *zone)
2755{
2756 int nr;
2757
2758 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
2759 zone_page_state(zone, NR_INACTIVE_FILE);
2760
2761 if (nr_swap_pages > 0)
2762 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
2763 zone_page_state(zone, NR_INACTIVE_ANON);
2764
2765 return nr;
2766}
2767
2768#ifdef CONFIG_HIBERNATION
2769
2770
2771
2772
2773
2774
2775
2776
2777unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2778{
2779 struct reclaim_state reclaim_state;
2780 struct scan_control sc = {
2781 .gfp_mask = GFP_HIGHUSER_MOVABLE,
2782 .may_swap = 1,
2783 .may_unmap = 1,
2784 .may_writepage = 1,
2785 .nr_to_reclaim = nr_to_reclaim,
2786 .hibernation_mode = 1,
2787 .swappiness = vm_swappiness,
2788 .order = 0,
2789 };
2790 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2791 struct task_struct *p = current;
2792 unsigned long nr_reclaimed;
2793
2794 p->flags |= PF_MEMALLOC;
2795 lockdep_set_current_reclaim_state(sc.gfp_mask);
2796 reclaim_state.reclaimed_slab = 0;
2797 p->reclaim_state = &reclaim_state;
2798
2799 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2800
2801 p->reclaim_state = NULL;
2802 lockdep_clear_current_reclaim_state();
2803 p->flags &= ~PF_MEMALLOC;
2804
2805 return nr_reclaimed;
2806}
2807#endif
2808
2809
2810
2811
2812
2813static int __devinit cpu_callback(struct notifier_block *nfb,
2814 unsigned long action, void *hcpu)
2815{
2816 int nid;
2817
2818 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2819 for_each_node_state(nid, N_HIGH_MEMORY) {
2820 pg_data_t *pgdat = NODE_DATA(nid);
2821 const struct cpumask *mask;
2822
2823 mask = cpumask_of_node(pgdat->node_id);
2824
2825 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2826
2827 set_cpus_allowed_ptr(pgdat->kswapd, mask);
2828 }
2829 }
2830 return NOTIFY_OK;
2831}
2832
2833
2834
2835
2836
2837int kswapd_run(int nid)
2838{
2839 pg_data_t *pgdat = NODE_DATA(nid);
2840 int ret = 0;
2841
2842 if (pgdat->kswapd)
2843 return 0;
2844
2845 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
2846 if (IS_ERR(pgdat->kswapd)) {
2847
2848 BUG_ON(system_state == SYSTEM_BOOTING);
2849 printk("Failed to start kswapd on node %d\n",nid);
2850 ret = -1;
2851 }
2852 return ret;
2853}
2854
2855
2856
2857
2858void kswapd_stop(int nid)
2859{
2860 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2861
2862 if (kswapd)
2863 kthread_stop(kswapd);
2864}
2865
2866static int __init kswapd_init(void)
2867{
2868 int nid;
2869
2870 swap_setup();
2871 for_each_node_state(nid, N_HIGH_MEMORY)
2872 kswapd_run(nid);
2873 hotcpu_notifier(cpu_callback, 0);
2874 return 0;
2875}
2876
2877module_init(kswapd_init)
2878
2879#ifdef CONFIG_NUMA
2880
2881
2882
2883
2884
2885
2886int zone_reclaim_mode __read_mostly;
2887
2888#define RECLAIM_OFF 0
2889#define RECLAIM_ZONE (1<<0)
2890#define RECLAIM_WRITE (1<<1)
2891#define RECLAIM_SWAP (1<<2)
2892
2893
2894
2895
2896
2897
2898#define ZONE_RECLAIM_PRIORITY 4
2899
2900
2901
2902
2903
2904int sysctl_min_unmapped_ratio = 1;
2905
2906
2907
2908
2909
2910int sysctl_min_slab_ratio = 5;
2911
2912static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
2913{
2914 unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
2915 unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
2916 zone_page_state(zone, NR_ACTIVE_FILE);
2917
2918
2919
2920
2921
2922
2923 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
2924}
2925
2926
2927static long zone_pagecache_reclaimable(struct zone *zone)
2928{
2929 long nr_pagecache_reclaimable;
2930 long delta = 0;
2931
2932
2933
2934
2935
2936
2937
2938 if (zone_reclaim_mode & RECLAIM_SWAP)
2939 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
2940 else
2941 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
2942
2943
2944 if (!(zone_reclaim_mode & RECLAIM_WRITE))
2945 delta += zone_page_state(zone, NR_FILE_DIRTY);
2946
2947
2948 if (unlikely(delta > nr_pagecache_reclaimable))
2949 delta = nr_pagecache_reclaimable;
2950
2951 return nr_pagecache_reclaimable - delta;
2952}
2953
2954
2955
2956
2957static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2958{
2959
2960 const unsigned long nr_pages = 1 << order;
2961 struct task_struct *p = current;
2962 struct reclaim_state reclaim_state;
2963 int priority;
2964 struct scan_control sc = {
2965 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2966 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2967 .may_swap = 1,
2968 .nr_to_reclaim = max_t(unsigned long, nr_pages,
2969 SWAP_CLUSTER_MAX),
2970 .gfp_mask = gfp_mask,
2971 .swappiness = vm_swappiness,
2972 .order = order,
2973 };
2974 unsigned long nr_slab_pages0, nr_slab_pages1;
2975
2976 cond_resched();
2977
2978
2979
2980
2981
2982 p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
2983 lockdep_set_current_reclaim_state(gfp_mask);
2984 reclaim_state.reclaimed_slab = 0;
2985 p->reclaim_state = &reclaim_state;
2986
2987 if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
2988
2989
2990
2991
2992 priority = ZONE_RECLAIM_PRIORITY;
2993 do {
2994 shrink_zone(priority, zone, &sc);
2995 priority--;
2996 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2997 }
2998
2999 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3000 if (nr_slab_pages0 > zone->min_slab_pages) {
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011 for (;;) {
3012 unsigned long lru_pages = zone_reclaimable_pages(zone);
3013
3014
3015 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
3016 break;
3017
3018
3019 nr_slab_pages1 = zone_page_state(zone,
3020 NR_SLAB_RECLAIMABLE);
3021 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
3022 break;
3023 }
3024
3025
3026
3027
3028
3029 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
3030 if (nr_slab_pages1 < nr_slab_pages0)
3031 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
3032 }
3033
3034 p->reclaim_state = NULL;
3035 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
3036 lockdep_clear_current_reclaim_state();
3037 return sc.nr_reclaimed >= nr_pages;
3038}
3039
3040int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3041{
3042 int node_id;
3043 int ret;
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055 if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
3056 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3057 return ZONE_RECLAIM_FULL;
3058
3059 if (zone->all_unreclaimable)
3060 return ZONE_RECLAIM_FULL;
3061
3062
3063
3064
3065 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
3066 return ZONE_RECLAIM_NOSCAN;
3067
3068
3069
3070
3071
3072
3073
3074 node_id = zone_to_nid(zone);
3075 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
3076 return ZONE_RECLAIM_NOSCAN;
3077
3078 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
3079 return ZONE_RECLAIM_NOSCAN;
3080
3081 ret = __zone_reclaim(zone, gfp_mask, order);
3082 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
3083
3084 if (!ret)
3085 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3086
3087 return ret;
3088}
3089#endif
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105int page_evictable(struct page *page, struct vm_area_struct *vma)
3106{
3107
3108 if (mapping_unevictable(page_mapping(page)))
3109 return 0;
3110
3111 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
3112 return 0;
3113
3114 return 1;
3115}
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128static void check_move_unevictable_page(struct page *page, struct zone *zone)
3129{
3130 VM_BUG_ON(PageActive(page));
3131
3132retry:
3133 ClearPageUnevictable(page);
3134 if (page_evictable(page, NULL)) {
3135 enum lru_list l = page_lru_base_type(page);
3136
3137 __dec_zone_state(zone, NR_UNEVICTABLE);
3138 list_move(&page->lru, &zone->lru[l].list);
3139 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
3140 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
3141 __count_vm_event(UNEVICTABLE_PGRESCUED);
3142 } else {
3143
3144
3145
3146 SetPageUnevictable(page);
3147 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
3148 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
3149 if (page_evictable(page, NULL))
3150 goto retry;
3151 }
3152}
3153
3154
3155
3156
3157
3158
3159
3160
3161void scan_mapping_unevictable_pages(struct address_space *mapping)
3162{
3163 pgoff_t next = 0;
3164 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
3165 PAGE_CACHE_SHIFT;
3166 struct zone *zone;
3167 struct pagevec pvec;
3168
3169 if (mapping->nrpages == 0)
3170 return;
3171
3172 pagevec_init(&pvec, 0);
3173 while (next < end &&
3174 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
3175 int i;
3176 int pg_scanned = 0;
3177
3178 zone = NULL;
3179
3180 for (i = 0; i < pagevec_count(&pvec); i++) {
3181 struct page *page = pvec.pages[i];
3182 pgoff_t page_index = page->index;
3183 struct zone *pagezone = page_zone(page);
3184
3185 pg_scanned++;
3186 if (page_index > next)
3187 next = page_index;
3188 next++;
3189
3190 if (pagezone != zone) {
3191 if (zone)
3192 spin_unlock_irq(&zone->lru_lock);
3193 zone = pagezone;
3194 spin_lock_irq(&zone->lru_lock);
3195 }
3196
3197 if (PageLRU(page) && PageUnevictable(page))
3198 check_move_unevictable_page(page, zone);
3199 }
3200 if (zone)
3201 spin_unlock_irq(&zone->lru_lock);
3202 pagevec_release(&pvec);
3203
3204 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
3205 }
3206
3207}
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL
3220static void scan_zone_unevictable_pages(struct zone *zone)
3221{
3222 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
3223 unsigned long scan;
3224 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
3225
3226 while (nr_to_scan > 0) {
3227 unsigned long batch_size = min(nr_to_scan,
3228 SCAN_UNEVICTABLE_BATCH_SIZE);
3229
3230 spin_lock_irq(&zone->lru_lock);
3231 for (scan = 0; scan < batch_size; scan++) {
3232 struct page *page = lru_to_page(l_unevictable);
3233
3234 if (!trylock_page(page))
3235 continue;
3236
3237 prefetchw_prev_lru_page(page, l_unevictable, flags);
3238
3239 if (likely(PageLRU(page) && PageUnevictable(page)))
3240 check_move_unevictable_page(page, zone);
3241
3242 unlock_page(page);
3243 }
3244 spin_unlock_irq(&zone->lru_lock);
3245
3246 nr_to_scan -= batch_size;
3247 }
3248}
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262static void scan_all_zones_unevictable_pages(void)
3263{
3264 struct zone *zone;
3265
3266 for_each_zone(zone) {
3267 scan_zone_unevictable_pages(zone);
3268 }
3269}
3270
3271
3272
3273
3274
3275unsigned long scan_unevictable_pages;
3276
3277int scan_unevictable_handler(struct ctl_table *table, int write,
3278 void __user *buffer,
3279 size_t *length, loff_t *ppos)
3280{
3281 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3282
3283 if (write && *(unsigned long *)table->data)
3284 scan_all_zones_unevictable_pages();
3285
3286 scan_unevictable_pages = 0;
3287 return 0;
3288}
3289
3290#ifdef CONFIG_NUMA
3291
3292
3293
3294
3295
3296static ssize_t read_scan_unevictable_node(struct sys_device *dev,
3297 struct sysdev_attribute *attr,
3298 char *buf)
3299{
3300 return sprintf(buf, "0\n");
3301}
3302
3303static ssize_t write_scan_unevictable_node(struct sys_device *dev,
3304 struct sysdev_attribute *attr,
3305 const char *buf, size_t count)
3306{
3307 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
3308 struct zone *zone;
3309 unsigned long res;
3310 unsigned long req = strict_strtoul(buf, 10, &res);
3311
3312 if (!req)
3313 return 1;
3314
3315 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
3316 if (!populated_zone(zone))
3317 continue;
3318 scan_zone_unevictable_pages(zone);
3319 }
3320 return 1;
3321}
3322
3323
3324static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
3325 read_scan_unevictable_node,
3326 write_scan_unevictable_node);
3327
3328int scan_unevictable_register_node(struct node *node)
3329{
3330 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
3331}
3332
3333void scan_unevictable_unregister_node(struct node *node)
3334{
3335 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3336}
3337#endif
3338