/*
 *  linux/mm/vmscan.c
 *
 *  Page reclaim: the LRU scanning and slab-cache shrinking done by
 *  kswapd and by direct reclaim.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Allocation order */
	int order;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/* Scan (total_size >> priority) pages at once */
	int priority;

	/* The highest zone to isolate pages for reclaim from */
	enum zone_type reclaim_idx;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low,
	 * unless we threaten to OOM. If any cgroups are skipped due to
	 * memory.low and nothing was reclaimed, go back for memory.low.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;
};

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
/*
 * The total number of pages which are beyond the high watermark within all
 * zones.
 */
unsigned long vm_total_pages;

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}

/**
 * sane_reclaim - is the usual dirty throttling mechanism operational?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
static bool sane_reclaim(struct scan_control *sc)
{
	struct mem_cgroup *memcg = sc->target_mem_cgroup;

	if (!memcg)
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}

static bool sane_reclaim(struct scan_control *sc)
{
	return true;
}
#endif

/*
 * This misses isolated pages which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

	return nr;
}

unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
{
	unsigned long nr;

	nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
	     node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
	     node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);

	if (get_nr_swap_pages() > 0)
		nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
		      node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
		      node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);

	return nr;
}

/**
 * lruvec_lru_size - Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider, use MAX_NR_ZONES for the whole LRU list
 */
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
	unsigned long lru_size;
	int zid;

	if (!mem_cgroup_disabled())
		lru_size = mem_cgroup_get_lru_size(lruvec, lru);
	else
		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);

	/* Subtract the pages that sit in zones above @zone_idx */
	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
		unsigned long size;

		if (!managed_zone(zone))
			continue;

		if (!mem_cgroup_disabled())
			size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
		else
			size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
					       NR_ZONE_LRU_BASE + lru);
		lru_size -= min(size, lru_size);
	}

	return lru_size;
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int register_shrinker(struct shrinker *shrinker)
{
	size_t size = sizeof(*shrinker->nr_deferred);

	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
	return 0;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker->nr_deferred);
}
EXPORT_SYMBOL(unregister_shrinker);
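
/*
 * Example only: a minimal, hypothetical shrinker user to illustrate the
 * registration API above.  "my_count", "my_scan", "my_shrinker" and the
 * my_cache_*() helpers are illustrative names, not part of this file:
 *
 *	static unsigned long my_count(struct shrinker *s,
 *				      struct shrink_control *sc)
 *	{
 *		return my_cache_object_count();	// 0 means "nothing to do"
 *	}
 *
 *	static unsigned long my_scan(struct shrinker *s,
 *				     struct shrink_control *sc)
 *	{
 *		// return the number of objects freed, or SHRINK_STOP
 *		return my_cache_trim(sc->nr_to_scan);
 *	}
 *
 *	static struct shrinker my_shrinker = {
 *		.count_objects	= my_count,
 *		.scan_objects	= my_scan,
 *		.seeks		= DEFAULT_SEEKS,
 *	};
 *
 *	register_shrinker(&my_shrinker);	// typically in module init
 *	...
 *	unregister_shrinker(&my_shrinker);	// and in module exit
 */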
305
306#define SHRINK_BATCH 128
307
308static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
309 struct shrinker *shrinker,
310 unsigned long nr_scanned,
311 unsigned long nr_eligible)
312{
313 unsigned long freed = 0;
314 unsigned long long delta;
315 long total_scan;
316 long freeable;
317 long nr;
318 long new_nr;
319 int nid = shrinkctl->nid;
320 long batch_size = shrinker->batch ? shrinker->batch
321 : SHRINK_BATCH;
322 long scanned = 0, next_deferred;
323
324 freeable = shrinker->count_objects(shrinker, shrinkctl);
325 if (freeable == 0)
326 return 0;
327
328
329
330
331
332
333 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
334
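	/*
	 * Proportional slab pressure (illustrative reading of the code below,
	 * not an extra tunable): with "seeks" describing how expensive it is
	 * to recreate an object, scan roughly
	 *
	 *	delta = (4 * nr_scanned / seeks) * freeable / (nr_eligible + 1)
	 *
	 * objects on top of whatever was deferred from previous passes, i.e.
	 * the cache is shrunk in proportion to LRU scan pressure.
	 */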
335 total_scan = nr;
336 delta = (4 * nr_scanned) / shrinker->seeks;
337 delta *= freeable;
338 do_div(delta, nr_eligible + 1);
339 total_scan += delta;
340 if (total_scan < 0) {
341 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
342 shrinker->scan_objects, total_scan);
343 total_scan = freeable;
344 next_deferred = nr;
345 } else
346 next_deferred = total_scan;
347
348
349
350
351
352
353
354
355
356
357
358
359
360 if (delta < freeable / 4)
361 total_scan = min(total_scan, freeable / 2);
362
363
364
365
366
367
368 if (total_scan > freeable * 2)
369 total_scan = freeable * 2;
370
371 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
372 nr_scanned, nr_eligible,
373 freeable, delta, total_scan);
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390 while (total_scan >= batch_size ||
391 total_scan >= freeable) {
392 unsigned long ret;
393 unsigned long nr_to_scan = min(batch_size, total_scan);
394
395 shrinkctl->nr_to_scan = nr_to_scan;
396 ret = shrinker->scan_objects(shrinker, shrinkctl);
397 if (ret == SHRINK_STOP)
398 break;
399 freed += ret;
400
401 count_vm_events(SLABS_SCANNED, nr_to_scan);
402 total_scan -= nr_to_scan;
403 scanned += nr_to_scan;
404
405 cond_resched();
406 }
407
408 if (next_deferred >= scanned)
409 next_deferred -= scanned;
410 else
411 next_deferred = 0;
412
413
414
415
416
417 if (next_deferred > 0)
418 new_nr = atomic_long_add_return(next_deferred,
419 &shrinker->nr_deferred[nid]);
420 else
421 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
422
423 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
424 return freed;
425}
426
/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @nr_scanned: pressure numerator
 * @nr_eligible: pressure denominator
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target.  If it is not NULL,
 * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
 * the objects charged to that cgroup; otherwise only the memcg-unaware
 * shrinkers are called.
 *
 * @nr_scanned and @nr_eligible form a ratio that indicates how much of
 * the available objects should be scanned; do_shrink_slab() applies it
 * per shrinker, scaled by the shrinker's "seeks" cost.
 *
 * Returns the number of reclaimed slab objects.
 */
455static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
456 struct mem_cgroup *memcg,
457 unsigned long nr_scanned,
458 unsigned long nr_eligible)
459{
460 struct shrinker *shrinker;
461 unsigned long freed = 0;
462
463 if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
464 return 0;
465
466 if (nr_scanned == 0)
467 nr_scanned = SWAP_CLUSTER_MAX;
468
469 if (!down_read_trylock(&shrinker_rwsem)) {
470
471
472
473
474
475
476 freed = 1;
477 goto out;
478 }
479
480 list_for_each_entry(shrinker, &shrinker_list, list) {
481 struct shrink_control sc = {
482 .gfp_mask = gfp_mask,
483 .nid = nid,
484 .memcg = memcg,
485 };
486
487
488
489
490
491
492 if (memcg_kmem_enabled() &&
493 !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
494 continue;
495
496 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
497 sc.nid = 0;
498
499 freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
500 }
501
502 up_read(&shrinker_rwsem);
503out:
504 cond_resched();
505 return freed;
506}
507
508void drop_slab_node(int nid)
509{
510 unsigned long freed;
511
512 do {
513 struct mem_cgroup *memcg = NULL;
514
515 freed = 0;
516 do {
517 freed += shrink_slab(GFP_KERNEL, nid, memcg,
518 1000, 1000);
519 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
520 } while (freed > 10);
521}
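
/*
 * drop_slab_node() keeps cycling through every memcg on the node until a
 * full pass frees no more than a handful of objects; drop_slab() below just
 * repeats that for each online node (this is what backs, for example,
 * "echo 2 > /proc/sys/vm/drop_caches").
 */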
522
523void drop_slab(void)
524{
525 int nid;
526
527 for_each_online_node(nid)
528 drop_slab_node(nid);
529}
530
531static inline int is_page_cache_freeable(struct page *page)
532{
533
534
535
536
537
538 return page_count(page) - page_has_private(page) == 2;
539}
540
541static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
542{
543 if (current->flags & PF_SWAPWRITE)
544 return 1;
545 if (!inode_write_congested(inode))
546 return 1;
547 if (inode_to_bdi(inode) == current->backing_dev_info)
548 return 1;
549 return 0;
550}
551
552
553
554
555
556
557
558
559
560
561
562
563
564static void handle_write_error(struct address_space *mapping,
565 struct page *page, int error)
566{
567 lock_page(page);
568 if (page_mapping(page) == mapping)
569 mapping_set_error(mapping, error);
570 unlock_page(page);
571}
572
/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
			 struct scan_control *sc)
{
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608 if (!is_page_cache_freeable(page))
609 return PAGE_KEEP;
610 if (!mapping) {
611
612
613
614
615 if (page_has_private(page)) {
616 if (try_to_free_buffers(page)) {
617 ClearPageDirty(page);
618 pr_info("%s: orphaned page\n", __func__);
619 return PAGE_CLEAN;
620 }
621 }
622 return PAGE_KEEP;
623 }
624 if (mapping->a_ops->writepage == NULL)
625 return PAGE_ACTIVATE;
626 if (!may_write_to_inode(mapping->host, sc))
627 return PAGE_KEEP;
628
629 if (clear_page_dirty_for_io(page)) {
630 int res;
631 struct writeback_control wbc = {
632 .sync_mode = WB_SYNC_NONE,
633 .nr_to_write = SWAP_CLUSTER_MAX,
634 .range_start = 0,
635 .range_end = LLONG_MAX,
636 .for_reclaim = 1,
637 };
638
639 SetPageReclaim(page);
640 res = mapping->a_ops->writepage(page, &wbc);
641 if (res < 0)
642 handle_write_error(mapping, page, res);
643 if (res == AOP_WRITEPAGE_ACTIVATE) {
644 ClearPageReclaim(page);
645 return PAGE_ACTIVATE;
646 }
647
648 if (!PageWriteback(page)) {
649
650 ClearPageReclaim(page);
651 }
652 trace_mm_vmscan_writepage(page);
653 inc_node_page_state(page, NR_VMSCAN_WRITE);
654 return PAGE_SUCCESS;
655 }
656
657 return PAGE_CLEAN;
658}
659
660
661
662
663
664static int __remove_mapping(struct address_space *mapping, struct page *page,
665 bool reclaimed)
666{
667 unsigned long flags;
668
669 BUG_ON(!PageLocked(page));
670 BUG_ON(mapping != page_mapping(page));
671
672 spin_lock_irqsave(&mapping->tree_lock, flags);
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698 if (!page_ref_freeze(page, 2))
699 goto cannot_free;
700
701 if (unlikely(PageDirty(page))) {
702 page_ref_unfreeze(page, 2);
703 goto cannot_free;
704 }
705
706 if (PageSwapCache(page)) {
707 swp_entry_t swap = { .val = page_private(page) };
708 mem_cgroup_swapout(page, swap);
709 __delete_from_swap_cache(page);
710 spin_unlock_irqrestore(&mapping->tree_lock, flags);
711 swapcache_free(swap);
712 } else {
713 void (*freepage)(struct page *);
714 void *shadow = NULL;
715
716 freepage = mapping->a_ops->freepage;
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733 if (reclaimed && page_is_file_cache(page) &&
734 !mapping_exiting(mapping) && !dax_mapping(mapping))
735 shadow = workingset_eviction(mapping, page);
736 __delete_from_page_cache(page, shadow);
737 spin_unlock_irqrestore(&mapping->tree_lock, flags);
738
739 if (freepage != NULL)
740 freepage(page);
741 }
742
743 return 1;
744
745cannot_free:
746 spin_unlock_irqrestore(&mapping->tree_lock, flags);
747 return 0;
748}
749
/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
758 if (__remove_mapping(mapping, page, false)) {
759
760
761
762
763
764 page_ref_unfreeze(page, 1);
765 return 1;
766 }
767 return 0;
768}
769
/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page)
{
781 bool is_unevictable;
782 int was_unevictable = PageUnevictable(page);
783
784 VM_BUG_ON_PAGE(PageLRU(page), page);
785
786redo:
787 ClearPageUnevictable(page);
788
789 if (page_evictable(page)) {
790
791
792
793
794
795
796 is_unevictable = false;
797 lru_cache_add(page);
798 } else {
799
800
801
802
803 is_unevictable = true;
804 add_page_to_unevictable_list(page);
805
806
807
808
809
810
811
812
813
814
815 smp_mb();
816 }
817
818
819
820
821
822
823 if (is_unevictable && page_evictable(page)) {
824 if (!isolate_lru_page(page)) {
825 put_page(page);
826 goto redo;
827 }
828
829
830
831
832 }
833
834 if (was_unevictable && !is_unevictable)
835 count_vm_event(UNEVICTABLE_PGRESCUED);
836 else if (!was_unevictable && is_unevictable)
837 count_vm_event(UNEVICTABLE_PGCULLED);
838
839 put_page(page);
840}
841
842enum page_references {
843 PAGEREF_RECLAIM,
844 PAGEREF_RECLAIM_CLEAN,
845 PAGEREF_KEEP,
846 PAGEREF_ACTIVATE,
847};
848
849static enum page_references page_check_references(struct page *page,
850 struct scan_control *sc)
851{
852 int referenced_ptes, referenced_page;
853 unsigned long vm_flags;
854
855 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
856 &vm_flags);
857 referenced_page = TestClearPageReferenced(page);
858
859
860
861
862
863 if (vm_flags & VM_LOCKED)
864 return PAGEREF_RECLAIM;
865
866 if (referenced_ptes) {
867 if (PageSwapBacked(page))
868 return PAGEREF_ACTIVATE;
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883 SetPageReferenced(page);
884
885 if (referenced_page || referenced_ptes > 1)
886 return PAGEREF_ACTIVATE;
887
888
889
890
891 if (vm_flags & VM_EXEC)
892 return PAGEREF_ACTIVATE;
893
894 return PAGEREF_KEEP;
895 }
896
897
898 if (referenced_page && !PageSwapBacked(page))
899 return PAGEREF_RECLAIM_CLEAN;
900
901 return PAGEREF_RECLAIM;
902}
903
904
905static void page_check_dirty_writeback(struct page *page,
906 bool *dirty, bool *writeback)
907{
908 struct address_space *mapping;
909
910
911
912
913
914 if (!page_is_file_cache(page) ||
915 (PageAnon(page) && !PageSwapBacked(page))) {
916 *dirty = false;
917 *writeback = false;
918 return;
919 }
920
921
922 *dirty = PageDirty(page);
923 *writeback = PageWriteback(page);
924
925
926 if (!page_has_private(page))
927 return;
928
929 mapping = page_mapping(page);
930 if (mapping && mapping->a_ops->is_dirty_writeback)
931 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
932}
933
934struct reclaim_stat {
935 unsigned nr_dirty;
936 unsigned nr_unqueued_dirty;
937 unsigned nr_congested;
938 unsigned nr_writeback;
939 unsigned nr_immediate;
940 unsigned nr_activate;
941 unsigned nr_ref_keep;
942 unsigned nr_unmap_fail;
943};
944
/*
 * shrink_page_list() returns the number of reclaimed pages
 */
948static unsigned long shrink_page_list(struct list_head *page_list,
949 struct pglist_data *pgdat,
950 struct scan_control *sc,
951 enum ttu_flags ttu_flags,
952 struct reclaim_stat *stat,
953 bool force_reclaim)
954{
955 LIST_HEAD(ret_pages);
956 LIST_HEAD(free_pages);
957 int pgactivate = 0;
958 unsigned nr_unqueued_dirty = 0;
959 unsigned nr_dirty = 0;
960 unsigned nr_congested = 0;
961 unsigned nr_reclaimed = 0;
962 unsigned nr_writeback = 0;
963 unsigned nr_immediate = 0;
964 unsigned nr_ref_keep = 0;
965 unsigned nr_unmap_fail = 0;
966
967 cond_resched();
968
969 while (!list_empty(page_list)) {
970 struct address_space *mapping;
971 struct page *page;
972 int may_enter_fs;
973 enum page_references references = PAGEREF_RECLAIM_CLEAN;
974 bool dirty, writeback;
975
976 cond_resched();
977
978 page = lru_to_page(page_list);
979 list_del(&page->lru);
980
981 if (!trylock_page(page))
982 goto keep;
983
984 VM_BUG_ON_PAGE(PageActive(page), page);
985
986 sc->nr_scanned++;
987
988 if (unlikely(!page_evictable(page)))
989 goto activate_locked;
990
991 if (!sc->may_unmap && page_mapped(page))
992 goto keep_locked;
993
994
995 if ((page_mapped(page) || PageSwapCache(page)) &&
996 !(PageAnon(page) && !PageSwapBacked(page)))
997 sc->nr_scanned++;
998
999 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1000 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1001
1002
1003
1004
1005
1006
1007
1008 page_check_dirty_writeback(page, &dirty, &writeback);
1009 if (dirty || writeback)
1010 nr_dirty++;
1011
1012 if (dirty && !writeback)
1013 nr_unqueued_dirty++;
1014
1015
1016
1017
1018
1019
1020
1021 mapping = page_mapping(page);
1022 if (((dirty || writeback) && mapping &&
1023 inode_write_congested(mapping->host)) ||
1024 (writeback && PageReclaim(page)))
1025 nr_congested++;
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069 if (PageWriteback(page)) {
1070
1071 if (current_is_kswapd() &&
1072 PageReclaim(page) &&
1073 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1074 nr_immediate++;
1075 goto activate_locked;
1076
1077
1078 } else if (sane_reclaim(sc) ||
1079 !PageReclaim(page) || !may_enter_fs) {
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091 SetPageReclaim(page);
1092 nr_writeback++;
1093 goto activate_locked;
1094
1095
1096 } else {
1097 unlock_page(page);
1098 wait_on_page_writeback(page);
1099
1100 list_add_tail(&page->lru, page_list);
1101 continue;
1102 }
1103 }
1104
1105 if (!force_reclaim)
1106 references = page_check_references(page, sc);
1107
1108 switch (references) {
1109 case PAGEREF_ACTIVATE:
1110 goto activate_locked;
1111 case PAGEREF_KEEP:
1112 nr_ref_keep++;
1113 goto keep_locked;
1114 case PAGEREF_RECLAIM:
1115 case PAGEREF_RECLAIM_CLEAN:
1116 ;
1117 }
1118
1119
1120
1121
1122
1123
1124 if (PageAnon(page) && PageSwapBacked(page) &&
1125 !PageSwapCache(page)) {
1126 if (!(sc->gfp_mask & __GFP_IO))
1127 goto keep_locked;
1128 if (!add_to_swap(page, page_list))
1129 goto activate_locked;
1130 may_enter_fs = 1;
1131
1132
1133 mapping = page_mapping(page);
1134 } else if (unlikely(PageTransHuge(page))) {
1135
1136 if (split_huge_page_to_list(page, page_list))
1137 goto keep_locked;
1138 }
1139
1140 VM_BUG_ON_PAGE(PageTransHuge(page), page);
1141
1142
1143
1144
1145
1146 if (page_mapped(page)) {
1147 if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
1148 nr_unmap_fail++;
1149 goto activate_locked;
1150 }
1151 }
1152
1153 if (PageDirty(page)) {
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164 if (page_is_file_cache(page) &&
1165 (!current_is_kswapd() || !PageReclaim(page) ||
1166 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1167
1168
1169
1170
1171
1172
1173 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1174 SetPageReclaim(page);
1175
1176 goto activate_locked;
1177 }
1178
1179 if (references == PAGEREF_RECLAIM_CLEAN)
1180 goto keep_locked;
1181 if (!may_enter_fs)
1182 goto keep_locked;
1183 if (!sc->may_writepage)
1184 goto keep_locked;
1185
1186
1187
1188
1189
1190
1191 try_to_unmap_flush_dirty();
1192 switch (pageout(page, mapping, sc)) {
1193 case PAGE_KEEP:
1194 goto keep_locked;
1195 case PAGE_ACTIVATE:
1196 goto activate_locked;
1197 case PAGE_SUCCESS:
1198 if (PageWriteback(page))
1199 goto keep;
1200 if (PageDirty(page))
1201 goto keep;
1202
1203
1204
1205
1206
1207 if (!trylock_page(page))
1208 goto keep;
1209 if (PageDirty(page) || PageWriteback(page))
1210 goto keep_locked;
1211 mapping = page_mapping(page);
1212 case PAGE_CLEAN:
1213 ;
1214 }
1215 }
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238 if (page_has_private(page)) {
1239 if (!try_to_release_page(page, sc->gfp_mask))
1240 goto activate_locked;
1241 if (!mapping && page_count(page) == 1) {
1242 unlock_page(page);
1243 if (put_page_testzero(page))
1244 goto free_it;
1245 else {
1246
1247
1248
1249
1250
1251
1252
1253 nr_reclaimed++;
1254 continue;
1255 }
1256 }
1257 }
1258
1259 if (PageAnon(page) && !PageSwapBacked(page)) {
1260
1261 if (!page_ref_freeze(page, 1))
1262 goto keep_locked;
1263 if (PageDirty(page)) {
1264 page_ref_unfreeze(page, 1);
1265 goto keep_locked;
1266 }
1267
1268 count_vm_event(PGLAZYFREED);
1269 } else if (!mapping || !__remove_mapping(mapping, page, true))
1270 goto keep_locked;
1271
1272
1273
1274
1275
1276
1277
1278 __ClearPageLocked(page);
1279free_it:
1280 nr_reclaimed++;
1281
1282
1283
1284
1285
1286 list_add(&page->lru, &free_pages);
1287 continue;
1288
1289activate_locked:
1290
1291 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1292 PageMlocked(page)))
1293 try_to_free_swap(page);
1294 VM_BUG_ON_PAGE(PageActive(page), page);
1295 if (!PageMlocked(page)) {
1296 SetPageActive(page);
1297 pgactivate++;
1298 }
1299keep_locked:
1300 unlock_page(page);
1301keep:
1302 list_add(&page->lru, &ret_pages);
1303 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1304 }
1305
1306 mem_cgroup_uncharge_list(&free_pages);
1307 try_to_unmap_flush();
1308 free_hot_cold_page_list(&free_pages, true);
1309
1310 list_splice(&ret_pages, page_list);
1311 count_vm_events(PGACTIVATE, pgactivate);
1312
1313 if (stat) {
1314 stat->nr_dirty = nr_dirty;
1315 stat->nr_congested = nr_congested;
1316 stat->nr_unqueued_dirty = nr_unqueued_dirty;
1317 stat->nr_writeback = nr_writeback;
1318 stat->nr_immediate = nr_immediate;
1319 stat->nr_activate = pgactivate;
1320 stat->nr_ref_keep = nr_ref_keep;
1321 stat->nr_unmap_fail = nr_unmap_fail;
1322 }
1323 return nr_reclaimed;
1324}
1325
1326unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1327 struct list_head *page_list)
1328{
1329 struct scan_control sc = {
1330 .gfp_mask = GFP_KERNEL,
1331 .priority = DEF_PRIORITY,
1332 .may_unmap = 1,
1333 };
1334 unsigned long ret;
1335 struct page *page, *next;
1336 LIST_HEAD(clean_pages);
1337
1338 list_for_each_entry_safe(page, next, page_list, lru) {
1339 if (page_is_file_cache(page) && !PageDirty(page) &&
1340 !__PageMovable(page)) {
1341 ClearPageActive(page);
1342 list_move(&page->lru, &clean_pages);
1343 }
1344 }
1345
1346 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1347 TTU_IGNORE_ACCESS, NULL, true);
1348 list_splice(&clean_pages, page_list);
1349 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
1350 return ret;
1351}
1352
/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
{
1365 int ret = -EINVAL;
1366
1367
1368 if (!PageLRU(page))
1369 return ret;
1370
1371
1372 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1373 return ret;
1374
1375 ret = -EBUSY;
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385 if (mode & ISOLATE_ASYNC_MIGRATE) {
1386
1387 if (PageWriteback(page))
1388 return ret;
1389
1390 if (PageDirty(page)) {
1391 struct address_space *mapping;
1392
1393
1394
1395
1396
1397
1398 mapping = page_mapping(page);
1399 if (mapping && !mapping->a_ops->migratepage)
1400 return ret;
1401 }
1402 }
1403
1404 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1405 return ret;
1406
1407 if (likely(get_page_unless_zero(page))) {
1408
1409
1410
1411
1412
1413 ClearPageLRU(page);
1414 ret = 0;
1415 }
1416
1417 return ret;
1418}
1419
1420
1421
1422
1423
1424
1425static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1426 enum lru_list lru, unsigned long *nr_zone_taken)
1427{
1428 int zid;
1429
1430 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1431 if (!nr_zone_taken[zid])
1432 continue;
1433
1434 __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1435#ifdef CONFIG_MEMCG
1436 mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1437#endif
1438 }
1439
1440}
1441
/**
 * isolate_lru_pages - isolate some pages from an lru list
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @mode:	One of the LRU isolation modes
 * @lru:	LRU list id for isolating
 *
 * pgdat->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * Appropriate locks must be held before calling this function.
 *
 * Returns how many pages were moved onto *@dst.
 */
1462static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1463 struct lruvec *lruvec, struct list_head *dst,
1464 unsigned long *nr_scanned, struct scan_control *sc,
1465 isolate_mode_t mode, enum lru_list lru)
1466{
1467 struct list_head *src = &lruvec->lists[lru];
1468 unsigned long nr_taken = 0;
1469 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1470 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1471 unsigned long skipped = 0;
1472 unsigned long scan, total_scan, nr_pages;
1473 LIST_HEAD(pages_skipped);
1474
1475 scan = 0;
1476 for (total_scan = 0;
1477 scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
1478 total_scan++) {
1479 struct page *page;
1480
1481 page = lru_to_page(src);
1482 prefetchw_prev_lru_page(page, src, flags);
1483
1484 VM_BUG_ON_PAGE(!PageLRU(page), page);
1485
1486 if (page_zonenum(page) > sc->reclaim_idx) {
1487 list_move(&page->lru, &pages_skipped);
1488 nr_skipped[page_zonenum(page)]++;
1489 continue;
1490 }
1491
1492
1493
1494
1495
1496
1497
1498 scan++;
1499 switch (__isolate_lru_page(page, mode)) {
1500 case 0:
1501 nr_pages = hpage_nr_pages(page);
1502 nr_taken += nr_pages;
1503 nr_zone_taken[page_zonenum(page)] += nr_pages;
1504 list_move(&page->lru, dst);
1505 break;
1506
1507 case -EBUSY:
1508
1509 list_move(&page->lru, src);
1510 continue;
1511
1512 default:
1513 BUG();
1514 }
1515 }
1516
1517
1518
1519
1520
1521
1522
1523
1524 if (!list_empty(&pages_skipped)) {
1525 int zid;
1526
1527 list_splice(&pages_skipped, src);
1528 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1529 if (!nr_skipped[zid])
1530 continue;
1531
1532 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1533 skipped += nr_skipped[zid];
1534 }
1535 }
1536 *nr_scanned = total_scan;
1537 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1538 total_scan, skipped, nr_taken, mode, lru);
1539 update_lru_sizes(lruvec, lru, nr_zone_taken);
1540 return nr_taken;
1541}
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568int isolate_lru_page(struct page *page)
1569{
1570 int ret = -EBUSY;
1571
1572 VM_BUG_ON_PAGE(!page_count(page), page);
1573 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1574
1575 if (PageLRU(page)) {
1576 struct zone *zone = page_zone(page);
1577 struct lruvec *lruvec;
1578
1579 spin_lock_irq(zone_lru_lock(zone));
1580 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
1581 if (PageLRU(page)) {
1582 int lru = page_lru(page);
1583 get_page(page);
1584 ClearPageLRU(page);
1585 del_page_from_lru_list(page, lruvec, lru);
1586 ret = 0;
1587 }
1588 spin_unlock_irq(zone_lru_lock(zone));
1589 }
1590 return ret;
1591}
1592
/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled.  When there are massive numbers of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
1600static int too_many_isolated(struct pglist_data *pgdat, int file,
1601 struct scan_control *sc)
1602{
1603 unsigned long inactive, isolated;
1604
1605 if (current_is_kswapd())
1606 return 0;
1607
1608 if (!sane_reclaim(sc))
1609 return 0;
1610
1611 if (file) {
1612 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1613 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1614 } else {
1615 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1616 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1617 }
1618
1619
1620
1621
1622
1623
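	/*
	 * Callers that can enter both the block layer and the filesystem
	 * (__GFP_IO | __GFP_FS) are throttled once isolated pages exceed an
	 * eighth of the inactive list.  GFP_NOIO/GFP_NOFS reclaimers get the
	 * full inactive list's worth of slack so they are not blocked behind
	 * normal direct reclaim, which could deadlock against the flushers.
	 */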
1624 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1625 inactive >>= 3;
1626
1627 return isolated > inactive;
1628}
1629
1630static noinline_for_stack void
1631putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1632{
1633 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1634 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1635 LIST_HEAD(pages_to_free);
1636
1637
1638
1639
1640 while (!list_empty(page_list)) {
1641 struct page *page = lru_to_page(page_list);
1642 int lru;
1643
1644 VM_BUG_ON_PAGE(PageLRU(page), page);
1645 list_del(&page->lru);
1646 if (unlikely(!page_evictable(page))) {
1647 spin_unlock_irq(&pgdat->lru_lock);
1648 putback_lru_page(page);
1649 spin_lock_irq(&pgdat->lru_lock);
1650 continue;
1651 }
1652
1653 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1654
1655 SetPageLRU(page);
1656 lru = page_lru(page);
1657 add_page_to_lru_list(page, lruvec, lru);
1658
1659 if (is_active_lru(lru)) {
1660 int file = is_file_lru(lru);
1661 int numpages = hpage_nr_pages(page);
1662 reclaim_stat->recent_rotated[file] += numpages;
1663 }
1664 if (put_page_testzero(page)) {
1665 __ClearPageLRU(page);
1666 __ClearPageActive(page);
1667 del_page_from_lru_list(page, lruvec, lru);
1668
1669 if (unlikely(PageCompound(page))) {
1670 spin_unlock_irq(&pgdat->lru_lock);
1671 mem_cgroup_uncharge(page);
1672 (*get_compound_page_dtor(page))(page);
1673 spin_lock_irq(&pgdat->lru_lock);
1674 } else
1675 list_add(&page->lru, &pages_to_free);
1676 }
1677 }
1678
1679
1680
1681
1682 list_splice(&pages_to_free, page_list);
1683}
1684
1685
1686
1687
1688
1689
1690
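/*
 * If a kernel thread (such as nfsd for loop-back mounts) services a backing
 * device by writing to the page cache it sets PF_LESS_THROTTLE.  In that case
 * we should only throttle it if the backing device it is writing to is
 * congested; in other cases it is safe to throttle.
 */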
1691static int current_may_throttle(void)
1692{
1693 return !(current->flags & PF_LESS_THROTTLE) ||
1694 current->backing_dev_info == NULL ||
1695 bdi_write_congested(current->backing_dev_info);
1696}
1697
1698
1699
1700
1701
1702static noinline_for_stack unsigned long
1703shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1704 struct scan_control *sc, enum lru_list lru)
1705{
1706 LIST_HEAD(page_list);
1707 unsigned long nr_scanned;
1708 unsigned long nr_reclaimed = 0;
1709 unsigned long nr_taken;
1710 struct reclaim_stat stat = {};
1711 isolate_mode_t isolate_mode = 0;
1712 int file = is_file_lru(lru);
1713 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1714 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1715
1716 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1717 congestion_wait(BLK_RW_ASYNC, HZ/10);
1718
1719
1720 if (fatal_signal_pending(current))
1721 return SWAP_CLUSTER_MAX;
1722 }
1723
1724 lru_add_drain();
1725
1726 if (!sc->may_unmap)
1727 isolate_mode |= ISOLATE_UNMAPPED;
1728
1729 spin_lock_irq(&pgdat->lru_lock);
1730
1731 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1732 &nr_scanned, sc, isolate_mode, lru);
1733
1734 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1735 reclaim_stat->recent_scanned[file] += nr_taken;
1736
1737 if (global_reclaim(sc)) {
1738 if (current_is_kswapd())
1739 __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
1740 else
1741 __count_vm_events(PGSCAN_DIRECT, nr_scanned);
1742 }
1743 spin_unlock_irq(&pgdat->lru_lock);
1744
1745 if (nr_taken == 0)
1746 return 0;
1747
1748 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
1749 &stat, false);
1750
1751 spin_lock_irq(&pgdat->lru_lock);
1752
1753 if (global_reclaim(sc)) {
1754 if (current_is_kswapd())
1755 __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
1756 else
1757 __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
1758 }
1759
1760 putback_inactive_pages(lruvec, &page_list);
1761
1762 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1763
1764 spin_unlock_irq(&pgdat->lru_lock);
1765
1766 mem_cgroup_uncharge_list(&page_list);
1767 free_hot_cold_page_list(&page_list, true);
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783 if (stat.nr_writeback && stat.nr_writeback == nr_taken)
1784 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
1785
1786
1787
1788
1789
1790 if (sane_reclaim(sc)) {
1791
1792
1793
1794
1795 if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
1796 set_bit(PGDAT_CONGESTED, &pgdat->flags);
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810 if (stat.nr_unqueued_dirty == nr_taken) {
1811 wakeup_flusher_threads(0, WB_REASON_VMSCAN);
1812 set_bit(PGDAT_DIRTY, &pgdat->flags);
1813 }
1814
1815
1816
1817
1818
1819
1820
1821 if (stat.nr_immediate && current_may_throttle())
1822 congestion_wait(BLK_RW_ASYNC, HZ/10);
1823 }
1824
1825
1826
1827
1828
1829
1830 if (!sc->hibernation_mode && !current_is_kswapd() &&
1831 current_may_throttle())
1832 wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
1833
1834 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
1835 nr_scanned, nr_reclaimed,
1836 stat.nr_dirty, stat.nr_writeback,
1837 stat.nr_congested, stat.nr_immediate,
1838 stat.nr_activate, stat.nr_ref_keep,
1839 stat.nr_unmap_fail,
1840 sc->priority, file);
1841 return nr_reclaimed;
1842}
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
1865 struct list_head *list,
1866 struct list_head *pages_to_free,
1867 enum lru_list lru)
1868{
1869 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1870 struct page *page;
1871 int nr_pages;
1872 int nr_moved = 0;
1873
1874 while (!list_empty(list)) {
1875 page = lru_to_page(list);
1876 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1877
1878 VM_BUG_ON_PAGE(PageLRU(page), page);
1879 SetPageLRU(page);
1880
1881 nr_pages = hpage_nr_pages(page);
1882 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
1883 list_move(&page->lru, &lruvec->lists[lru]);
1884
1885 if (put_page_testzero(page)) {
1886 __ClearPageLRU(page);
1887 __ClearPageActive(page);
1888 del_page_from_lru_list(page, lruvec, lru);
1889
1890 if (unlikely(PageCompound(page))) {
1891 spin_unlock_irq(&pgdat->lru_lock);
1892 mem_cgroup_uncharge(page);
1893 (*get_compound_page_dtor(page))(page);
1894 spin_lock_irq(&pgdat->lru_lock);
1895 } else
1896 list_add(&page->lru, pages_to_free);
1897 } else {
1898 nr_moved += nr_pages;
1899 }
1900 }
1901
1902 if (!is_active_lru(lru))
1903 __count_vm_events(PGDEACTIVATE, nr_moved);
1904
1905 return nr_moved;
1906}
1907
1908static void shrink_active_list(unsigned long nr_to_scan,
1909 struct lruvec *lruvec,
1910 struct scan_control *sc,
1911 enum lru_list lru)
1912{
1913 unsigned long nr_taken;
1914 unsigned long nr_scanned;
1915 unsigned long vm_flags;
1916 LIST_HEAD(l_hold);
1917 LIST_HEAD(l_active);
1918 LIST_HEAD(l_inactive);
1919 struct page *page;
1920 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1921 unsigned nr_deactivate, nr_activate;
1922 unsigned nr_rotated = 0;
1923 isolate_mode_t isolate_mode = 0;
1924 int file = is_file_lru(lru);
1925 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1926
1927 lru_add_drain();
1928
1929 if (!sc->may_unmap)
1930 isolate_mode |= ISOLATE_UNMAPPED;
1931
1932 spin_lock_irq(&pgdat->lru_lock);
1933
1934 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1935 &nr_scanned, sc, isolate_mode, lru);
1936
1937 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1938 reclaim_stat->recent_scanned[file] += nr_taken;
1939
1940 __count_vm_events(PGREFILL, nr_scanned);
1941
1942 spin_unlock_irq(&pgdat->lru_lock);
1943
1944 while (!list_empty(&l_hold)) {
1945 cond_resched();
1946 page = lru_to_page(&l_hold);
1947 list_del(&page->lru);
1948
1949 if (unlikely(!page_evictable(page))) {
1950 putback_lru_page(page);
1951 continue;
1952 }
1953
1954 if (unlikely(buffer_heads_over_limit)) {
1955 if (page_has_private(page) && trylock_page(page)) {
1956 if (page_has_private(page))
1957 try_to_release_page(page, 0);
1958 unlock_page(page);
1959 }
1960 }
1961
1962 if (page_referenced(page, 0, sc->target_mem_cgroup,
1963 &vm_flags)) {
1964 nr_rotated += hpage_nr_pages(page);
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
1975 list_add(&page->lru, &l_active);
1976 continue;
1977 }
1978 }
1979
1980 ClearPageActive(page);
1981 list_add(&page->lru, &l_inactive);
1982 }
1983
1984
1985
1986
1987 spin_lock_irq(&pgdat->lru_lock);
1988
1989
1990
1991
1992
1993
1994 reclaim_stat->recent_rotated[file] += nr_rotated;
1995
1996 nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1997 nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1998 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1999 spin_unlock_irq(&pgdat->lru_lock);
2000
2001 mem_cgroup_uncharge_list(&l_hold);
2002 free_hot_cold_page_list(&l_hold, true);
2003 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2004 nr_deactivate, nr_rotated, sc->priority, file);
2005}
2006
/*
 * The inactive anon list should be small enough that the VM never has
 * to do too much work, and the inactive file list should be small
 * enough to leave most memory to the established workingset on the
 * scan-resistant active list, while still being large enough that each
 * inactive page has a chance to be referenced again before it is
 * reclaimed.
 *
 * If workingset refaults have been observed on the file list since the
 * last snapshot (lruvec->refaults), the inactive list is considered too
 * small and deactivation is forced.
 *
 * Otherwise the target ratio of ACTIVE:INACTIVE is int_sqrt(10 * gb),
 * where gb is the combined size of the two lists in gigabytes, e.g.:
 *
 * total     target    max
 * memory    ratio     inactive
 * -------------------------------------
 *   10MB       1         5MB
 *  100MB       1        50MB
 *    1GB       3       250MB
 *   10GB      10       0.9GB
 *  100GB      31         3GB
 *    1TB     101        10GB
 *   10TB     320        32GB
 */
2035static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2036 struct mem_cgroup *memcg,
2037 struct scan_control *sc, bool actual_reclaim)
2038{
2039 enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
2040 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2041 enum lru_list inactive_lru = file * LRU_FILE;
2042 unsigned long inactive, active;
2043 unsigned long inactive_ratio;
2044 unsigned long refaults;
2045 unsigned long gb;
2046
2047
2048
2049
2050
2051 if (!file && !total_swap_pages)
2052 return false;
2053
2054 inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
2055 active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
2056
2057 if (memcg)
2058 refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
2059 else
2060 refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
2061
2062
2063
2064
2065
2066
2067 if (file && actual_reclaim && lruvec->refaults != refaults) {
2068 inactive_ratio = 0;
2069 } else {
2070 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2071 if (gb)
2072 inactive_ratio = int_sqrt(10 * gb);
2073 else
2074 inactive_ratio = 1;
2075 }
2076
2077 if (actual_reclaim)
2078 trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
2079 lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
2080 lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
2081 inactive_ratio, file);
2082
2083 return inactive * inactive_ratio < active;
2084}
2085
2086static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2087 struct lruvec *lruvec, struct mem_cgroup *memcg,
2088 struct scan_control *sc)
2089{
2090 if (is_active_lru(lru)) {
2091 if (inactive_list_is_low(lruvec, is_file_lru(lru),
2092 memcg, sc, true))
2093 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2094 return 0;
2095 }
2096
2097 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2098}
2099
2100enum scan_balance {
2101 SCAN_EQUAL,
2102 SCAN_FRACT,
2103 SCAN_ANON,
2104 SCAN_FILE,
2105};
2106
/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned we did rotate back
 * onto the active list instead of evict.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
2116static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2117 struct scan_control *sc, unsigned long *nr,
2118 unsigned long *lru_pages)
2119{
2120 int swappiness = mem_cgroup_swappiness(memcg);
2121 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2122 u64 fraction[2];
2123 u64 denominator = 0;
2124 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2125 unsigned long anon_prio, file_prio;
2126 enum scan_balance scan_balance;
2127 unsigned long anon, file;
2128 unsigned long ap, fp;
2129 enum lru_list lru;
2130
2131
2132 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2133 scan_balance = SCAN_FILE;
2134 goto out;
2135 }
2136
2137
2138
2139
2140
2141
2142
2143
2144 if (!global_reclaim(sc) && !swappiness) {
2145 scan_balance = SCAN_FILE;
2146 goto out;
2147 }
2148
2149
2150
2151
2152
2153
2154 if (!sc->priority && swappiness) {
2155 scan_balance = SCAN_EQUAL;
2156 goto out;
2157 }
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168 if (global_reclaim(sc)) {
2169 unsigned long pgdatfile;
2170 unsigned long pgdatfree;
2171 int z;
2172 unsigned long total_high_wmark = 0;
2173
2174 pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2175 pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
2176 node_page_state(pgdat, NR_INACTIVE_FILE);
2177
2178 for (z = 0; z < MAX_NR_ZONES; z++) {
2179 struct zone *zone = &pgdat->node_zones[z];
2180 if (!managed_zone(zone))
2181 continue;
2182
2183 total_high_wmark += high_wmark_pages(zone);
2184 }
2185
2186 if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
2187 scan_balance = SCAN_ANON;
2188 goto out;
2189 }
2190 }
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201 if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
2202 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
2203 scan_balance = SCAN_FILE;
2204 goto out;
2205 }
2206
2207 scan_balance = SCAN_FRACT;
2208
2209
2210
2211
2212
2213 anon_prio = swappiness;
2214 file_prio = 200 - anon_prio;
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
2229 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
2230 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
2231 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
2232
2233 spin_lock_irq(&pgdat->lru_lock);
2234 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
2235 reclaim_stat->recent_scanned[0] /= 2;
2236 reclaim_stat->recent_rotated[0] /= 2;
2237 }
2238
2239 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
2240 reclaim_stat->recent_scanned[1] /= 2;
2241 reclaim_stat->recent_rotated[1] /= 2;
2242 }
2243
2244
2245
2246
2247
2248
2249 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
2250 ap /= reclaim_stat->recent_rotated[0] + 1;
2251
2252 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
2253 fp /= reclaim_stat->recent_rotated[1] + 1;
2254 spin_unlock_irq(&pgdat->lru_lock);
2255
2256 fraction[0] = ap;
2257 fraction[1] = fp;
2258 denominator = ap + fp + 1;
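	/*
	 * Worked example (illustrative numbers only): with the default
	 * swappiness of 60, anon_prio = 60 and file_prio = 140.  If recently
	 * scanned/rotated are 1000/100 for anon and 1000/500 for file, then
	 * ap = 60 * 1001 / 101 = 594 and fp = 140 * 1001 / 501 = 279, so
	 * roughly two thirds of the scan pressure goes to the anon lists
	 * because their pages have been rotated (reactivated) less often.
	 */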
2259out:
2260 *lru_pages = 0;
2261 for_each_evictable_lru(lru) {
2262 int file = is_file_lru(lru);
2263 unsigned long size;
2264 unsigned long scan;
2265
2266 size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2267 scan = size >> sc->priority;
2268
2269
2270
2271
2272 if (!scan && !mem_cgroup_online(memcg))
2273 scan = min(size, SWAP_CLUSTER_MAX);
2274
2275 switch (scan_balance) {
2276 case SCAN_EQUAL:
2277
2278 break;
2279 case SCAN_FRACT:
2280
2281
2282
2283
2284 scan = div64_u64(scan * fraction[file],
2285 denominator);
2286 break;
2287 case SCAN_FILE:
2288 case SCAN_ANON:
2289
2290 if ((scan_balance == SCAN_FILE) != file) {
2291 size = 0;
2292 scan = 0;
2293 }
2294 break;
2295 default:
2296
2297 BUG();
2298 }
2299
2300 *lru_pages += size;
2301 nr[lru] = scan;
2302 }
2303}
2304
2305
2306
2307
2308static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
2309 struct scan_control *sc, unsigned long *lru_pages)
2310{
2311 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2312 unsigned long nr[NR_LRU_LISTS];
2313 unsigned long targets[NR_LRU_LISTS];
2314 unsigned long nr_to_scan;
2315 enum lru_list lru;
2316 unsigned long nr_reclaimed = 0;
2317 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2318 struct blk_plug plug;
2319 bool scan_adjusted;
2320
2321 get_scan_count(lruvec, memcg, sc, nr, lru_pages);
2322
2323
2324 memcpy(targets, nr, sizeof(nr));
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2338 sc->priority == DEF_PRIORITY);
2339
2340 blk_start_plug(&plug);
2341 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2342 nr[LRU_INACTIVE_FILE]) {
2343 unsigned long nr_anon, nr_file, percentage;
2344 unsigned long nr_scanned;
2345
2346 for_each_evictable_lru(lru) {
2347 if (nr[lru]) {
2348 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2349 nr[lru] -= nr_to_scan;
2350
2351 nr_reclaimed += shrink_list(lru, nr_to_scan,
2352 lruvec, memcg, sc);
2353 }
2354 }
2355
2356 cond_resched();
2357
2358 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2359 continue;
2360
2361
2362
2363
2364
2365
2366
2367
2368 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2369 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2370
2371
2372
2373
2374
2375
2376
2377 if (!nr_file || !nr_anon)
2378 break;
2379
2380 if (nr_file > nr_anon) {
2381 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2382 targets[LRU_ACTIVE_ANON] + 1;
2383 lru = LRU_BASE;
2384 percentage = nr_anon * 100 / scan_target;
2385 } else {
2386 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2387 targets[LRU_ACTIVE_FILE] + 1;
2388 lru = LRU_FILE;
2389 percentage = nr_file * 100 / scan_target;
2390 }
2391
2392
2393 nr[lru] = 0;
2394 nr[lru + LRU_ACTIVE] = 0;
2395
2396
2397
2398
2399
2400 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2401 nr_scanned = targets[lru] - nr[lru];
2402 nr[lru] = targets[lru] * (100 - percentage) / 100;
2403 nr[lru] -= min(nr[lru], nr_scanned);
2404
2405 lru += LRU_ACTIVE;
2406 nr_scanned = targets[lru] - nr[lru];
2407 nr[lru] = targets[lru] * (100 - percentage) / 100;
2408 nr[lru] -= min(nr[lru], nr_scanned);
2409
2410 scan_adjusted = true;
2411 }
2412 blk_finish_plug(&plug);
2413 sc->nr_reclaimed += nr_reclaimed;
2414
2415
2416
2417
2418
2419 if (inactive_list_is_low(lruvec, false, memcg, sc, true))
2420 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2421 sc, LRU_ACTIVE_ANON);
2422}
2423
2424
2425static bool in_reclaim_compaction(struct scan_control *sc)
2426{
2427 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2428 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2429 sc->priority < DEF_PRIORITY - 2))
2430 return true;
2431
2432 return false;
2433}
2434
2435
2436
2437
2438
2439
2440
2441
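/*
 * Reclaim/compaction is used for high-order allocation requests.  It reclaims
 * order-0 pages before compacting the zone.  should_continue_reclaim() returns
 * true if more reclaim should be attempted, false if enough has been freed (or
 * no progress can be made) and compaction should take over.
 */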
2442static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2443 unsigned long nr_reclaimed,
2444 unsigned long nr_scanned,
2445 struct scan_control *sc)
2446{
2447 unsigned long pages_for_compaction;
2448 unsigned long inactive_lru_pages;
2449 int z;
2450
2451
2452 if (!in_reclaim_compaction(sc))
2453 return false;
2454
2455
2456 if (sc->gfp_mask & __GFP_REPEAT) {
2457
2458
2459
2460
2461
2462
2463 if (!nr_reclaimed && !nr_scanned)
2464 return false;
2465 } else {
2466
2467
2468
2469
2470
2471
2472
2473
2474 if (!nr_reclaimed)
2475 return false;
2476 }
2477
2478
2479
2480
2481
2482 pages_for_compaction = compact_gap(sc->order);
2483 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2484 if (get_nr_swap_pages() > 0)
2485 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2486 if (sc->nr_reclaimed < pages_for_compaction &&
2487 inactive_lru_pages > pages_for_compaction)
2488 return true;
2489
2490
2491 for (z = 0; z <= sc->reclaim_idx; z++) {
2492 struct zone *zone = &pgdat->node_zones[z];
2493 if (!managed_zone(zone))
2494 continue;
2495
2496 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2497 case COMPACT_SUCCESS:
2498 case COMPACT_CONTINUE:
2499 return false;
2500 default:
2501
2502 ;
2503 }
2504 }
2505 return true;
2506}
2507
2508static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2509{
2510 struct reclaim_state *reclaim_state = current->reclaim_state;
2511 unsigned long nr_reclaimed, nr_scanned;
2512 bool reclaimable = false;
2513
2514 do {
2515 struct mem_cgroup *root = sc->target_mem_cgroup;
2516 struct mem_cgroup_reclaim_cookie reclaim = {
2517 .pgdat = pgdat,
2518 .priority = sc->priority,
2519 };
2520 unsigned long node_lru_pages = 0;
2521 struct mem_cgroup *memcg;
2522
2523 nr_reclaimed = sc->nr_reclaimed;
2524 nr_scanned = sc->nr_scanned;
2525
2526 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2527 do {
2528 unsigned long lru_pages;
2529 unsigned long reclaimed;
2530 unsigned long scanned;
2531
2532 if (mem_cgroup_low(root, memcg)) {
2533 if (!sc->memcg_low_reclaim) {
2534 sc->memcg_low_skipped = 1;
2535 continue;
2536 }
2537 mem_cgroup_event(memcg, MEMCG_LOW);
2538 }
2539
2540 reclaimed = sc->nr_reclaimed;
2541 scanned = sc->nr_scanned;
2542
2543 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2544 node_lru_pages += lru_pages;
2545
2546 if (memcg)
2547 shrink_slab(sc->gfp_mask, pgdat->node_id,
2548 memcg, sc->nr_scanned - scanned,
2549 lru_pages);
2550
2551
2552 vmpressure(sc->gfp_mask, memcg, false,
2553 sc->nr_scanned - scanned,
2554 sc->nr_reclaimed - reclaimed);
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566 if (!global_reclaim(sc) &&
2567 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2568 mem_cgroup_iter_break(root, memcg);
2569 break;
2570 }
2571 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
2572
2573
2574
2575
2576
2577 if (global_reclaim(sc))
2578 shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
2579 sc->nr_scanned - nr_scanned,
2580 node_lru_pages);
2581
2582 if (reclaim_state) {
2583 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2584 reclaim_state->reclaimed_slab = 0;
2585 }
2586
2587
2588 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2589 sc->nr_scanned - nr_scanned,
2590 sc->nr_reclaimed - nr_reclaimed);
2591
2592 if (sc->nr_reclaimed - nr_reclaimed)
2593 reclaimable = true;
2594
2595 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2596 sc->nr_scanned - nr_scanned, sc));
2597
2598
2599
2600
2601
2602
2603
2604 if (reclaimable)
2605 pgdat->kswapd_failures = 0;
2606
2607 return reclaimable;
2608}
2609
/*
 * Returns true if compaction should go ahead for a costly-order request, or
 * the allocation would already succeed without compaction.  Return false if
 * reclaim should run first.
 */
2615static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2616{
2617 unsigned long watermark;
2618 enum compact_result suitable;
2619
2620 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2621 if (suitable == COMPACT_SUCCESS)
2622
2623 return true;
2624 if (suitable == COMPACT_SKIPPED)
2625
2626 return false;
2627
	/*
	 * Compaction is about to be started: make sure it has enough free
	 * memory to work with.  The gap is roughly twice the requested
	 * allocation size (see compact_gap()) on top of the high watermark,
	 * so that order-0 allocations and compaction's own migration targets
	 * are not starved while it runs.
	 */
	watermark = high_wmark_pages(zone) + compact_gap(sc->order);

	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2640}
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2651{
2652 struct zoneref *z;
2653 struct zone *zone;
2654 unsigned long nr_soft_reclaimed;
2655 unsigned long nr_soft_scanned;
2656 gfp_t orig_mask;
2657 pg_data_t *last_pgdat = NULL;
2658
2659
2660
2661
2662
2663
2664 orig_mask = sc->gfp_mask;
2665 if (buffer_heads_over_limit) {
2666 sc->gfp_mask |= __GFP_HIGHMEM;
2667 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2668 }
2669
2670 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2671 sc->reclaim_idx, sc->nodemask) {
2672
2673
2674
2675
2676 if (global_reclaim(sc)) {
2677 if (!cpuset_zone_allowed(zone,
2678 GFP_KERNEL | __GFP_HARDWALL))
2679 continue;
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690 if (IS_ENABLED(CONFIG_COMPACTION) &&
2691 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2692 compaction_ready(zone, sc)) {
2693 sc->compaction_ready = true;
2694 continue;
2695 }
2696
2697
2698
2699
2700
2701
2702
2703 if (zone->zone_pgdat == last_pgdat)
2704 continue;
2705
2706
2707
2708
2709
2710
2711
2712 nr_soft_scanned = 0;
2713 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2714 sc->order, sc->gfp_mask,
2715 &nr_soft_scanned);
2716 sc->nr_reclaimed += nr_soft_reclaimed;
2717 sc->nr_scanned += nr_soft_scanned;
2718
2719 }
2720
2721
2722 if (zone->zone_pgdat == last_pgdat)
2723 continue;
2724 last_pgdat = zone->zone_pgdat;
2725 shrink_node(zone->zone_pgdat, sc);
2726 }
2727
2728
2729
2730
2731
2732 sc->gfp_mask = orig_mask;
2733}
2734
2735static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
2736{
2737 struct mem_cgroup *memcg;
2738
2739 memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
2740 do {
2741 unsigned long refaults;
2742 struct lruvec *lruvec;
2743
2744 if (memcg)
2745 refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
2746 else
2747 refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
2748
2749 lruvec = mem_cgroup_lruvec(pgdat, memcg);
2750 lruvec->refaults = refaults;
2751 } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
2752}
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2771 struct scan_control *sc)
2772{
2773 int initial_priority = sc->priority;
2774 pg_data_t *last_pgdat;
2775 struct zoneref *z;
2776 struct zone *zone;
2777retry:
2778 delayacct_freepages_start();
2779
2780 if (global_reclaim(sc))
2781 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
2782
2783 do {
2784 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2785 sc->priority);
2786 sc->nr_scanned = 0;
2787 shrink_zones(zonelist, sc);
2788
2789 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2790 break;
2791
2792 if (sc->compaction_ready)
2793 break;
2794
2795
2796
2797
2798
2799 if (sc->priority < DEF_PRIORITY - 2)
2800 sc->may_writepage = 1;
2801 } while (--sc->priority >= 0);
2802
2803 last_pgdat = NULL;
2804 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
2805 sc->nodemask) {
2806 if (zone->zone_pgdat == last_pgdat)
2807 continue;
2808 last_pgdat = zone->zone_pgdat;
2809 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
2810 }
2811
2812 delayacct_freepages_end();
2813
2814 if (sc->nr_reclaimed)
2815 return sc->nr_reclaimed;
2816
2817
2818 if (sc->compaction_ready)
2819 return 1;
2820
2821
2822 if (sc->memcg_low_skipped) {
2823 sc->priority = initial_priority;
2824 sc->memcg_low_reclaim = 1;
2825 sc->memcg_low_skipped = 0;
2826 goto retry;
2827 }
2828
2829 return 0;
2830}
2831
2832static bool allow_direct_reclaim(pg_data_t *pgdat)
2833{
2834 struct zone *zone;
2835 unsigned long pfmemalloc_reserve = 0;
2836 unsigned long free_pages = 0;
2837 int i;
2838 bool wmark_ok;
2839
2840 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
2841 return true;
2842
2843 for (i = 0; i <= ZONE_NORMAL; i++) {
2844 zone = &pgdat->node_zones[i];
2845 if (!managed_zone(zone))
2846 continue;
2847
2848 if (!zone_reclaimable_pages(zone))
2849 continue;
2850
2851 pfmemalloc_reserve += min_wmark_pages(zone);
2852 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2853 }
2854
2855
2856 if (!pfmemalloc_reserve)
2857 return true;
2858
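	/*
	 * Only let direct reclaim proceed (wmark_ok) while at least half of
	 * the summed min watermarks of the lower zones (up to ZONE_NORMAL)
	 * is still free; otherwise callers are throttled on pfmemalloc_wait
	 * and kswapd is kicked to make progress first.
	 */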
2859 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2860
2861
2862 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2863 pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
2864 (enum zone_type)ZONE_NORMAL);
2865 wake_up_interruptible(&pgdat->kswapd_wait);
2866 }
2867
2868 return wmark_ok;
2869}
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2881 nodemask_t *nodemask)
2882{
2883 struct zoneref *z;
2884 struct zone *zone;
2885 pg_data_t *pgdat = NULL;
2886
2887
2888
2889
2890
2891
2892
2893
2894 if (current->flags & PF_KTHREAD)
2895 goto out;
2896
2897
2898
2899
2900
2901 if (fatal_signal_pending(current))
2902 goto out;
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2919 gfp_zone(gfp_mask), nodemask) {
2920 if (zone_idx(zone) > ZONE_NORMAL)
2921 continue;
2922
2923
2924 pgdat = zone->zone_pgdat;
2925 if (allow_direct_reclaim(pgdat))
2926 goto out;
2927 break;
2928 }
2929
2930
2931 if (!pgdat)
2932 goto out;
2933
2934
2935 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945 if (!(gfp_mask & __GFP_FS)) {
2946 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2947 allow_direct_reclaim(pgdat), HZ);
2948
2949 goto check_pending;
2950 }
2951
2952
2953 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2954 allow_direct_reclaim(pgdat));
2955
2956check_pending:
2957 if (fatal_signal_pending(current))
2958 return true;
2959
2960out:
2961 return false;
2962}
2963
2964unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2965 gfp_t gfp_mask, nodemask_t *nodemask)
2966{
2967 unsigned long nr_reclaimed;
2968 struct scan_control sc = {
2969 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2970 .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
2971 .reclaim_idx = gfp_zone(gfp_mask),
2972 .order = order,
2973 .nodemask = nodemask,
2974 .priority = DEF_PRIORITY,
2975 .may_writepage = !laptop_mode,
2976 .may_unmap = 1,
2977 .may_swap = 1,
2978 };
2979
 /*
  * Do not enter reclaim if a fatal signal was delivered while throttled.
  * 1 is returned so that the page allocator does not OOM kill at this
  * point.
  */
2985 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2986 return 1;
2987
2988 trace_mm_vmscan_direct_reclaim_begin(order,
2989 sc.may_writepage,
2990 gfp_mask,
2991 sc.reclaim_idx);
2992
2993 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2994
2995 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2996
2997 return nr_reclaimed;
2998}
2999
3000#ifdef CONFIG_MEMCG
3001
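/*
 * Reclaim pages charged to @memcg from @pgdat on behalf of soft limit
 * reclaim. Only the memcg's own LRUs on this node are scanned; the number
 * of pages scanned is reported back through @nr_scanned.
 */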
3002unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3003 gfp_t gfp_mask, bool noswap,
3004 pg_data_t *pgdat,
3005 unsigned long *nr_scanned)
3006{
3007 struct scan_control sc = {
3008 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3009 .target_mem_cgroup = memcg,
3010 .may_writepage = !laptop_mode,
3011 .may_unmap = 1,
3012 .reclaim_idx = MAX_NR_ZONES - 1,
3013 .may_swap = !noswap,
3014 };
3015 unsigned long lru_pages;
3016
3017 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3018 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3019
3020 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3021 sc.may_writepage,
3022 sc.gfp_mask,
3023 sc.reclaim_idx);
3024
 /*
  * NOTE: Although we can get the priority field, using it
  * here is not a good idea, since it limits the pages we can scan.
  * If we don't reclaim here, the shrink_node from balance_pgdat
  * will pick up pages from other mem cgroups as well. We hack
  * the priority and make it zero.
  */
3032 shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
3033
3034 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3035
3036 *nr_scanned = sc.nr_scanned;
3037 return sc.nr_reclaimed;
3038}
3039
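/*
 * Target @nr_pages of memory charged to @memcg for reclaim. The zonelist of
 * a node hosting the cgroup's pages is chosen as the starting point, since
 * memcg reclaim is not bound to the allocating node.
 */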
3040unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3041 unsigned long nr_pages,
3042 gfp_t gfp_mask,
3043 bool may_swap)
3044{
3045 struct zonelist *zonelist;
3046 unsigned long nr_reclaimed;
3047 int nid;
3048 unsigned int noreclaim_flag;
3049 struct scan_control sc = {
3050 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3051 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3052 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3053 .reclaim_idx = MAX_NR_ZONES - 1,
3054 .target_mem_cgroup = memcg,
3055 .priority = DEF_PRIORITY,
3056 .may_writepage = !laptop_mode,
3057 .may_unmap = 1,
3058 .may_swap = may_swap,
3059 };
3060
 /*
  * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
  * take care of from where we get pages. So the node where we start the
  * scan does not need to be the current node.
  */
3066 nid = mem_cgroup_select_victim_node(memcg);
3067
3068 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
3069
3070 trace_mm_vmscan_memcg_reclaim_begin(0,
3071 sc.may_writepage,
3072 sc.gfp_mask,
3073 sc.reclaim_idx);
3074
3075 noreclaim_flag = memalloc_noreclaim_save();
3076 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3077 memalloc_noreclaim_restore(noreclaim_flag);
3078
3079 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3080
3081 return nr_reclaimed;
3082}
3083#endif
3084
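/*
 * Age the active anon lists of every memcg on @pgdat so that referenced
 * pages get a chance to be rotated before anonymous reclaim is needed.
 * This is a no-op when no swap is configured.
 */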
3085static void age_active_anon(struct pglist_data *pgdat,
3086 struct scan_control *sc)
3087{
3088 struct mem_cgroup *memcg;
3089
3090 if (!total_swap_pages)
3091 return;
3092
3093 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3094 do {
3095 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
3096
3097 if (inactive_list_is_low(lruvec, false, memcg, sc, true))
3098 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3099 sc, LRU_ACTIVE_ANON);
3100
3101 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3102 } while (memcg);
3103}
3104
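/*
 * Returns true if there is an eligible zone balanced for the request order
 * and classzone_idx
 */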
3109static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3110{
3111 int i;
3112 unsigned long mark = -1;
3113 struct zone *zone;
3114
3115 for (i = 0; i <= classzone_idx; i++) {
3116 zone = pgdat->node_zones + i;
3117
3118 if (!managed_zone(zone))
3119 continue;
3120
3121 mark = high_wmark_pages(zone);
3122 if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
3123 return true;
3124 }
3125
 /*
  * If a node has no populated zone within classzone_idx, it does not
  * need balancing by definition. This can happen if a zone-restricted
  * allocation tries to wake a remote kswapd.
  */
3131 if (mark == -1)
3132 return true;
3133
3134 return false;
3135}
3136
/* Clear pgdat state for congested, dirty or under writeback. */
3138static void clear_pgdat_congested(pg_data_t *pgdat)
3139{
3140 clear_bit(PGDAT_CONGESTED, &pgdat->flags);
3141 clear_bit(PGDAT_DIRTY, &pgdat->flags);
3142 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3143}
3144
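/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */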
3151static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3152{
 /*
  * The throttled processes are normally woken up in balance_pgdat() as
  * soon as allow_direct_reclaim() is true. But there is a potential
  * race between when kswapd checks the watermarks and a process gets
  * throttled. There is also a potential race if processes get
  * throttled, kswapd wakes, a large process exits thereby balancing the
  * zones, which causes kswapd to exit balance_pgdat() before reaching
  * the wake up checks. If kswapd is going to sleep, no process should
  * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
  * the wake up is premature, processes will wake kswapd and get
  * throttled again. The difference from wake ups in balance_pgdat() is
  * that here we are under prepare_to_wait().
  */
3166 if (waitqueue_active(&pgdat->pfmemalloc_wait))
3167 wake_up_all(&pgdat->pfmemalloc_wait);
3168
 /* Hopeless node, leave it to direct reclaim */
3170 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3171 return true;
3172
3173 if (pgdat_balanced(pgdat, order, classzone_idx)) {
3174 clear_pgdat_congested(pgdat);
3175 return true;
3176 }
3177
3178 return false;
3179}
3180
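/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim. This is used to decide whether the scanning priority needs to be
 * raised.
 */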
3189static bool kswapd_shrink_node(pg_data_t *pgdat,
3190 struct scan_control *sc)
3191{
3192 struct zone *zone;
3193 int z;
3194
 /* Reclaim a number of pages proportional to the number of zones */
3196 sc->nr_to_reclaim = 0;
3197 for (z = 0; z <= sc->reclaim_idx; z++) {
3198 zone = pgdat->node_zones + z;
3199 if (!managed_zone(zone))
3200 continue;
3201
3202 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3203 }
3204
 /*
  * Historically care was taken to put equal pressure on all zones but
  * now pressure is applied based on node LRU order.
  */
3209 shrink_node(pgdat, sc);
3210
 /*
  * Fragmentation may mean that the system cannot be rebalanced for
  * high-order allocations. If twice the allocation size has been
  * reclaimed then recheck watermarks only at order-0 to prevent
  * excessive reclaim. Assume that a process that requested a
  * high-order allocation can also direct reclaim/compact.
  */
3218 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3219 sc->order = 0;
3220
3221 return sc->nr_scanned >= sc->nr_to_reclaim;
3222}
3223
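/*
 * For kswapd, balance_pgdat() reclaims pages across a node from zones that
 * are eligible for use by the caller until at least one zone is balanced,
 * i.e. its high watermark is met for the requested order and classzone.
 *
 * Returns the order kswapd finished reclaiming at.
 */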
3237static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3238{
3239 int i;
3240 unsigned long nr_soft_reclaimed;
3241 unsigned long nr_soft_scanned;
3242 struct zone *zone;
3243 struct scan_control sc = {
3244 .gfp_mask = GFP_KERNEL,
3245 .order = order,
3246 .priority = DEF_PRIORITY,
3247 .may_writepage = !laptop_mode,
3248 .may_unmap = 1,
3249 .may_swap = 1,
3250 };
3251 count_vm_event(PAGEOUTRUN);
3252
3253 do {
3254 unsigned long nr_reclaimed = sc.nr_reclaimed;
3255 bool raise_priority = true;
3256
3257 sc.reclaim_idx = classzone_idx;
3258
 /*
  * If the number of buffer_heads exceeds the maximum allowed then
  * consider reclaiming from all zones. This has a dual purpose -- on
  * 64-bit systems it is expected that buffer_heads are stripped during
  * active rotation. On 32-bit systems, highmem pages can pin lowmem
  * memory and shrinking buffers can relieve lowmem pressure.
  * Reclaiming may be necessary to relieve lowmem pressure in small
  * zones.
  */
3269 if (buffer_heads_over_limit) {
3270 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3271 zone = pgdat->node_zones + i;
3272 if (!managed_zone(zone))
3273 continue;
3274
3275 sc.reclaim_idx = i;
3276 break;
3277 }
3278 }
3279
 /*
  * Only reclaim if there are no eligible zones. Note that
  * sc.reclaim_idx is not used as buffer_heads_over_limit may
  * have adjusted it.
  */
3285 if (pgdat_balanced(pgdat, sc.order, classzone_idx))
3286 goto out;
3287
 /*
  * Do some background aging of the anon list, to give
  * pages a chance to be referenced before reclaiming. All
  * pages are rotated regardless of classzone as no reclaim
  * is performed.
  */
3294 age_active_anon(pgdat, &sc);
3295
 /*
  * If we're getting trouble reclaiming, start doing writepage
  * even in laptop mode.
  */
3300 if (sc.priority < DEF_PRIORITY - 2)
3301 sc.may_writepage = 1;
3302
 /* Call soft limit reclaim before calling shrink_node. */
3304 sc.nr_scanned = 0;
3305 nr_soft_scanned = 0;
3306 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3307 sc.gfp_mask, &nr_soft_scanned);
3308 sc.nr_reclaimed += nr_soft_reclaimed;
3309
 /*
  * There should be no need to raise the scanning priority if
  * enough pages are already being scanned that the high
  * watermark would be met at 100% efficiency.
  */
3315 if (kswapd_shrink_node(pgdat, &sc))
3316 raise_priority = false;
3317
 /*
  * If the low watermark is met there is no need for processes
  * to be throttled on pfmemalloc_wait as they should now be
  * able to safely make forward progress. Wake them.
  */
3323 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3324 allow_direct_reclaim(pgdat))
3325 wake_up_all(&pgdat->pfmemalloc_wait);
3326
 /* Check if kswapd should be suspending */
3328 if (try_to_freeze() || kthread_should_stop())
3329 break;
3330
 /*
  * Raise priority if scanning rate is too low or there was no
  * progress in reclaiming pages.
  */
3335 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3336 if (raise_priority || !nr_reclaimed)
3337 sc.priority--;
3338 } while (sc.priority >= 1);
3339
3340 if (!sc.nr_reclaimed)
3341 pgdat->kswapd_failures++;
3342
3343out:
3344 snapshot_refaults(NULL, pgdat);
3345
 /*
  * Return the order kswapd stopped reclaiming at as
  * prepare_kswapd_sleep() takes it into account. If another caller
  * entered the allocator slow path while kswapd was awake, order
  * will remain at the higher level.
  */
3351 return sc.order;
3352}
3353
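/*
 * pgdat->kswapd_classzone_idx is the highest zone index that a recent
 * wakeup asked kswapd to reclaim for. MAX_NR_ZONES means "no pending
 * request"; in that case fall back to the classzone_idx supplied by the
 * caller, otherwise use the larger of the two.
 */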
3361static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
3362 enum zone_type classzone_idx)
3363{
3364 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3365 return classzone_idx;
3366
3367 return max(pgdat->kswapd_classzone_idx, classzone_idx);
3368}
3369
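/*
 * kswapd first naps for a short interval (waking kcompactd for the original
 * allocation order) and only goes fully to sleep if the node is still
 * balanced afterwards and no new wakeup arrived in the meantime.
 */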
3370static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3371 unsigned int classzone_idx)
3372{
3373 long remaining = 0;
3374 DEFINE_WAIT(wait);
3375
3376 if (freezing(current) || kthread_should_stop())
3377 return;
3378
3379 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3380
 /*
  * Try to sleep for a short interval. Note that kcompactd will only be
  * woken if it is possible to sleep for a short interval. This is
  * deliberate on the assumption that if reclaim cannot keep an
  * eligible zone balanced that it's also unlikely that compaction will
  * succeed.
  */
3388 if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3389
 /*
  * Compaction records what page blocks it recently failed to
  * isolate pages from and skips them in the future scanning.
  * When kswapd is going to sleep, it is reasonable to assume
  * that pages and compaction may succeed so reset the cache.
  */
3395 reset_isolation_suitable(pgdat);
3396
 /*
  * We have freed the memory, now we should compact it to make
  * allocation of the requested order possible.
  */
3401 wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
3402
3403 remaining = schedule_timeout(HZ/10);
3404
 /*
  * If woken prematurely then reset kswapd_classzone_idx and
  * order. The values will either be from a wakeup request or
  * the previous request that slept prematurely.
  */
3410 if (remaining) {
3411 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3412 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
3413 }
3414
3415 finish_wait(&pgdat->kswapd_wait, &wait);
3416 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3417 }
3418
 /*
  * After a short sleep, check if it was a premature sleep. If not, then
  * go fully to sleep until explicitly woken up.
  */
3423 if (!remaining &&
3424 prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3425 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3426
 /*
  * vmstat counters are not perfectly accurate and the estimated
  * value for counters such as NR_FREE_PAGES can deviate from the
  * true value by nr_online_cpus * threshold. To avoid the zone
  * watermarks being breached while under pressure, we reduce the
  * per-cpu vmstat threshold while kswapd is awake and restore
  * them before going back to sleep.
  */
3435 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3436
3437 if (!kthread_should_stop())
3438 schedule();
3439
3440 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3441 } else {
3442 if (remaining)
3443 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3444 else
3445 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3446 }
3447 finish_wait(&pgdat->kswapd_wait, &wait);
3448}
3449
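/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise would have to write out dirty pages.
 */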
3463static int kswapd(void *p)
3464{
3465 unsigned int alloc_order, reclaim_order;
3466 unsigned int classzone_idx = MAX_NR_ZONES - 1;
3467 pg_data_t *pgdat = (pg_data_t*)p;
3468 struct task_struct *tsk = current;
3469
3470 struct reclaim_state reclaim_state = {
3471 .reclaimed_slab = 0,
3472 };
3473 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3474
3475 lockdep_set_current_reclaim_state(GFP_KERNEL);
3476
3477 if (!cpumask_empty(cpumask))
3478 set_cpus_allowed_ptr(tsk, cpumask);
3479 current->reclaim_state = &reclaim_state;
3480
 /*
  * Tell the memory management that we're a "memory allocator",
  * and that if we need more memory we should get access to it
  * regardless (see "__alloc_pages()"). "kswapd" should
  * never get caught in the normal page freeing logic.
  *
  * (Kswapd normally doesn't need memory anyway, but sometimes
  * you need a small amount of memory in order to be able to
  * page out something else, and this flag essentially protects
  * us from recursively trying to free more memory as we're
  * trying to free the first piece of memory in the first place).
  */
3493 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3494 set_freezable();
3495
3496 pgdat->kswapd_order = 0;
3497 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3498 for ( ; ; ) {
3499 bool ret;
3500
3501 alloc_order = reclaim_order = pgdat->kswapd_order;
3502 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3503
3504kswapd_try_sleep:
3505 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3506 classzone_idx);
3507
 /* Read the new order and classzone_idx */
3509 alloc_order = reclaim_order = pgdat->kswapd_order;
3510 classzone_idx = kswapd_classzone_idx(pgdat, 0);
3511 pgdat->kswapd_order = 0;
3512 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3513
3514 ret = try_to_freeze();
3515 if (kthread_should_stop())
3516 break;
3517
 /*
  * We can speed up thawing tasks if we don't call balance_pgdat
  * after returning from the refrigerator.
  */
3522 if (ret)
3523 continue;
3524
 /*
  * Reclaim begins at the requested order but if a high-order
  * reclaim fails then kswapd falls back to reclaiming for
  * order-0. If that happens, kswapd will consider sleeping
  * for the order it finished reclaiming at (reclaim_order)
  * but kcompactd is woken to compact for the original
  * request (alloc_order).
  */
3533 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3534 alloc_order);
3535 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3536 if (reclaim_order < alloc_order)
3537 goto kswapd_try_sleep;
3538 }
3539
3540 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3541 current->reclaim_state = NULL;
3542 lockdep_clear_current_reclaim_state();
3543
3544 return 0;
3545}
3546
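/*
 * A zone is low on free memory, so wake its kswapd task to service it.
 */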
3550void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3551{
3552 pg_data_t *pgdat;
3553
3554 if (!managed_zone(zone))
3555 return;
3556
3557 if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
3558 return;
3559 pgdat = zone->zone_pgdat;
3560 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
3561 classzone_idx);
3562 pgdat->kswapd_order = max(pgdat->kswapd_order, order);
3563 if (!waitqueue_active(&pgdat->kswapd_wait))
3564 return;
3565
 /* Hopeless node, leave it to direct reclaim */
3567 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3568 return;
3569
3570 if (pgdat_balanced(pgdat, order, classzone_idx))
3571 return;
3572
3573 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
3574 wake_up_interruptible(&pgdat->kswapd_wait);
3575}
3576
3577#ifdef CONFIG_HIBERNATION
3578
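/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number
 * of reclaimed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */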
3586unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3587{
3588 struct reclaim_state reclaim_state;
3589 struct scan_control sc = {
3590 .nr_to_reclaim = nr_to_reclaim,
3591 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3592 .reclaim_idx = MAX_NR_ZONES - 1,
3593 .priority = DEF_PRIORITY,
3594 .may_writepage = 1,
3595 .may_unmap = 1,
3596 .may_swap = 1,
3597 .hibernation_mode = 1,
3598 };
3599 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3600 struct task_struct *p = current;
3601 unsigned long nr_reclaimed;
3602 unsigned int noreclaim_flag;
3603
3604 noreclaim_flag = memalloc_noreclaim_save();
3605 lockdep_set_current_reclaim_state(sc.gfp_mask);
3606 reclaim_state.reclaimed_slab = 0;
3607 p->reclaim_state = &reclaim_state;
3608
3609 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3610
3611 p->reclaim_state = NULL;
3612 lockdep_clear_current_reclaim_state();
3613 memalloc_noreclaim_restore(noreclaim_flag);
3614
3615 return nr_reclaimed;
3616}
3617#endif
3618
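/*
 * It's optimal to keep kswapds on the same CPUs as their memory, but not
 * required for correctness. So if the last cpu in a node goes away, we get
 * changed to run anywhere: as the first one comes back, restore their cpu
 * bindings.
 */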
3623static int kswapd_cpu_online(unsigned int cpu)
3624{
3625 int nid;
3626
3627 for_each_node_state(nid, N_MEMORY) {
3628 pg_data_t *pgdat = NODE_DATA(nid);
3629 const struct cpumask *mask;
3630
3631 mask = cpumask_of_node(pgdat->node_id);
3632
3633 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
 /* One of our CPUs online: restore mask */
3635 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3636 }
3637 return 0;
3638}
3639
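/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are
 * hot-added.
 */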
3644int kswapd_run(int nid)
3645{
3646 pg_data_t *pgdat = NODE_DATA(nid);
3647 int ret = 0;
3648
3649 if (pgdat->kswapd)
3650 return 0;
3651
3652 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3653 if (IS_ERR(pgdat->kswapd)) {
 /* Failure at boot is fatal */
3655 BUG_ON(system_state == SYSTEM_BOOTING);
3656 pr_err("Failed to start kswapd on node %d\n", nid);
3657 ret = PTR_ERR(pgdat->kswapd);
3658 pgdat->kswapd = NULL;
3659 }
3660 return ret;
3661}
3662
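/*
 * Called by memory hotplug when all memory in a node is offlined.
 */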
3667void kswapd_stop(int nid)
3668{
3669 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3670
3671 if (kswapd) {
3672 kthread_stop(kswapd);
3673 NODE_DATA(nid)->kswapd = NULL;
3674 }
3675}
3676
3677static int __init kswapd_init(void)
3678{
3679 int nid, ret;
3680
3681 swap_setup();
3682 for_each_node_state(nid, N_MEMORY)
3683 kswapd_run(nid);
3684 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
3685 "mm/vmscan:online", kswapd_cpu_online,
3686 NULL);
3687 WARN_ON(ret < 0);
3688 return 0;
3689}
3690
3691module_init(kswapd_init)
3692
3693#ifdef CONFIG_NUMA
3694
/*
 * Node reclaim mode
 *
 * If non-zero, call node_reclaim when the number of free pages falls below
 * the watermarks.
 */
3700int node_reclaim_mode __read_mostly;
3701
3702#define RECLAIM_OFF 0
3703#define RECLAIM_ZONE (1<<0)
3704#define RECLAIM_WRITE (1<<1)
3705#define RECLAIM_UNMAP (1<<2)
3706
/*
 * Priority for NODE_RECLAIM. This determines the fraction of the node's
 * pages scanned in one reclaim pass; a higher priority value means a
 * smaller scan window.
 */
3712#define NODE_RECLAIM_PRIORITY 4
3713
/*
 * Percentage of pages in a node that must be unmapped for node reclaim to
 * occur.
 */
3718int sysctl_min_unmapped_ratio = 1;
3719
/*
 * If the number of slab pages in a node grows beyond this percentage then
 * slab reclaim needs to occur.
 */
3724int sysctl_min_slab_ratio = 5;
3725
3726static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
3727{
3728 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
3729 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
3730 node_page_state(pgdat, NR_ACTIVE_FILE);
3731
 /*
  * It's possible for there to be more file mapped pages than
  * accounted for by the pages on the file LRU lists because
  * tmpfs pages accounted for as ANON can also be FILE_MAPPED.
  */
3737 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
3738}
3739
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
3741static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
3742{
3743 unsigned long nr_pagecache_reclaimable;
3744 unsigned long delta = 0;
3745
 /*
  * If RECLAIM_UNMAP is set, then all file pages are considered
  * potentially reclaimable. Otherwise, we have to worry about
  * pages like swapcache and node_unmapped_file_pages() provides
  * a better estimate.
  */
3752 if (node_reclaim_mode & RECLAIM_UNMAP)
3753 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
3754 else
3755 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
3756
 /* If we can't clean pages, remove dirty pages from consideration */
3758 if (!(node_reclaim_mode & RECLAIM_WRITE))
3759 delta += node_page_state(pgdat, NR_FILE_DIRTY);
3760
 /* Watch for any possible underflows due to delta */
3762 if (unlikely(delta > nr_pagecache_reclaimable))
3763 delta = nr_pagecache_reclaimable;
3764
3765 return nr_pagecache_reclaimable - delta;
3766}
3767
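/*
 * Try to free up some pages from this node through reclaim.
 */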
3771static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
3772{
 /* Minimum pages needed in order to stay on node */
3774 const unsigned long nr_pages = 1 << order;
3775 struct task_struct *p = current;
3776 struct reclaim_state reclaim_state;
3777 int classzone_idx = gfp_zone(gfp_mask);
3778 unsigned int noreclaim_flag;
3779 struct scan_control sc = {
3780 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3781 .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
3782 .order = order,
3783 .priority = NODE_RECLAIM_PRIORITY,
3784 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
3785 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
3786 .may_swap = 1,
3787 .reclaim_idx = classzone_idx,
3788 };
3789
3790 cond_resched();
3791
 /*
  * We need to be able to allocate from the reserves for RECLAIM_UNMAP
  * and we also need to be able to write out pages for RECLAIM_WRITE
  * and RECLAIM_UNMAP.
  */
3796 noreclaim_flag = memalloc_noreclaim_save();
3797 p->flags |= PF_SWAPWRITE;
3798 lockdep_set_current_reclaim_state(gfp_mask);
3799 reclaim_state.reclaimed_slab = 0;
3800 p->reclaim_state = &reclaim_state;
3801
3802 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
 /*
  * Free memory by calling shrink_node with increasing
  * priorities until we have enough memory freed.
  */
3807 do {
3808 shrink_node(pgdat, &sc);
3809 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3810 }
3811
3812 p->reclaim_state = NULL;
3813 current->flags &= ~PF_SWAPWRITE;
3814 memalloc_noreclaim_restore(noreclaim_flag);
3815 lockdep_clear_current_reclaim_state();
3816 return sc.nr_reclaimed >= nr_pages;
3817}
3818
3819int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
3820{
3821 int ret;
3822
 /*
  * Node reclaim reclaims unmapped file backed pages and
  * slab pages if we are over the defined limits.
  *
  * A small portion of unmapped file backed pages is needed for
  * file I/O otherwise pages read by file I/O will be immediately
  * thrown out if the node is overallocated. So we do not reclaim
  * if less than a specified percentage of the node is used by
  * unmapped file backed pages.
  */
3833 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
3834 sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
3835 return NODE_RECLAIM_FULL;
3836
 /*
  * Do not scan if the allocation should not be delayed.
  */
3840 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
3841 return NODE_RECLAIM_NOSCAN;
3842
 /*
  * Only run node reclaim on the local node or on nodes that do not
  * have associated processors. This will favor the local processor
  * over remote processors and spread off-node memory allocations
  * as wide as possible.
  */
3849 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
3850 return NODE_RECLAIM_NOSCAN;
3851
3852 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
3853 return NODE_RECLAIM_NOSCAN;
3854
3855 ret = __node_reclaim(pgdat, gfp_mask, order);
3856 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
3857
3858 if (!ret)
3859 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
3860
3861 return ret;
3862}
3863#endif
3864
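/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 */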
3877int page_evictable(struct page *page)
3878{
3879 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3880}
3881
3882#ifdef CONFIG_SHMEM
3883
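/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * the appropriate lru list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list.
 */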
3892void check_move_unevictable_pages(struct page **pages, int nr_pages)
3893{
3894 struct lruvec *lruvec;
3895 struct pglist_data *pgdat = NULL;
3896 int pgscanned = 0;
3897 int pgrescued = 0;
3898 int i;
3899
3900 for (i = 0; i < nr_pages; i++) {
3901 struct page *page = pages[i];
3902 struct pglist_data *pagepgdat = page_pgdat(page);
3903
3904 pgscanned++;
3905 if (pagepgdat != pgdat) {
3906 if (pgdat)
3907 spin_unlock_irq(&pgdat->lru_lock);
3908 pgdat = pagepgdat;
3909 spin_lock_irq(&pgdat->lru_lock);
3910 }
3911 lruvec = mem_cgroup_page_lruvec(page, pgdat);
3912
3913 if (!PageLRU(page) || !PageUnevictable(page))
3914 continue;
3915
3916 if (page_evictable(page)) {
3917 enum lru_list lru = page_lru_base_type(page);
3918
3919 VM_BUG_ON_PAGE(PageActive(page), page);
3920 ClearPageUnevictable(page);
3921 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3922 add_page_to_lru_list(page, lruvec, lru);
3923 pgrescued++;
3924 }
3925 }
3926
3927 if (pgdat) {
3928 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
3929 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
3930 spin_unlock_irq(&pgdat->lru_lock);
3931 }
3932}
3933#endif
3934