// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/vmscan.c
 *
 *  Page reclaim: ages and scans the LRU lists and shrinker caches to free
 *  memory on behalf of the page allocator, kswapd and memory cgroups.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and is the primary
	 * target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long	anon_cost;
	unsigned long	file_cost;

	/* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
	 * don't threaten to OOM. If any cgroup is reclaimed at
	 * reduced force or passed over entirely due to its memory.low
	 * setting (memcg_low_skipped), and nothing is reclaimed as a
	 * result, then go back for one more cycle that reclaims the
	 * protected memory (memcg_low_reclaim) to avert OOM.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
	unsigned int cache_trim_mode:1;

	/* The file pages on the current node are dangerously low */
	unsigned int file_is_tiny:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
};

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 200.  Higher means more swappy.
 */
int vm_swappiness = 60;

static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite */
	WARN_ON_ONCE(rs && task->reclaim_state);

	/* Check for the nulling of an already-nulled member */
	WARN_ON_ONCE(!rs && !task->reclaim_state);

	task->reclaim_state = rs;
}

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
/*
 * Memcg-aware shrinkers are assigned an id in shrinker_idr so that the
 * per-memcg shrinker bitmaps can refer to them.  The id is allocated in
 * prealloc_shrinker(), before the shrinker is fully set up, so the idr slot
 * temporarily holds this marker value; shrink_slab_memcg() must skip
 * entries that still carry it.
 */
#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)

static DEFINE_IDR(shrinker_idr);
static int shrinker_nr_max;

204static int prealloc_memcg_shrinker(struct shrinker *shrinker)
205{
206 int id, ret = -ENOMEM;
207
208 down_write(&shrinker_rwsem);
209
210 id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
211 if (id < 0)
212 goto unlock;
213
214 if (id >= shrinker_nr_max) {
215 if (memcg_expand_shrinker_maps(id)) {
216 idr_remove(&shrinker_idr, id);
217 goto unlock;
218 }
219
220 shrinker_nr_max = id + 1;
221 }
222 shrinker->id = id;
223 ret = 0;
224unlock:
225 up_write(&shrinker_rwsem);
226 return ret;
227}
228
229static void unregister_memcg_shrinker(struct shrinker *shrinker)
230{
231 int id = shrinker->id;
232
233 BUG_ON(id < 0);
234
235 down_write(&shrinker_rwsem);
236 idr_remove(&shrinker_idr, id);
237 up_write(&shrinker_rwsem);
238}
239
240static bool cgroup_reclaim(struct scan_control *sc)
241{
242 return sc->target_mem_cgroup;
243}

/**
 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
258static bool writeback_throttling_sane(struct scan_control *sc)
259{
260 if (!cgroup_reclaim(sc))
261 return true;
262#ifdef CONFIG_CGROUP_WRITEBACK
263 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
264 return true;
265#endif
266 return false;
267}
268#else
269static int prealloc_memcg_shrinker(struct shrinker *shrinker)
270{
271 return 0;
272}
273
274static void unregister_memcg_shrinker(struct shrinker *shrinker)
275{
276}
277
278static bool cgroup_reclaim(struct scan_control *sc)
279{
280 return false;
281}
282
283static bool writeback_throttling_sane(struct scan_control *sc)
284{
285 return true;
286}
287#endif

/*
 * zone_reclaimable_pages - how many pages on this zone's LRU lists could
 * potentially be reclaimed.  Anon pages only count if swap space is
 * available.
 */
294unsigned long zone_reclaimable_pages(struct zone *zone)
295{
296 unsigned long nr;
297
298 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
299 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
300 if (get_nr_swap_pages() > 0)
301 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
302 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
303
304 return nr;
305}

/**
 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: highest zone to consider
 */
313unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
314{
315 unsigned long size = 0;
316 int zid;
317
318 for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
319 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
320
321 if (!managed_zone(zone))
322 continue;
323
324 if (!mem_cgroup_disabled())
325 size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
326 else
327 size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
328 }
329 return size;
330}

/*
 * Allocate the per-node (and, if needed, per-memcg) state a shrinker
 * requires before it can be registered.
 */
335int prealloc_shrinker(struct shrinker *shrinker)
336{
337 unsigned int size = sizeof(*shrinker->nr_deferred);
338
339 if (shrinker->flags & SHRINKER_NUMA_AWARE)
340 size *= nr_node_ids;
341
342 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
343 if (!shrinker->nr_deferred)
344 return -ENOMEM;
345
346 if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
347 if (prealloc_memcg_shrinker(shrinker))
348 goto free_deferred;
349 }
350
351 return 0;
352
353free_deferred:
354 kfree(shrinker->nr_deferred);
355 shrinker->nr_deferred = NULL;
356 return -ENOMEM;
357}
358
359void free_prealloced_shrinker(struct shrinker *shrinker)
360{
361 if (!shrinker->nr_deferred)
362 return;
363
364 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
365 unregister_memcg_shrinker(shrinker);
366
367 kfree(shrinker->nr_deferred);
368 shrinker->nr_deferred = NULL;
369}
370
371void register_shrinker_prepared(struct shrinker *shrinker)
372{
373 down_write(&shrinker_rwsem);
374 list_add_tail(&shrinker->list, &shrinker_list);
375#ifdef CONFIG_MEMCG
376 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
377 idr_replace(&shrinker_idr, shrinker, shrinker->id);
378#endif
379 up_write(&shrinker_rwsem);
380}
381
382int register_shrinker(struct shrinker *shrinker)
383{
384 int err = prealloc_shrinker(shrinker);
385
386 if (err)
387 return err;
388 register_shrinker_prepared(shrinker);
389 return 0;
390}
391EXPORT_SYMBOL(register_shrinker);

/*
 * Deregister a shrinker and free its deferred-work counters.
 */
396void unregister_shrinker(struct shrinker *shrinker)
397{
398 if (!shrinker->nr_deferred)
399 return;
400 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
401 unregister_memcg_shrinker(shrinker);
402 down_write(&shrinker_rwsem);
403 list_del(&shrinker->list);
404 up_write(&shrinker_rwsem);
405 kfree(shrinker->nr_deferred);
406 shrinker->nr_deferred = NULL;
407}
408EXPORT_SYMBOL(unregister_shrinker);
409
410#define SHRINK_BATCH 128
411
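/*
 * Run one shrinker for one node: derive the number of objects to scan from
 * the shrinker's count_objects() result, the reclaim priority, its "seeks"
 * cost and any work deferred by earlier GFP_NOFS/GFP_NOIO callers, then call
 * scan_objects() in batch_size chunks.  Returns the number of objects freed.
 */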
412static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
413 struct shrinker *shrinker, int priority)
414{
415 unsigned long freed = 0;
416 unsigned long long delta;
417 long total_scan;
418 long freeable;
419 long nr;
420 long new_nr;
421 int nid = shrinkctl->nid;
422 long batch_size = shrinker->batch ? shrinker->batch
423 : SHRINK_BATCH;
424 long scanned = 0, next_deferred;
425
426 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
427 nid = 0;
428
429 freeable = shrinker->count_objects(shrinker, shrinkctl);
430 if (freeable == 0 || freeable == SHRINK_EMPTY)
431 return freeable;

	/*
	 * Copy the deferred work count into a local variable and zero it,
	 * so that concurrent invocations of this shrinker don't also do
	 * the deferred work.
	 */
438 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
439
440 total_scan = nr;
441 if (shrinker->seeks) {
442 delta = freeable >> priority;
443 delta *= 4;
444 do_div(delta, shrinker->seeks);
445 } else {
446
447
448
449
450
451 delta = freeable / 2;
452 }
453
454 total_scan += delta;
455 if (total_scan < 0) {
456 pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
457 shrinker->scan_objects, total_scan);
458 total_scan = freeable;
459 next_deferred = nr;
460 } else
461 next_deferred = total_scan;

	/*
	 * We need to avoid excessive windup on filesystem shrinkers
	 * due to large numbers of GFP_NOFS allocations causing the
	 * shrinkers to return -1 all the time. This results in a large
	 * nr being built up so when a shrink that can do some work
	 * comes along it empties the entire cache due to nr >>>
	 * freeable. This is bad for sustaining a working set in
	 * memory.
	 *
	 * Hence only allow the shrinker to scan the entire cache when
	 * a large delta change is calculated directly.
	 */
475 if (delta < freeable / 4)
476 total_scan = min(total_scan, freeable / 2);
477
478
479
480
481
482
483 if (total_scan > freeable * 2)
484 total_scan = freeable * 2;
485
486 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
487 freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations even though there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much
	 * as possible.
	 */
504 while (total_scan >= batch_size ||
505 total_scan >= freeable) {
506 unsigned long ret;
507 unsigned long nr_to_scan = min(batch_size, total_scan);
508
509 shrinkctl->nr_to_scan = nr_to_scan;
510 shrinkctl->nr_scanned = nr_to_scan;
511 ret = shrinker->scan_objects(shrinker, shrinkctl);
512 if (ret == SHRINK_STOP)
513 break;
514 freed += ret;
515
516 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
517 total_scan -= shrinkctl->nr_scanned;
518 scanned += shrinkctl->nr_scanned;
519
520 cond_resched();
521 }
522
523 if (next_deferred >= scanned)
524 next_deferred -= scanned;
525 else
526 next_deferred = 0;
527
528
529
530
531
532 if (next_deferred > 0)
533 new_nr = atomic_long_add_return(next_deferred,
534 &shrinker->nr_deferred[nid]);
535 else
536 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
537
538 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
539 return freed;
540}
541
542#ifdef CONFIG_MEMCG
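/*
 * Shrink the memcg-aware shrinkers that have objects charged to @memcg on
 * node @nid.  Only shrinkers whose bit is set in the memcg's shrinker map are
 * visited; shrinkers that report SHRINK_EMPTY get their bit cleared (and are
 * re-checked once) so they are skipped on subsequent passes.
 */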
543static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
544 struct mem_cgroup *memcg, int priority)
545{
546 struct memcg_shrinker_map *map;
547 unsigned long ret, freed = 0;
548 int i;
549
550 if (!mem_cgroup_online(memcg))
551 return 0;
552
553 if (!down_read_trylock(&shrinker_rwsem))
554 return 0;
555
556 map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
557 true);
558 if (unlikely(!map))
559 goto unlock;
560
561 for_each_set_bit(i, map->map, shrinker_nr_max) {
562 struct shrink_control sc = {
563 .gfp_mask = gfp_mask,
564 .nid = nid,
565 .memcg = memcg,
566 };
567 struct shrinker *shrinker;
568
569 shrinker = idr_find(&shrinker_idr, i);
570 if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
571 if (!shrinker)
572 clear_bit(i, map->map);
573 continue;
574 }
575
576
577 if (!memcg_kmem_enabled() &&
578 !(shrinker->flags & SHRINKER_NONSLAB))
579 continue;
580
581 ret = do_shrink_slab(&sc, shrinker, priority);
582 if (ret == SHRINK_EMPTY) {
583 clear_bit(i, map->map);

			/*
			 * After the shrinker reported SHRINK_EMPTY and the
			 * bit was cleared, pair with the barrier in
			 * memcg_set_shrinker_bit():
			 *
			 *  list_lru_add()          shrink_slab_memcg()
			 *   list_add_tail()         clear_bit()
			 *   <MB>                    <MB>
			 *   set_bit()               do_shrink_slab()
			 *
			 * and rescan once more: either the second scan sees
			 * the newly added objects, or the concurrent adder
			 * sees the cleared bit and sets it again.
			 */
599 smp_mb__after_atomic();
600 ret = do_shrink_slab(&sc, shrinker, priority);
601 if (ret == SHRINK_EMPTY)
602 ret = 0;
603 else
604 memcg_set_shrinker_bit(memcg, nid, i);
605 }
606 freed += ret;
607
608 if (rwsem_is_contended(&shrinker_rwsem)) {
609 freed = freed ? : 1;
610 break;
611 }
612 }
613unlock:
614 up_read(&shrinker_rwsem);
615 return freed;
616}
617#else
618static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
619 struct mem_cgroup *memcg, int priority)
620{
621 return 0;
622}
623#endif

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
645static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
646 struct mem_cgroup *memcg,
647 int priority)
648{
649 unsigned long ret, freed = 0;
650 struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via the "cgroup_disable=memory" boot parameter.  This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink.  This may result in premature
	 * oom.
	 */
659 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
660 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
661
662 if (!down_read_trylock(&shrinker_rwsem))
663 goto out;
664
665 list_for_each_entry(shrinker, &shrinker_list, list) {
666 struct shrink_control sc = {
667 .gfp_mask = gfp_mask,
668 .nid = nid,
669 .memcg = memcg,
670 };
671
672 ret = do_shrink_slab(&sc, shrinker, priority);
673 if (ret == SHRINK_EMPTY)
674 ret = 0;
675 freed += ret;
676
677
678
679
680
681 if (rwsem_is_contended(&shrinker_rwsem)) {
682 freed = freed ? : 1;
683 break;
684 }
685 }
686
687 up_read(&shrinker_rwsem);
688out:
689 cond_resched();
690 return freed;
691}
692
693void drop_slab_node(int nid)
694{
695 unsigned long freed;
696
697 do {
698 struct mem_cgroup *memcg = NULL;
699
700 if (fatal_signal_pending(current))
701 return;
702
703 freed = 0;
704 memcg = mem_cgroup_iter(NULL, NULL, NULL);
705 do {
706 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
707 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
708 } while (freed > 10);
709}
710
711void drop_slab(void)
712{
713 int nid;
714
715 for_each_online_node(nid)
716 drop_slab_node(nid);
717}
718
719static inline int is_page_cache_freeable(struct page *page)
720{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache and optional buffer
	 * heads at page->private.
	 */
726 int page_cache_pins = thp_nr_pages(page);
727 return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
728}
729
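/*
 * May the current reclaim context start writeback against this inode?
 * Swap writers (PF_SWAPWRITE), an uncongested backing device, and writes to
 * the task's own backing_dev_info are all considered safe.
 */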
730static int may_write_to_inode(struct inode *inode)
731{
732 if (current->flags & PF_SWAPWRITE)
733 return 1;
734 if (!inode_write_congested(inode))
735 return 1;
736 if (inode_to_bdi(inode) == current->backing_dev_info)
737 return 1;
738 return 0;
739}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller
 * has __GFP_FS.
 */
753static void handle_write_error(struct address_space *mapping,
754 struct page *page, int error)
755{
756 lock_page(page);
757 if (page_mapping(page) == mapping)
758 mapping_set_error(mapping, error);
759 unlock_page(page);
760}

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
778static pageout_t pageout(struct page *page, struct address_space *mapping)
779{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_write_iter() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
796 if (!is_page_cache_freeable(page))
797 return PAGE_KEEP;
798 if (!mapping) {
799
800
801
802
803 if (page_has_private(page)) {
804 if (try_to_free_buffers(page)) {
805 ClearPageDirty(page);
806 pr_info("%s: orphaned page\n", __func__);
807 return PAGE_CLEAN;
808 }
809 }
810 return PAGE_KEEP;
811 }
812 if (mapping->a_ops->writepage == NULL)
813 return PAGE_ACTIVATE;
814 if (!may_write_to_inode(mapping->host))
815 return PAGE_KEEP;
816
817 if (clear_page_dirty_for_io(page)) {
818 int res;
819 struct writeback_control wbc = {
820 .sync_mode = WB_SYNC_NONE,
821 .nr_to_write = SWAP_CLUSTER_MAX,
822 .range_start = 0,
823 .range_end = LLONG_MAX,
824 .for_reclaim = 1,
825 };
826
827 SetPageReclaim(page);
828 res = mapping->a_ops->writepage(page, &wbc);
829 if (res < 0)
830 handle_write_error(mapping, page, res);
831 if (res == AOP_WRITEPAGE_ACTIVATE) {
832 ClearPageReclaim(page);
833 return PAGE_ACTIVATE;
834 }
835
836 if (!PageWriteback(page)) {
837
838 ClearPageReclaim(page);
839 }
840 trace_mm_vmscan_writepage(page);
841 inc_node_page_state(page, NR_VMSCAN_WRITE);
842 return PAGE_SUCCESS;
843 }
844
845 return PAGE_CLEAN;
846}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
852static int __remove_mapping(struct address_space *mapping, struct page *page,
853 bool reclaimed, struct mem_cgroup *target_memcg)
854{
855 unsigned long flags;
856 int refcount;
857 void *shadow = NULL;
858
859 BUG_ON(!PageLocked(page));
860 BUG_ON(mapping != page_mapping(page));
861
862 xa_lock_irqsave(&mapping->i_pages, flags);
	/*
	 * The non-racy check for a busy page: freeze the refcount to the
	 * number of expected references (page cache + caller + tail pages)
	 * *before* testing PageDirty.  If PageDirty were tested first, a
	 * concurrent user could dirty the page and drop its reference
	 * between the two tests and the dirty data would be discarded.
	 * The atomic_cmpxchg() in page_ref_freeze() provides the ordering
	 * between the refcount update and the page->flags load.
	 */
888 refcount = 1 + compound_nr(page);
889 if (!page_ref_freeze(page, refcount))
890 goto cannot_free;
891
892 if (unlikely(PageDirty(page))) {
893 page_ref_unfreeze(page, refcount);
894 goto cannot_free;
895 }
896
897 if (PageSwapCache(page)) {
898 swp_entry_t swap = { .val = page_private(page) };
899 mem_cgroup_swapout(page, swap);
900 if (reclaimed && !mapping_exiting(mapping))
901 shadow = workingset_eviction(page, target_memcg);
902 __delete_from_swap_cache(page, swap, shadow);
903 xa_unlock_irqrestore(&mapping->i_pages, flags);
904 put_swap_page(page, swap);
905 } else {
906 void (*freepage)(struct page *);
907
908 freepage = mapping->a_ops->freepage;
		/*
		 * Remember a shadow entry for reclaimed file cache in
		 * order to detect refaults, thus thrashing, later on.
		 *
		 * But don't store shadows in an address space that is
		 * already exiting.  This is not just an optimization,
		 * inode reclaim needs to empty out the radix tree or
		 * the nodes are lost.  Don't plant shadows behind its
		 * back.
		 *
		 * We also don't store shadows for DAX mappings because the
		 * only page cache pages found in these are zero pages
		 * covering holes, and because we don't want to mix DAX
		 * exceptional entries and shadow exceptional entries in the
		 * same address_space.
		 */
925 if (reclaimed && page_is_file_lru(page) &&
926 !mapping_exiting(mapping) && !dax_mapping(mapping))
927 shadow = workingset_eviction(page, target_memcg);
928 __delete_from_page_cache(page, shadow);
929 xa_unlock_irqrestore(&mapping->i_pages, flags);
930
931 if (freepage != NULL)
932 freepage(page);
933 }
934
935 return 1;
936
937cannot_free:
938 xa_unlock_irqrestore(&mapping->i_pages, flags);
939 return 0;
940}

/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
948int remove_mapping(struct address_space *mapping, struct page *page)
949{
950 if (__remove_mapping(mapping, page, false, NULL)) {
951
952
953
954
955
956 page_ref_unfreeze(page, 1);
957 return 1;
958 }
959 return 0;
960}

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
971void putback_lru_page(struct page *page)
972{
973 lru_cache_add(page);
974 put_page(page);
975}
976
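/* What to do with a scanned page, as decided by page_check_references(). */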
977enum page_references {
978 PAGEREF_RECLAIM,
979 PAGEREF_RECLAIM_CLEAN,
980 PAGEREF_KEEP,
981 PAGEREF_ACTIVATE,
982};
983
984static enum page_references page_check_references(struct page *page,
985 struct scan_control *sc)
986{
987 int referenced_ptes, referenced_page;
988 unsigned long vm_flags;
989
990 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
991 &vm_flags);
992 referenced_page = TestClearPageReferenced(page);

	/*
	 * Mlock lost the isolation race with us.  Let try_to_unmap()
	 * move the page to the unevictable list.
	 */
998 if (vm_flags & VM_LOCKED)
999 return PAGEREF_RECLAIM;
1000
1001 if (referenced_ptes) {
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
1016 SetPageReferenced(page);
1017
1018 if (referenced_page || referenced_ptes > 1)
1019 return PAGEREF_ACTIVATE;
1020
1021
1022
1023
1024 if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
1025 return PAGEREF_ACTIVATE;
1026
1027 return PAGEREF_KEEP;
1028 }
1029
1030
1031 if (referenced_page && !PageSwapBacked(page))
1032 return PAGEREF_RECLAIM_CLEAN;
1033
1034 return PAGEREF_RECLAIM;
1035}
1036
1037
1038static void page_check_dirty_writeback(struct page *page,
1039 bool *dirty, bool *writeback)
1040{
1041 struct address_space *mapping;

	/*
	 * Anonymous pages are not handled by flushers and must be written
	 * from reclaim context. Do not stall reclaim based on them.
	 */
1047 if (!page_is_file_lru(page) ||
1048 (PageAnon(page) && !PageSwapBacked(page))) {
1049 *dirty = false;
1050 *writeback = false;
1051 return;
1052 }
1053
1054
1055 *dirty = PageDirty(page);
1056 *writeback = PageWriteback(page);
1057
1058
1059 if (!page_has_private(page))
1060 return;
1061
1062 mapping = page_mapping(page);
1063 if (mapping && mapping->a_ops->is_dirty_writeback)
1064 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1065}

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
1070static unsigned int shrink_page_list(struct list_head *page_list,
1071 struct pglist_data *pgdat,
1072 struct scan_control *sc,
1073 struct reclaim_stat *stat,
1074 bool ignore_references)
1075{
1076 LIST_HEAD(ret_pages);
1077 LIST_HEAD(free_pages);
1078 unsigned int nr_reclaimed = 0;
1079 unsigned int pgactivate = 0;
1080
1081 memset(stat, 0, sizeof(*stat));
1082 cond_resched();
1083
1084 while (!list_empty(page_list)) {
1085 struct address_space *mapping;
1086 struct page *page;
1087 enum page_references references = PAGEREF_RECLAIM;
1088 bool dirty, writeback, may_enter_fs;
1089 unsigned int nr_pages;
1090
1091 cond_resched();
1092
1093 page = lru_to_page(page_list);
1094 list_del(&page->lru);
1095
1096 if (!trylock_page(page))
1097 goto keep;
1098
1099 VM_BUG_ON_PAGE(PageActive(page), page);
1100
1101 nr_pages = compound_nr(page);
1102
1103
1104 sc->nr_scanned += nr_pages;
1105
1106 if (unlikely(!page_evictable(page)))
1107 goto activate_locked;
1108
1109 if (!sc->may_unmap && page_mapped(page))
1110 goto keep_locked;
1111
1112 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1113 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

		/*
		 * The number of dirty pages determines if a node is marked
		 * reclaim_congested which affects wait_iff_congested. kswapd
		 * will stall and start writing pages if the tail of the LRU
		 * is all dirty unqueued pages.
		 */
1121 page_check_dirty_writeback(page, &dirty, &writeback);
1122 if (dirty || writeback)
1123 stat->nr_dirty++;
1124
1125 if (dirty && !writeback)
1126 stat->nr_unqueued_dirty++;

		/*
		 * Treat this page as congested if the underlying BDI is or if
		 * pages are cycling through the LRU so quickly that the
		 * pages marked for immediate reclaim are making it to the
		 * end of the LRU a second time.
		 */
1134 mapping = page_mapping(page);
1135 if (((dirty || writeback) && mapping &&
1136 inode_write_congested(mapping->host)) ||
1137 (writeback && PageReclaim(page)))
1138 stat->nr_congested++;

		/*
		 * If a page at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) If reclaim is encountering an excessive number of pages
		 *    under writeback and this page has both the writeback and
		 *    reclaim flags set, then pages are being queued for IO but
		 *    are being recycled through the LRU before the IO can
		 *    complete.  Note this as "immediate" so kswapd may stall,
		 *    and activate the page so it is not considered again.
		 *
		 * 2) Global or new memcg reclaim encounters a page that is
		 *    not marked for immediate reclaim, or the caller does not
		 *    have __GFP_FS (or __GFP_IO if it is simply going to
		 *    swap, not to fs).  In this case mark the page for
		 *    immediate reclaim and continue scanning; throttling
		 *    happens later via wait_iff_congested().
		 *
		 * 3) Legacy memcg encounters a page that already has the
		 *    reclaim flag set.  memcg does not have any dirty page
		 *    throttling so we could easily OOM just because too many
		 *    pages are in writeback and there is nothing else to
		 *    reclaim.  Wait for the writeback to complete.
		 *
		 * In cases 1) and 2) we activate the pages to get them out of
		 * the way while we continue scanning for clean pages on the
		 * inactive list and refilling from the active list.
		 */
1182 if (PageWriteback(page)) {
			/* Case 1 above */
1184 if (current_is_kswapd() &&
1185 PageReclaim(page) &&
1186 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1187 stat->nr_immediate++;
1188 goto activate_locked;

			/* Case 2 above */
1191 } else if (writeback_throttling_sane(sc) ||
1192 !PageReclaim(page) || !may_enter_fs) {
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204 SetPageReclaim(page);
1205 stat->nr_writeback++;
1206 goto activate_locked;

			/* Case 3 above */
1209 } else {
1210 unlock_page(page);
1211 wait_on_page_writeback(page);
1212
1213 list_add_tail(&page->lru, page_list);
1214 continue;
1215 }
1216 }
1217
1218 if (!ignore_references)
1219 references = page_check_references(page, sc);
1220
1221 switch (references) {
1222 case PAGEREF_ACTIVATE:
1223 goto activate_locked;
1224 case PAGEREF_KEEP:
1225 stat->nr_ref_keep += nr_pages;
1226 goto keep_locked;
1227 case PAGEREF_RECLAIM:
1228 case PAGEREF_RECLAIM_CLEAN:
1229 ;
1230 }
1231
1232
1233
1234
1235
1236
1237 if (PageAnon(page) && PageSwapBacked(page)) {
1238 if (!PageSwapCache(page)) {
1239 if (!(sc->gfp_mask & __GFP_IO))
1240 goto keep_locked;
1241 if (page_maybe_dma_pinned(page))
1242 goto keep_locked;
1243 if (PageTransHuge(page)) {
1244
1245 if (!can_split_huge_page(page, NULL))
1246 goto activate_locked;
1247
1248
1249
1250
1251
1252 if (!compound_mapcount(page) &&
1253 split_huge_page_to_list(page,
1254 page_list))
1255 goto activate_locked;
1256 }
1257 if (!add_to_swap(page)) {
1258 if (!PageTransHuge(page))
1259 goto activate_locked_split;
1260
1261 if (split_huge_page_to_list(page,
1262 page_list))
1263 goto activate_locked;
1264#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1265 count_vm_event(THP_SWPOUT_FALLBACK);
1266#endif
1267 if (!add_to_swap(page))
1268 goto activate_locked_split;
1269 }
1270
1271 may_enter_fs = true;
1272
1273
1274 mapping = page_mapping(page);
1275 }
1276 } else if (unlikely(PageTransHuge(page))) {
1277
1278 if (split_huge_page_to_list(page, page_list))
1279 goto keep_locked;
1280 }

		/*
		 * THP may get split above, need minus tail pages and update
		 * nr_pages to avoid accounting tail pages twice.
		 *
		 * The tail pages that are added into swap cache successfully
		 * reach here.
		 */
1289 if ((nr_pages > 1) && !PageTransHuge(page)) {
1290 sc->nr_scanned -= (nr_pages - 1);
1291 nr_pages = 1;
1292 }
1293
1294
1295
1296
1297
1298 if (page_mapped(page)) {
1299 enum ttu_flags flags = TTU_BATCH_FLUSH;
1300 bool was_swapbacked = PageSwapBacked(page);
1301
1302 if (unlikely(PageTransHuge(page)))
1303 flags |= TTU_SPLIT_HUGE_PMD;
1304
1305 if (!try_to_unmap(page, flags)) {
1306 stat->nr_unmap_fail += nr_pages;
1307 if (!was_swapbacked && PageSwapBacked(page))
1308 stat->nr_lazyfree_fail += nr_pages;
1309 goto activate_locked;
1310 }
1311 }
1312
1313 if (PageDirty(page)) {
			/*
			 * Only kswapd can writeback filesystem pages
			 * to avoid risk of stack overflow. But avoid
			 * injecting inefficient single-page IO into
			 * flusher writeback as much as possible: only
			 * write pages when we've encountered many
			 * dirty pages, and when we've already scanned
			 * the rest of the LRU for clean pages and see
			 * the same dirty pages again (PageReclaim).
			 */
1324 if (page_is_file_lru(page) &&
1325 (!current_is_kswapd() || !PageReclaim(page) ||
1326 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1327
1328
1329
1330
1331
1332
1333 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1334 SetPageReclaim(page);
1335
1336 goto activate_locked;
1337 }
1338
1339 if (references == PAGEREF_RECLAIM_CLEAN)
1340 goto keep_locked;
1341 if (!may_enter_fs)
1342 goto keep_locked;
1343 if (!sc->may_writepage)
1344 goto keep_locked;
1345
1346
1347
1348
1349
1350
1351 try_to_unmap_flush_dirty();
1352 switch (pageout(page, mapping)) {
1353 case PAGE_KEEP:
1354 goto keep_locked;
1355 case PAGE_ACTIVATE:
1356 goto activate_locked;
1357 case PAGE_SUCCESS:
1358 stat->nr_pageout += thp_nr_pages(page);
1359
1360 if (PageWriteback(page))
1361 goto keep;
1362 if (PageDirty(page))
1363 goto keep;
1364
1365
1366
1367
1368
1369 if (!trylock_page(page))
1370 goto keep;
1371 if (PageDirty(page) || PageWriteback(page))
1372 goto keep_locked;
1373 mapping = page_mapping(page);
1374 fallthrough;
1375 case PAGE_CLEAN:
1376 ;
1377 }
1378 }
1379

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is
		 * actually clean (all its buffers are clean).  This happens
		 * if the buffers were written out directly.  In that case
		 * try_to_release_page() will discover that cleanness, drop
		 * the buffers and mark the page clean - it can be freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These
		 * are the pages which were not successfully invalidated in
		 * truncation.  We try to drop those buffers here and if that
		 * worked, and the page is no longer mapped into process
		 * address space (page_count == 1) it can be freed.
		 * Otherwise, leave the page on the LRU so it is swappable.
		 */
1401 if (page_has_private(page)) {
1402 if (!try_to_release_page(page, sc->gfp_mask))
1403 goto activate_locked;
1404 if (!mapping && page_count(page) == 1) {
1405 unlock_page(page);
1406 if (put_page_testzero(page))
1407 goto free_it;
1408 else {
1409
1410
1411
1412
1413
1414
1415
1416 nr_reclaimed++;
1417 continue;
1418 }
1419 }
1420 }
1421
1422 if (PageAnon(page) && !PageSwapBacked(page)) {
1423
1424 if (!page_ref_freeze(page, 1))
1425 goto keep_locked;
1426 if (PageDirty(page)) {
1427 page_ref_unfreeze(page, 1);
1428 goto keep_locked;
1429 }
1430
1431 count_vm_event(PGLAZYFREED);
1432 count_memcg_page_event(page, PGLAZYFREED);
1433 } else if (!mapping || !__remove_mapping(mapping, page, true,
1434 sc->target_mem_cgroup))
1435 goto keep_locked;
1436
1437 unlock_page(page);
1438free_it:
1439
1440
1441
1442
1443 nr_reclaimed += nr_pages;
1444
1445
1446
1447
1448
1449 if (unlikely(PageTransHuge(page)))
1450 destroy_compound_page(page);
1451 else
1452 list_add(&page->lru, &free_pages);
1453 continue;
1454
1455activate_locked_split:
1456
1457
1458
1459
1460 if (nr_pages > 1) {
1461 sc->nr_scanned -= (nr_pages - 1);
1462 nr_pages = 1;
1463 }
1464activate_locked:
1465
1466 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1467 PageMlocked(page)))
1468 try_to_free_swap(page);
1469 VM_BUG_ON_PAGE(PageActive(page), page);
1470 if (!PageMlocked(page)) {
1471 int type = page_is_file_lru(page);
1472 SetPageActive(page);
1473 stat->nr_activate[type] += nr_pages;
1474 count_memcg_page_event(page, PGACTIVATE);
1475 }
1476keep_locked:
1477 unlock_page(page);
1478keep:
1479 list_add(&page->lru, &ret_pages);
1480 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1481 }
1482
1483 pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1484
1485 mem_cgroup_uncharge_list(&free_pages);
1486 try_to_unmap_flush();
1487 free_unref_page_list(&free_pages);
1488
1489 list_splice(&ret_pages, page_list);
1490 count_vm_events(PGACTIVATE, pgactivate);
1491
1492 return nr_reclaimed;
1493}
1494
1495unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1496 struct list_head *page_list)
1497{
1498 struct scan_control sc = {
1499 .gfp_mask = GFP_KERNEL,
1500 .priority = DEF_PRIORITY,
1501 .may_unmap = 1,
1502 };
1503 struct reclaim_stat stat;
1504 unsigned int nr_reclaimed;
1505 struct page *page, *next;
1506 LIST_HEAD(clean_pages);
1507
1508 list_for_each_entry_safe(page, next, page_list, lru) {
1509 if (page_is_file_lru(page) && !PageDirty(page) &&
1510 !__PageMovable(page) && !PageUnevictable(page)) {
1511 ClearPageActive(page);
1512 list_move(&page->lru, &clean_pages);
1513 }
1514 }
1515
1516 nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1517 &stat, true);
1518 list_splice(&clean_pages, page_list);
1519 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1520 -(long)nr_reclaimed);

	/*
	 * Since lazyfree pages are isolated from the file LRU from the
	 * beginning, they will rotate back to the anonymous LRU in the end
	 * if discarding fails, so the isolated ANON/FILE counters need to
	 * be fixed up here.
	 */
1527 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
1528 stat.nr_lazyfree_fail);
1529 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1530 -(long)stat.nr_lazyfree_fail);
1531 return nr_reclaimed;
1532}

/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes
 *
 * returns 0 on success, -ve errno on failure.
 */
1544int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
1545{
1546 int ret = -EBUSY;
1547
1548
1549 if (!PageLRU(page))
1550 return ret;
1551
1552
1553 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1554 return ret;
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564 if (mode & ISOLATE_ASYNC_MIGRATE) {
1565
1566 if (PageWriteback(page))
1567 return ret;
1568
1569 if (PageDirty(page)) {
1570 struct address_space *mapping;
1571 bool migrate_dirty;

			/*
			 * Only pages without mappings or that have a
			 * ->migratepage callback are possible to migrate
			 * without blocking. However, we can be racing with
			 * truncation so it's necessary to lock the page
			 * to stabilise the mapping as truncation holds
			 * the page lock until after the page is removed
			 * from the page cache.
			 */
1582 if (!trylock_page(page))
1583 return ret;
1584
1585 mapping = page_mapping(page);
1586 migrate_dirty = !mapping || mapping->a_ops->migratepage;
1587 unlock_page(page);
1588 if (!migrate_dirty)
1589 return ret;
1590 }
1591 }
1592
1593 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1594 return ret;
1595
1596 return 0;
1597}

/*
 * Update LRU sizes after isolating pages. The LRU size updates must
 * be complete before mem_cgroup_update_lru_size due to a sanity check.
 */
1603static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1604 enum lru_list lru, unsigned long *nr_zone_taken)
1605{
1606 int zid;
1607
1608 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1609 if (!nr_zone_taken[zid])
1610 continue;
1611
1612 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1613 }
1614
1615}

/**
 * isolate_lru_pages - isolate some pages from an lru list
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @lru:	LRU list id for isolating
 *
 * lruvec->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * The lru_lock must be held before calling this function.
 *
 * returns how many pages were moved onto *@dst.
 */
1638static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1639 struct lruvec *lruvec, struct list_head *dst,
1640 unsigned long *nr_scanned, struct scan_control *sc,
1641 enum lru_list lru)
1642{
1643 struct list_head *src = &lruvec->lists[lru];
1644 unsigned long nr_taken = 0;
1645 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1646 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1647 unsigned long skipped = 0;
1648 unsigned long scan, total_scan, nr_pages;
1649 LIST_HEAD(pages_skipped);
1650 isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
1651
1652 total_scan = 0;
1653 scan = 0;
1654 while (scan < nr_to_scan && !list_empty(src)) {
1655 struct page *page;
1656
1657 page = lru_to_page(src);
1658 prefetchw_prev_lru_page(page, src, flags);
1659
1660 nr_pages = compound_nr(page);
1661 total_scan += nr_pages;
1662
1663 if (page_zonenum(page) > sc->reclaim_idx) {
1664 list_move(&page->lru, &pages_skipped);
1665 nr_skipped[page_zonenum(page)] += nr_pages;
1666 continue;
1667 }
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679 scan += nr_pages;
1680 switch (__isolate_lru_page_prepare(page, mode)) {
1681 case 0:
1682
1683
1684
1685
1686
1687 if (unlikely(!get_page_unless_zero(page)))
1688 goto busy;
1689
1690 if (!TestClearPageLRU(page)) {
1691
1692
1693
1694
1695 put_page(page);
1696 goto busy;
1697 }
1698
1699 nr_taken += nr_pages;
1700 nr_zone_taken[page_zonenum(page)] += nr_pages;
1701 list_move(&page->lru, dst);
1702 break;
1703
1704 default:
1705busy:
1706
1707 list_move(&page->lru, src);
1708 }
1709 }

	/*
	 * Splice any skipped pages back onto the head of the LRU so the
	 * scanner does not keep finding them, and account how many were
	 * skipped per zone.  Because the scan can restart from the tail,
	 * accounting the skips keeps the scan from looking like it made
	 * no progress, which could trigger a premature OOM.
	 */
1718 if (!list_empty(&pages_skipped)) {
1719 int zid;
1720
1721 list_splice(&pages_skipped, src);
1722 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1723 if (!nr_skipped[zid])
1724 continue;
1725
1726 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1727 skipped += nr_skipped[zid];
1728 }
1729 }
1730 *nr_scanned = total_scan;
1731 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1732 total_scan, skipped, nr_taken, mode, lru);
1733 update_lru_sizes(lruvec, lru, nr_zone_taken);
1734 return nr_taken;
1735}

/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 *
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) The lru_lock must not be held.
 * (3) Interrupts must be enabled.
 */
1763int isolate_lru_page(struct page *page)
1764{
1765 int ret = -EBUSY;
1766
1767 VM_BUG_ON_PAGE(!page_count(page), page);
1768 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1769
1770 if (TestClearPageLRU(page)) {
1771 struct lruvec *lruvec;
1772
1773 get_page(page);
1774 lruvec = lock_page_lruvec_irq(page);
1775 del_page_from_lru_list(page, lruvec, page_lru(page));
1776 unlock_page_lruvec_irq(lruvec);
1777 ret = 0;
1778 }
1779
1780 return ret;
1781}

/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
1790static int too_many_isolated(struct pglist_data *pgdat, int file,
1791 struct scan_control *sc)
1792{
1793 unsigned long inactive, isolated;
1794
1795 if (current_is_kswapd())
1796 return 0;
1797
1798 if (!writeback_throttling_sane(sc))
1799 return 0;
1800
1801 if (file) {
1802 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1803 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1804 } else {
1805 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1806 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1807 }

	/*
	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so
	 * they won't get blocked by normal direct-reclaimers, forming a
	 * circular deadlock.
	 */
1814 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1815 inactive >>= 3;
1816
1817 return isolated > inactive;
1818}

/*
 * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
 * On return, @list is reused as a list of pages to be freed by the caller.
 *
 * Returns the number of pages moved to the given lruvec.
 */
1826static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
1827 struct list_head *list)
1828{
1829 int nr_pages, nr_moved = 0;
1830 LIST_HEAD(pages_to_free);
1831 struct page *page;
1832 enum lru_list lru;
1833
1834 while (!list_empty(list)) {
1835 page = lru_to_page(list);
1836 VM_BUG_ON_PAGE(PageLRU(page), page);
1837 list_del(&page->lru);
1838 if (unlikely(!page_evictable(page))) {
1839 spin_unlock_irq(&lruvec->lru_lock);
1840 putback_lru_page(page);
1841 spin_lock_irq(&lruvec->lru_lock);
1842 continue;
1843 }

		/*
		 * The SetPageLRU needs to be kept here for list integrity.
		 * Otherwise:
		 *   #0 move_pages_to_lru             #1 release_pages
		 *   if !put_page_testzero
		 *                                    if (put_page_testzero())
		 *                                      !PageLRU //skip lru_lock
		 *     SetPageLRU()
		 *     list_add(&page->lru,)
		 *                                      list_add(&page->lru,)
		 */
1856 SetPageLRU(page);
1857
1858 if (unlikely(put_page_testzero(page))) {
1859 __ClearPageLRU(page);
1860 __ClearPageActive(page);
1861
1862 if (unlikely(PageCompound(page))) {
1863 spin_unlock_irq(&lruvec->lru_lock);
1864 destroy_compound_page(page);
1865 spin_lock_irq(&lruvec->lru_lock);
1866 } else
1867 list_add(&page->lru, &pages_to_free);
1868
1869 continue;
1870 }
1871
1872
1873
1874
1875
1876 VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
1877 lru = page_lru(page);
1878 nr_pages = thp_nr_pages(page);
1879
1880 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
1881 list_add(&page->lru, &lruvec->lists[lru]);
1882 nr_moved += nr_pages;
1883 if (PageActive(page))
1884 workingset_age_nonresident(lruvec, nr_pages);
1885 }
1886
1887
1888
1889
1890 list_splice(&pages_to_free, list);
1891
1892 return nr_moved;
1893}

/*
 * If a kernel thread (such as nfsd for loop-back mounts) services a backing
 * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this
 * case we should not throttle it otherwise it may deadlock.
 */
1901static int current_may_throttle(void)
1902{
1903 return !(current->flags & PF_LOCAL_THROTTLE) ||
1904 current->backing_dev_info == NULL ||
1905 bdi_write_congested(current->backing_dev_info);
1906}

/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the
 * number of reclaimed pages.
 */
1912static noinline_for_stack unsigned long
1913shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1914 struct scan_control *sc, enum lru_list lru)
1915{
1916 LIST_HEAD(page_list);
1917 unsigned long nr_scanned;
1918 unsigned int nr_reclaimed = 0;
1919 unsigned long nr_taken;
1920 struct reclaim_stat stat;
1921 bool file = is_file_lru(lru);
1922 enum vm_event_item item;
1923 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1924 bool stalled = false;
1925
1926 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1927 if (stalled)
1928 return 0;
1929
1930
1931 msleep(100);
1932 stalled = true;
1933
1934
1935 if (fatal_signal_pending(current))
1936 return SWAP_CLUSTER_MAX;
1937 }
1938
1939 lru_add_drain();
1940
1941 spin_lock_irq(&lruvec->lru_lock);
1942
1943 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1944 &nr_scanned, sc, lru);
1945
1946 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1947 item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
1948 if (!cgroup_reclaim(sc))
1949 __count_vm_events(item, nr_scanned);
1950 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
1951 __count_vm_events(PGSCAN_ANON + file, nr_scanned);
1952
1953 spin_unlock_irq(&lruvec->lru_lock);
1954
1955 if (nr_taken == 0)
1956 return 0;
1957
1958 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
1959
1960 spin_lock_irq(&lruvec->lru_lock);
1961 move_pages_to_lru(lruvec, &page_list);
1962
1963 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1964 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
1965 if (!cgroup_reclaim(sc))
1966 __count_vm_events(item, nr_reclaimed);
1967 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
1968 __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
1969 spin_unlock_irq(&lruvec->lru_lock);
1970
1971 lru_note_cost(lruvec, file, stat.nr_pageout);
1972 mem_cgroup_uncharge_list(&page_list);
1973 free_unref_page_list(&page_list);

	/*
	 * If dirty pages are scanned that are not queued for IO, it
	 * implies that flushers are not doing their job. This can
	 * happen when memory pressure pushes dirty pages to the end of
	 * the LRU before the dirty limits are breached and the dirty
	 * data has expired. It can also happen when the proportion of
	 * dirty pages grows not through writes but through memory
	 * pressure reclaiming all the clean cache. And in some cases,
	 * the flushers simply cannot keep up with the allocation
	 * rate. Nudge the flusher threads in case they are asleep.
	 */
1986 if (stat.nr_unqueued_dirty == nr_taken)
1987 wakeup_flusher_threads(WB_REASON_VMSCAN);
1988
1989 sc->nr.dirty += stat.nr_dirty;
1990 sc->nr.congested += stat.nr_congested;
1991 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
1992 sc->nr.writeback += stat.nr_writeback;
1993 sc->nr.immediate += stat.nr_immediate;
1994 sc->nr.taken += nr_taken;
1995 if (file)
1996 sc->nr.file_taken += nr_taken;
1997
1998 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
1999 nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2000 return nr_reclaimed;
2001}

/*
 * shrink_active_list() moves pages from the active LRU to the inactive LRU.
 *
 * We move them the other way if the page is referenced by one or more
 * processes.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()), so
 * we should drop lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_refcount against each page.
 * But we had to alter page->flags anyway.
 */
2020static void shrink_active_list(unsigned long nr_to_scan,
2021 struct lruvec *lruvec,
2022 struct scan_control *sc,
2023 enum lru_list lru)
2024{
2025 unsigned long nr_taken;
2026 unsigned long nr_scanned;
2027 unsigned long vm_flags;
2028 LIST_HEAD(l_hold);
2029 LIST_HEAD(l_active);
2030 LIST_HEAD(l_inactive);
2031 struct page *page;
2032 unsigned nr_deactivate, nr_activate;
2033 unsigned nr_rotated = 0;
2034 int file = is_file_lru(lru);
2035 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2036
2037 lru_add_drain();
2038
2039 spin_lock_irq(&lruvec->lru_lock);
2040
2041 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2042 &nr_scanned, sc, lru);
2043
2044 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2045
2046 if (!cgroup_reclaim(sc))
2047 __count_vm_events(PGREFILL, nr_scanned);
2048 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2049
2050 spin_unlock_irq(&lruvec->lru_lock);
2051
2052 while (!list_empty(&l_hold)) {
2053 cond_resched();
2054 page = lru_to_page(&l_hold);
2055 list_del(&page->lru);
2056
2057 if (unlikely(!page_evictable(page))) {
2058 putback_lru_page(page);
2059 continue;
2060 }
2061
2062 if (unlikely(buffer_heads_over_limit)) {
2063 if (page_has_private(page) && trylock_page(page)) {
2064 if (page_has_private(page))
2065 try_to_release_page(page, 0);
2066 unlock_page(page);
2067 }
2068 }
2069
2070 if (page_referenced(page, 0, sc->target_mem_cgroup,
2071 &vm_flags)) {
			/*
			 * Identify referenced, file-backed active pages and
			 * give them one more trip around the active list. So
			 * that executable code get better chances to stay in
			 * memory under moderate memory pressure.  Anon pages
			 * are not likely to be evicted by use-once streaming
			 * IO, plus JVM can create lots of anon VM_EXEC pages,
			 * so we ignore them here.
			 */
2081 if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
2082 nr_rotated += thp_nr_pages(page);
2083 list_add(&page->lru, &l_active);
2084 continue;
2085 }
2086 }
2087
2088 ClearPageActive(page);
2089 SetPageWorkingset(page);
2090 list_add(&page->lru, &l_inactive);
2091 }
2092
2093
2094
2095
2096 spin_lock_irq(&lruvec->lru_lock);
2097
2098 nr_activate = move_pages_to_lru(lruvec, &l_active);
2099 nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2100
2101 list_splice(&l_inactive, &l_active);
2102
2103 __count_vm_events(PGDEACTIVATE, nr_deactivate);
2104 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2105
2106 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2107 spin_unlock_irq(&lruvec->lru_lock);
2108
2109 mem_cgroup_uncharge_list(&l_active);
2110 free_unref_page_list(&l_active);
2111 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2112 nr_deactivate, nr_rotated, sc->priority, file);
2113}
2114
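/*
 * Reclaim an arbitrary list of isolated pages, batching them per node so
 * that shrink_page_list() only ever sees pages from a single pgdat.  Pages
 * that could not be reclaimed are put back on their LRU lists.  Returns the
 * number of pages reclaimed.
 */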
2115unsigned long reclaim_pages(struct list_head *page_list)
2116{
2117 int nid = NUMA_NO_NODE;
2118 unsigned int nr_reclaimed = 0;
2119 LIST_HEAD(node_page_list);
2120 struct reclaim_stat dummy_stat;
2121 struct page *page;
2122 struct scan_control sc = {
2123 .gfp_mask = GFP_KERNEL,
2124 .priority = DEF_PRIORITY,
2125 .may_writepage = 1,
2126 .may_unmap = 1,
2127 .may_swap = 1,
2128 };
2129
2130 while (!list_empty(page_list)) {
2131 page = lru_to_page(page_list);
2132 if (nid == NUMA_NO_NODE) {
2133 nid = page_to_nid(page);
2134 INIT_LIST_HEAD(&node_page_list);
2135 }
2136
2137 if (nid == page_to_nid(page)) {
2138 ClearPageActive(page);
2139 list_move(&page->lru, &node_page_list);
2140 continue;
2141 }
2142
2143 nr_reclaimed += shrink_page_list(&node_page_list,
2144 NODE_DATA(nid),
2145 &sc, &dummy_stat, false);
2146 while (!list_empty(&node_page_list)) {
2147 page = lru_to_page(&node_page_list);
2148 list_del(&page->lru);
2149 putback_lru_page(page);
2150 }
2151
2152 nid = NUMA_NO_NODE;
2153 }
2154
2155 if (!list_empty(&node_page_list)) {
2156 nr_reclaimed += shrink_page_list(&node_page_list,
2157 NODE_DATA(nid),
2158 &sc, &dummy_stat, false);
2159 while (!list_empty(&node_page_list)) {
2160 page = lru_to_page(&node_page_list);
2161 list_del(&page->lru);
2162 putback_lru_page(page);
2163 }
2164 }
2165
2166 return nr_reclaimed;
2167}
2168
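/*
 * Route a scan request to the right routine: active lists are only aged
 * (deactivated) when the corresponding may_deactivate bit allows it, while
 * inactive lists are reclaimed directly.
 */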
2169static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2170 struct lruvec *lruvec, struct scan_control *sc)
2171{
2172 if (is_active_lru(lru)) {
2173 if (sc->may_deactivate & (1 << is_file_lru(lru)))
2174 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2175 else
2176 sc->skipped_deactivate = 1;
2177 return 0;
2178 }
2179
2180 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2181}

/*
 * The inactive anon list should be small enough that the VM never has
 * to do too much work.
 *
 * The inactive file list should be small enough to leave most memory
 * to the established workingset on the scan-resistant active list,
 * but large enough to avoid thrashing the aggregate readahead window.
 *
 * Both inactive lists should also be large enough that each inactive
 * page has a chance to be referenced again before it is reclaimed.
 *
 * If that fails and refaulting is observed, the inactive list grows.
 *
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
 * on this LRU, maintained by the pageout code.  An inactive_ratio
 * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
 *
 * total     target    max
 * memory    ratio     inactive
 * -------------------------------------
 *   10MB       1         5MB
 *  100MB       1        50MB
 *    1GB       3       250MB
 *   10GB      10       0.9GB
 *  100GB      31         3GB
 *    1TB     101        10GB
 *   10TB     320        32GB
 */
2211static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2212{
2213 enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2214 unsigned long inactive, active;
2215 unsigned long inactive_ratio;
2216 unsigned long gb;
2217
2218 inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2219 active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2220
2221 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2222 if (gb)
2223 inactive_ratio = int_sqrt(10 * gb);
2224 else
2225 inactive_ratio = 1;
2226
2227 return inactive * inactive_ratio < active;
2228}
2229
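/* How get_scan_count() decided to balance anon vs file scanning. */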
2230enum scan_balance {
2231 SCAN_EQUAL,
2232 SCAN_FRACT,
2233 SCAN_ANON,
2234 SCAN_FILE,
2235};

/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned we did rotate back
 * onto the active list instead of evict.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
2246static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2247 unsigned long *nr)
2248{
2249 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2250 unsigned long anon_cost, file_cost, total_cost;
2251 int swappiness = mem_cgroup_swappiness(memcg);
2252 u64 fraction[ANON_AND_FILE];
2253 u64 denominator = 0;
2254 enum scan_balance scan_balance;
2255 unsigned long ap, fp;
2256 enum lru_list lru;
2257
2258
2259 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2260 scan_balance = SCAN_FILE;
2261 goto out;
2262 }

	/*
	 * Global reclaim will swap to prevent OOM even with no
	 * swappiness, but memcg users want to use this knob to
	 * disable swapping for individual groups completely when
	 * using the memory controller's swap limit feature would be
	 * too expensive.
	 */
2271 if (cgroup_reclaim(sc) && !swappiness) {
2272 scan_balance = SCAN_FILE;
2273 goto out;
2274 }

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
2281 if (!sc->priority && swappiness) {
2282 scan_balance = SCAN_EQUAL;
2283 goto out;
2284 }
2285
2286
2287
2288
2289 if (sc->file_is_tiny) {
2290 scan_balance = SCAN_ANON;
2291 goto out;
2292 }

	/*
	 * If there is enough inactive page cache, we do not reclaim
	 * anything from the anonymous working set right now.
	 */
2298 if (sc->cache_trim_mode) {
2299 scan_balance = SCAN_FILE;
2300 goto out;
2301 }
2302
2303 scan_balance = SCAN_FRACT;

	/*
	 * Calculate the pressure balance between anon and file pages.
	 *
	 * The amount of pressure we put on each LRU is inversely
	 * proportional to the cost of reclaiming each list, as
	 * determined by the share of pages that are refaulting, times
	 * the relative IO cost of bringing back a swapped out
	 * anonymous page vs reloading a filesystem page (swappiness).
	 *
	 * Although we limit that influence to ensure no list gets
	 * left behind completely: at least a third of the pressure is
	 * applied, even if it should be much less, because the list
	 * might otherwise never be considered at all.
	 *
	 * With swappiness at 100, anon and file have equal IO cost.
	 */
2319 total_cost = sc->anon_cost + sc->file_cost;
2320 anon_cost = total_cost + sc->anon_cost;
2321 file_cost = total_cost + sc->file_cost;
2322 total_cost = anon_cost + file_cost;
2323
2324 ap = swappiness * (total_cost + 1);
2325 ap /= anon_cost + 1;
2326
2327 fp = (200 - swappiness) * (total_cost + 1);
2328 fp /= file_cost + 1;
2329
2330 fraction[0] = ap;
2331 fraction[1] = fp;
2332 denominator = ap + fp;
2333out:
2334 for_each_evictable_lru(lru) {
2335 int file = is_file_lru(lru);
2336 unsigned long lruvec_size;
2337 unsigned long scan;
2338 unsigned long protection;
2339
2340 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2341 protection = mem_cgroup_protection(sc->target_mem_cgroup,
2342 memcg,
2343 sc->memcg_low_reclaim);
2344
2345 if (protection) {
			/*
			 * Scale reclaim pressure by how much of the
			 * cgroup's memory is within its memory.min or
			 * memory.low protection: the larger the protected
			 * share of its usage, the less of its LRU is
			 * scanned.  Without this the behaviour would be
			 * binary - no scanning below the threshold, full
			 * scanning above it.
			 *
			 * During a memcg_low_reclaim pass, when best-effort
			 * low protection is being overridden, the protection
			 * reflects only memory.min, so better-behaved groups
			 * are still scanned more gently than the rest.
			 */
2375 unsigned long cgroup_size = mem_cgroup_size(memcg);
2376
2377
2378 cgroup_size = max(cgroup_size, protection);
2379
2380 scan = lruvec_size - lruvec_size * protection /
2381 cgroup_size;

			/*
			 * Minimally target SWAP_CLUSTER_MAX pages to keep
			 * reclaim moving forwards, avoiding decrementing
			 * sc->priority further than desirable.
			 */
2388 scan = max(scan, SWAP_CLUSTER_MAX);
2389 } else {
2390 scan = lruvec_size;
2391 }
2392
2393 scan >>= sc->priority;
2394
2395
2396
2397
2398
2399 if (!scan && !mem_cgroup_online(memcg))
2400 scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2401
2402 switch (scan_balance) {
2403 case SCAN_EQUAL:
2404
2405 break;
2406 case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 * Make sure we don't miss the last page on
			 * the offlined memcgs because of a round-off
			 * error.
			 */
2414 scan = mem_cgroup_online(memcg) ?
2415 div64_u64(scan * fraction[file], denominator) :
2416 DIV64_U64_ROUND_UP(scan * fraction[file],
2417 denominator);
2418 break;
2419 case SCAN_FILE:
2420 case SCAN_ANON:
2421
2422 if ((scan_balance == SCAN_FILE) != file)
2423 scan = 0;
2424 break;
2425 default:
2426
2427 BUG();
2428 }
2429
2430 nr[lru] = scan;
2431 }
2432}
2433
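/*
 * Shrink the anon and file LRU lists of one lruvec according to the scan
 * targets computed by get_scan_count(), rebalancing the remaining work
 * proportionally once sc->nr_to_reclaim has been met.
 */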
2434static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2435{
2436 unsigned long nr[NR_LRU_LISTS];
2437 unsigned long targets[NR_LRU_LISTS];
2438 unsigned long nr_to_scan;
2439 enum lru_list lru;
2440 unsigned long nr_reclaimed = 0;
2441 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2442 struct blk_plug plug;
2443 bool scan_adjusted;
2444
2445 get_scan_count(lruvec, sc, nr);

	/* Record the original scan target for proportional adjustments later */
	memcpy(targets, nr, sizeof(nr));

	/*
	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
	 * event that can occur when there is little memory pressure e.g.
	 * multiple streaming readers/writers. Hence, we do not abort scanning
	 * when the requested number of pages are reclaimed when scanning at
	 * DEF_PRIORITY on the assumption that the fact we are direct
	 * reclaiming implies that kswapd is not keeping up and it is best to
	 * do a batch of work at once. For memcg reclaim one check is made to
	 * abort proportional reclaim if either the file or anon lru has
	 * already dropped to zero at the first pass.
	 */
2461 scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
2462 sc->priority == DEF_PRIORITY);
2463
2464 blk_start_plug(&plug);
2465 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2466 nr[LRU_INACTIVE_FILE]) {
2467 unsigned long nr_anon, nr_file, percentage;
2468 unsigned long nr_scanned;
2469
2470 for_each_evictable_lru(lru) {
2471 if (nr[lru]) {
2472 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2473 nr[lru] -= nr_to_scan;
2474
2475 nr_reclaimed += shrink_list(lru, nr_to_scan,
2476 lruvec, sc);
2477 }
2478 }
2479
2480 cond_resched();
2481
2482 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2483 continue;

		/*
		 * We have reclaimed enough.  Instead of aborting, scale the
		 * remaining scan targets so that both LRU types keep being
		 * scanned in proportion to their original targets, and stop
		 * rebalancing after this one adjustment.
		 */
2492 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2493 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

		/*
		 * It's just vindictive to attack the larger once the smaller
		 * has gone to zero.  And given the way we stop scanning the
		 * smaller below, this makes sure that we only make one nudge
		 * towards proportionality once we've got nr_to_reclaim.
		 */
2501 if (!nr_file || !nr_anon)
2502 break;
2503
2504 if (nr_file > nr_anon) {
2505 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2506 targets[LRU_ACTIVE_ANON] + 1;
2507 lru = LRU_BASE;
2508 percentage = nr_anon * 100 / scan_target;
2509 } else {
2510 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2511 targets[LRU_ACTIVE_FILE] + 1;
2512 lru = LRU_FILE;
2513 percentage = nr_file * 100 / scan_target;
2514 }
2515
2516
2517 nr[lru] = 0;
2518 nr[lru + LRU_ACTIVE] = 0;

		/*
		 * Recalculate the other LRU scan count based on its original
		 * scan target and the percentage scanning already complete.
		 */
2524 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2525 nr_scanned = targets[lru] - nr[lru];
2526 nr[lru] = targets[lru] * (100 - percentage) / 100;
2527 nr[lru] -= min(nr[lru], nr_scanned);
2528
2529 lru += LRU_ACTIVE;
2530 nr_scanned = targets[lru] - nr[lru];
2531 nr[lru] = targets[lru] * (100 - percentage) / 100;
2532 nr[lru] -= min(nr[lru], nr_scanned);
2533
2534 scan_adjusted = true;
2535 }
2536 blk_finish_plug(&plug);
2537 sc->nr_reclaimed += nr_reclaimed;

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
2543 if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
2544 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2545 sc, LRU_ACTIVE_ANON);
2546}
2547
2548
2549static bool in_reclaim_compaction(struct scan_control *sc)
2550{
2551 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2552 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2553 sc->priority < DEF_PRIORITY - 2))
2554 return true;
2555
2556 return false;
2557}

/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_pages() that it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
2566static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2567 unsigned long nr_reclaimed,
2568 struct scan_control *sc)
2569{
2570 unsigned long pages_for_compaction;
2571 unsigned long inactive_lru_pages;
2572 int z;
2573
2574
2575 if (!in_reclaim_compaction(sc))
2576 return false;

	/*
	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
	 * number of pages that were scanned. This will return to the caller
	 * with the risk reclaim/compaction and the resulting allocation attempt
	 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
	 * allocations through requiring that the full LRU list has been scanned
	 * first, by assuming that zero delta of sc->nr_scanned means full LRU
	 * scan, but that approximation was wrong, and there were corner cases
	 * where always a non-zero delta of scanned pages was guaranteed anyway.
	 */
2588 if (!nr_reclaimed)
2589 return false;
2590
2591
2592 for (z = 0; z <= sc->reclaim_idx; z++) {
2593 struct zone *zone = &pgdat->node_zones[z];
2594 if (!managed_zone(zone))
2595 continue;
2596
2597 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2598 case COMPACT_SUCCESS:
2599 case COMPACT_CONTINUE:
2600 return false;
2601 default:
2602
2603 ;
2604 }
2605 }

	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are large enough, continue reclaiming.
	 */
2611 pages_for_compaction = compact_gap(sc->order);
2612 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2613 if (get_nr_swap_pages() > 0)
2614 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2615
2616 return inactive_lru_pages > pages_for_compaction;
2617}
2618
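/*
 * Walk every memory cgroup in the reclaim target's subtree and shrink its
 * lruvec and slab caches, honouring memory.min/memory.low protection.
 */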
2619static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
2620{
2621 struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
2622 struct mem_cgroup *memcg;
2623
2624 memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
2625 do {
2626 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
2627 unsigned long reclaimed;
2628 unsigned long scanned;

		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
2636 cond_resched();
2637
2638 mem_cgroup_calculate_protection(target_memcg, memcg);
2639
2640 if (mem_cgroup_below_min(memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
2645 continue;
2646 } else if (mem_cgroup_below_low(memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
2653 if (!sc->memcg_low_reclaim) {
2654 sc->memcg_low_skipped = 1;
2655 continue;
2656 }
2657 memcg_memory_event(memcg, MEMCG_LOW);
2658 }
2659
2660 reclaimed = sc->nr_reclaimed;
2661 scanned = sc->nr_scanned;
2662
2663 shrink_lruvec(lruvec, sc);
2664
2665 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
2666 sc->priority);
2667
2668
2669 vmpressure(sc->gfp_mask, memcg, false,
2670 sc->nr_scanned - scanned,
2671 sc->nr_reclaimed - reclaimed);
2672
2673 } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
2674}
2675
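/*
 * Reclaim pages from one node, repeating while should_continue_reclaim()
 * says reclaim/compaction still needs more progress.
 */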
2676static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2677{
2678 struct reclaim_state *reclaim_state = current->reclaim_state;
2679 unsigned long nr_reclaimed, nr_scanned;
2680 struct lruvec *target_lruvec;
2681 bool reclaimable = false;
2682 unsigned long file;
2683
2684 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
2685
2686again:
2687 memset(&sc->nr, 0, sizeof(sc->nr));
2688
2689 nr_reclaimed = sc->nr_reclaimed;
2690 nr_scanned = sc->nr_scanned;

	/*
	 * Determine the scan balance between anon and file LRUs.
	 */
2695 spin_lock_irq(&target_lruvec->lru_lock);
2696 sc->anon_cost = target_lruvec->anon_cost;
2697 sc->file_cost = target_lruvec->file_cost;
2698 spin_unlock_irq(&target_lruvec->lru_lock);
2699
2700
2701
2702
2703
2704 if (!sc->force_deactivate) {
2705 unsigned long refaults;
2706
2707 refaults = lruvec_page_state(target_lruvec,
2708 WORKINGSET_ACTIVATE_ANON);
2709 if (refaults != target_lruvec->refaults[0] ||
2710 inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
2711 sc->may_deactivate |= DEACTIVATE_ANON;
2712 else
2713 sc->may_deactivate &= ~DEACTIVATE_ANON;

		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
2720 refaults = lruvec_page_state(target_lruvec,
2721 WORKINGSET_ACTIVATE_FILE);
2722 if (refaults != target_lruvec->refaults[1] ||
2723 inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
2724 sc->may_deactivate |= DEACTIVATE_FILE;
2725 else
2726 sc->may_deactivate &= ~DEACTIVATE_FILE;
2727 } else
2728 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

	/*
	 * If we have plenty of inactive file pages that aren't
	 * thrashing, try to reclaim those first before touching
	 * anonymous pages.
	 */
2735 file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
2736 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
2737 sc->cache_trim_mode = 1;
2738 else
2739 sc->cache_trim_mode = 0;

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
2750 if (!cgroup_reclaim(sc)) {
2751 unsigned long total_high_wmark = 0;
2752 unsigned long free, anon;
2753 int z;
2754
2755 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2756 file = node_page_state(pgdat, NR_ACTIVE_FILE) +
2757 node_page_state(pgdat, NR_INACTIVE_FILE);
2758
2759 for (z = 0; z < MAX_NR_ZONES; z++) {
2760 struct zone *zone = &pgdat->node_zones[z];
2761 if (!managed_zone(zone))
2762 continue;
2763
2764 total_high_wmark += high_wmark_pages(zone);
2765 }

		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
2772 anon = node_page_state(pgdat, NR_INACTIVE_ANON);
2773
2774 sc->file_is_tiny =
2775 file + free <= total_high_wmark &&
2776 !(sc->may_deactivate & DEACTIVATE_ANON) &&
2777 anon >> sc->priority;
2778 }
2779
2780 shrink_node_memcgs(pgdat, sc);
2781
2782 if (reclaim_state) {
2783 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2784 reclaim_state->reclaimed_slab = 0;
2785 }
2786
2787
2788 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2789 sc->nr_scanned - nr_scanned,
2790 sc->nr_reclaimed - nr_reclaimed);
2791
2792 if (sc->nr_reclaimed - nr_reclaimed)
2793 reclaimable = true;
2794
2795 if (current_is_kswapd()) {
		/*
		 * If reclaim is isolating dirty pages under writeback,
		 * it implies that the long-lived page allocation rate
		 * is exceeding the page laundering rate. Either the
		 * global limits are not being effective at throttling
		 * processes due to the page distribution throughout
		 * zones or there is heavy usage of a slow backing
		 * device. The only option is to throttle from reclaim
		 * context which is not ideal as there is no guarantee
		 * the dirtying process is throttled in the same way
		 * balance_dirty_pages() manages.
		 *
		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
		 * count the number of pages under pages flagged for
		 * immediate reclaim and stall if any are encountered
		 * in the nr_immediate check below.
		 */
2813 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
2814 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
2815
2816
2817 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
2818 set_bit(PGDAT_DIRTY, &pgdat->flags);

		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it
		 * implies that pages are cycling through the LRU
		 * faster than they are written so also forcibly stall.
		 */
2826 if (sc->nr.immediate)
2827 congestion_wait(BLK_RW_ASYNC, HZ/10);
2828 }

	/*
	 * Tag a node/memcg as congested if all the dirty pages
	 * scanned were backed by a congested BDI and
	 * wait_iff_congested will stall.
	 *
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling in wait_iff_congested().
	 */
2838 if ((current_is_kswapd() ||
2839 (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
2840 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2841 set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

	/*
	 * Stall direct reclaim for IO completions if underlying BDIs
	 * and node is congested. Allow kswapd to continue until it
	 * starts encountering unqueued dirty pages or cycling through
	 * the LRU too quickly.
	 */
2849 if (!current_is_kswapd() && current_may_throttle() &&
2850 !sc->hibernation_mode &&
2851 test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
2852 wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2853
2854 if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2855 sc))
2856 goto again;

	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
2864 if (reclaimable)
2865 pgdat->kswapd_failures = 0;
2866}

/*
 * Returns true if compaction should go ahead for a costly-order request, or
 * the allocation would already succeed without compaction. Return false if
 * we should reclaim first.
 */
2873static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2874{
2875 unsigned long watermark;
2876 enum compact_result suitable;
2877
2878 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2879 if (suitable == COMPACT_SUCCESS)
2880
2881 return true;
2882 if (suitable == COMPACT_SKIPPED)
2883
2884 return false;

	/*
	 * Compaction is already possible, but it takes time to run and there
	 * are potentially other callers using the pages just freed. So proceed
	 * with reclaim to make a buffer of free pages available to give
	 * compaction a reasonable chance of completing and allocating the page.
	 * Note that we won't actually reclaim the whole buffer in one attempt
	 * as the target watermark in should_continue_reclaim() is lower. But if
	 * we are already above the high+gap watermark, don't reclaim at all.
	 */
2895 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
2896
2897 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2898}

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
2908static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2909{
2910 struct zoneref *z;
2911 struct zone *zone;
2912 unsigned long nr_soft_reclaimed;
2913 unsigned long nr_soft_scanned;
2914 gfp_t orig_mask;
2915 pg_data_t *last_pgdat = NULL;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads.
	 */
2922 orig_mask = sc->gfp_mask;
2923 if (buffer_heads_over_limit) {
2924 sc->gfp_mask |= __GFP_HIGHMEM;
2925 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2926 }
2927
2928 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2929 sc->reclaim_idx, sc->nodemask) {
		/*
		 * Take care that memory controller reclaiming has small
		 * influence on the global LRU.
		 */
2934 if (!cgroup_reclaim(sc)) {
2935 if (!cpuset_zone_allowed(zone,
2936 GFP_KERNEL | __GFP_HARDWALL))
2937 continue;

			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
2948 if (IS_ENABLED(CONFIG_COMPACTION) &&
2949 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2950 compaction_ready(zone, sc)) {
2951 sc->compaction_ready = true;
2952 continue;
2953 }

			/*
			 * Shrink each node in the zonelist once. If the
			 * zonelist is ordered by zone (not the default) then a
			 * node may be shrunk multiple times but in that case
			 * the user prefers lower zones being preserved.
			 */
2961 if (zone->zone_pgdat == last_pgdat)
2962 continue;

			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global memory pressure
			 * and balancing, not for a memcg's limit.
			 */
2970 nr_soft_scanned = 0;
2971 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2972 sc->order, sc->gfp_mask,
2973 &nr_soft_scanned);
2974 sc->nr_reclaimed += nr_soft_reclaimed;
2975 sc->nr_scanned += nr_soft_scanned;
2976
2977 }
2978
2979
2980 if (zone->zone_pgdat == last_pgdat)
2981 continue;
2982 last_pgdat = zone->zone_pgdat;
2983 shrink_node(zone->zone_pgdat, sc);
2984 }

	/*
	 * Restore to original mask to avoid the impact on the caller if we
	 * promoted it to __GFP_HIGHMEM.
	 */
2990 sc->gfp_mask = orig_mask;
2991}
2992
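/*
 * Remember the workingset activation counts at the end of a reclaim cycle so
 * the next cycle can detect new refault activity (see shrink_node()).
 */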
2993static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
2994{
2995 struct lruvec *target_lruvec;
2996 unsigned long refaults;
2997
2998 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
2999 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
3000 target_lruvec->refaults[0] = refaults;
3001 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
3002 target_lruvec->refaults[1] = refaults;
3003}
3004
3020
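/*
 * Main entry point for direct page reclaim: call shrink_zones() with
 * increasing scan pressure until enough pages have been reclaimed, the
 * zones are ready for compaction, or the lowest priority is exhausted.
 * Returns the number of pages reclaimed, 1 if reclaim stopped because
 * compaction can proceed, and 0 on complete failure (possibly after
 * retrying with forced deactivation or cgroup memory.low reserves).
 */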
3021static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3022 struct scan_control *sc)
3023{
3024 int initial_priority = sc->priority;
3025 pg_data_t *last_pgdat;
3026 struct zoneref *z;
3027 struct zone *zone;
3028retry:
3029 delayacct_freepages_start();
3030
3031 if (!cgroup_reclaim(sc))
3032 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3033
3034 do {
3035 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3036 sc->priority);
3037 sc->nr_scanned = 0;
3038 shrink_zones(zonelist, sc);
3039
3040 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3041 break;
3042
3043 if (sc->compaction_ready)
3044 break;
3045
3046
3047
3048
3049
3050 if (sc->priority < DEF_PRIORITY - 2)
3051 sc->may_writepage = 1;
3052 } while (--sc->priority >= 0);
3053
3054 last_pgdat = NULL;
3055 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3056 sc->nodemask) {
3057 if (zone->zone_pgdat == last_pgdat)
3058 continue;
3059 last_pgdat = zone->zone_pgdat;
3060
3061 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3062
3063 if (cgroup_reclaim(sc)) {
3064 struct lruvec *lruvec;
3065
3066 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
3067 zone->zone_pgdat);
3068 clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3069 }
3070 }
3071
3072 delayacct_freepages_end();
3073
3074 if (sc->nr_reclaimed)
3075 return sc->nr_reclaimed;
3076
3077
3078 if (sc->compaction_ready)
3079 return 1;
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
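/*
 * The aging decisions above were based on the node-wide memory
 * composition; a restrictive reclaim_idx or memory.low protection may
 * have caused deactivation to be skipped. Retry once with deactivation
 * forced before reporting failure.
 */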
3090 if (sc->skipped_deactivate) {
3091 sc->priority = initial_priority;
3092 sc->force_deactivate = 1;
3093 sc->skipped_deactivate = 0;
3094 goto retry;
3095 }
3096
3097
3098 if (sc->memcg_low_skipped) {
3099 sc->priority = initial_priority;
3100 sc->force_deactivate = 0;
3101 sc->memcg_low_reclaim = 1;
3102 sc->memcg_low_skipped = 0;
3103 goto retry;
3104 }
3105
3106 return 0;
3107}
3108
3109static bool allow_direct_reclaim(pg_data_t *pgdat)
3110{
3111 struct zone *zone;
3112 unsigned long pfmemalloc_reserve = 0;
3113 unsigned long free_pages = 0;
3114 int i;
3115 bool wmark_ok;
3116
3117 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3118 return true;
3119
3120 for (i = 0; i <= ZONE_NORMAL; i++) {
3121 zone = &pgdat->node_zones[i];
3122 if (!managed_zone(zone))
3123 continue;
3124
3125 if (!zone_reclaimable_pages(zone))
3126 continue;
3127
3128 pfmemalloc_reserve += min_wmark_pages(zone);
3129 free_pages += zone_page_state(zone, NR_FREE_PAGES);
3130 }
3131
3132
3133 if (!pfmemalloc_reserve)
3134 return true;
3135
3136 wmark_ok = free_pages > pfmemalloc_reserve / 2;
3137
3138
3139 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3140 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
3141 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
3142
3143 wake_up_interruptible(&pgdat->kswapd_wait);
3144 }
3145
3146 return wmark_ok;
3147}
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
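/*
 * Throttle direct reclaimers when the pfmemalloc reserves of the first
 * usable node are getting dangerously depleted; kswapd keeps making
 * progress and wakes them once allow_direct_reclaim() holds again.
 *
 * Returns true if a fatal signal was received while throttling, in which
 * case the caller should not go on to trigger the OOM killer.
 */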
3158static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3159 nodemask_t *nodemask)
3160{
3161 struct zoneref *z;
3162 struct zone *zone;
3163 pg_data_t *pgdat = NULL;
3164
3165
3166
3167
3168
3169
3170
3171
3172 if (current->flags & PF_KTHREAD)
3173 goto out;
3174
3175
3176
3177
3178
3179 if (fatal_signal_pending(current))
3180 goto out;
3181
3195
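/*
 * Base the throttling decision on the first usable node at or below
 * ZONE_NORMAL in the zonelist; throttled tasks sleep on that node's
 * pfmemalloc_wait queue until kswapd restores the reserves.
 */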
3196 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3197 gfp_zone(gfp_mask), nodemask) {
3198 if (zone_idx(zone) > ZONE_NORMAL)
3199 continue;
3200
3201
3202 pgdat = zone->zone_pgdat;
3203 if (allow_direct_reclaim(pgdat))
3204 goto out;
3205 break;
3206 }
3207
3208
3209 if (!pgdat)
3210 goto out;
3211
3212
3213 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223 if (!(gfp_mask & __GFP_FS)) {
3224 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3225 allow_direct_reclaim(pgdat), HZ);
3226
3227 goto check_pending;
3228 }
3229
3230
3231 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3232 allow_direct_reclaim(pgdat));
3233
3234check_pending:
3235 if (fatal_signal_pending(current))
3236 return true;
3237
3238out:
3239 return false;
3240}
3241
3242unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3243 gfp_t gfp_mask, nodemask_t *nodemask)
3244{
3245 unsigned long nr_reclaimed;
3246 struct scan_control sc = {
3247 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3248 .gfp_mask = current_gfp_context(gfp_mask),
3249 .reclaim_idx = gfp_zone(gfp_mask),
3250 .order = order,
3251 .nodemask = nodemask,
3252 .priority = DEF_PRIORITY,
3253 .may_writepage = !laptop_mode,
3254 .may_unmap = 1,
3255 .may_swap = 1,
3256 };
3257
3258
3259
3260
3261
3262 BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3263 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3264 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3265
3266
3267
3268
3269
3270
3271 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3272 return 1;
3273
3274 set_task_reclaim_state(current, &sc.reclaim_state);
3275 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
3276
3277 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3278
3279 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3280 set_task_reclaim_state(current, NULL);
3281
3282 return nr_reclaimed;
3283}
3284
3285#ifdef CONFIG_MEMCG
3286
3287
3288unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3289 gfp_t gfp_mask, bool noswap,
3290 pg_data_t *pgdat,
3291 unsigned long *nr_scanned)
3292{
3293 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3294 struct scan_control sc = {
3295 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3296 .target_mem_cgroup = memcg,
3297 .may_writepage = !laptop_mode,
3298 .may_unmap = 1,
3299 .reclaim_idx = MAX_NR_ZONES - 1,
3300 .may_swap = !noswap,
3301 };
3302
3303 WARN_ON_ONCE(!current->reclaim_state);
3304
3305 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3306 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3307
3308 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3309 sc.gfp_mask);
3310
3311
3312
3313
3314
3315
3316
3317
3318 shrink_lruvec(lruvec, &sc);
3319
3320 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3321
3322 *nr_scanned = sc.nr_scanned;
3323
3324 return sc.nr_reclaimed;
3325}
3326
3327unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3328 unsigned long nr_pages,
3329 gfp_t gfp_mask,
3330 bool may_swap)
3331{
3332 unsigned long nr_reclaimed;
3333 unsigned int noreclaim_flag;
3334 struct scan_control sc = {
3335 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3336 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3337 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3338 .reclaim_idx = MAX_NR_ZONES - 1,
3339 .target_mem_cgroup = memcg,
3340 .priority = DEF_PRIORITY,
3341 .may_writepage = !laptop_mode,
3342 .may_unmap = 1,
3343 .may_swap = may_swap,
3344 };
3345
3346
3347
3348
3349
3350 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3351
3352 set_task_reclaim_state(current, &sc.reclaim_state);
3353 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
3354 noreclaim_flag = memalloc_noreclaim_save();
3355
3356 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3357
3358 memalloc_noreclaim_restore(noreclaim_flag);
3359 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3360 set_task_reclaim_state(current, NULL);
3361
3362 return nr_reclaimed;
3363}
3364#endif
3365
3366static void age_active_anon(struct pglist_data *pgdat,
3367 struct scan_control *sc)
3368{
3369 struct mem_cgroup *memcg;
3370 struct lruvec *lruvec;
3371
3372 if (!total_swap_pages)
3373 return;
3374
3375 lruvec = mem_cgroup_lruvec(NULL, pgdat);
3376 if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
3377 return;
3378
3379 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3380 do {
3381 lruvec = mem_cgroup_lruvec(memcg, pgdat);
3382 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3383 sc, LRU_ACTIVE_ANON);
3384 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3385 } while (memcg);
3386}
3387
3388static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
3389{
3390 int i;
3391 struct zone *zone;
3392
3393
3394
3395
3396
3397
3398
3399
3400 for (i = highest_zoneidx; i >= 0; i--) {
3401 zone = pgdat->node_zones + i;
3402 if (!managed_zone(zone))
3403 continue;
3404
3405 if (zone->watermark_boost)
3406 return true;
3407 }
3408
3409 return false;
3410}
3411
3412
3413
3414
3415
3416static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
3417{
3418 int i;
3419 unsigned long mark = -1;
3420 struct zone *zone;
3421
3422
3423
3424
3425
3426 for (i = 0; i <= highest_zoneidx; i++) {
3427 zone = pgdat->node_zones + i;
3428
3429 if (!managed_zone(zone))
3430 continue;
3431
3432 mark = high_wmark_pages(zone);
3433 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
3434 return true;
3435 }
3436
3437
3438
3439
3440
3441
3442 if (mark == -1)
3443 return true;
3444
3445 return false;
3446}
3447
3448
3449static void clear_pgdat_congested(pg_data_t *pgdat)
3450{
3451 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
3452
3453 clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3454 clear_bit(PGDAT_DIRTY, &pgdat->flags);
3455 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3456}
3457
3458
3459
3460
3461
3462
3463
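/*
 * Prepare kswapd for sleeping: wake any direct reclaimers still throttled
 * on pfmemalloc_wait, then report whether the node is balanced (or reclaim
 * has failed too many times) so that kswapd may actually sleep.
 */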
3464static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
3465 int highest_zoneidx)
3466{
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480 if (waitqueue_active(&pgdat->pfmemalloc_wait))
3481 wake_up_all(&pgdat->pfmemalloc_wait);
3482
3483
3484 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3485 return true;
3486
3487 if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
3488 clear_pgdat_congested(pgdat);
3489 return true;
3490 }
3491
3492 return false;
3493}
3494
3495
3496
3497
3498
3499
3500
3501
3502
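/*
 * kswapd reclaims from all zones at or below the highest usable zone that
 * is currently unbalanced, aiming for each zone's high watermark. Returns
 * true if at least the requested number of pages was scanned, which tells
 * the caller that raising the scan priority is unnecessary.
 */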
3503static bool kswapd_shrink_node(pg_data_t *pgdat,
3504 struct scan_control *sc)
3505{
3506 struct zone *zone;
3507 int z;
3508
3509
3510 sc->nr_to_reclaim = 0;
3511 for (z = 0; z <= sc->reclaim_idx; z++) {
3512 zone = pgdat->node_zones + z;
3513 if (!managed_zone(zone))
3514 continue;
3515
3516 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3517 }
3518
3519
3520
3521
3522
3523 shrink_node(pgdat, sc);
3524
3525
3526
3527
3528
3529
3530
3531
3532 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3533 sc->order = 0;
3534
3535 return sc->nr_scanned >= sc->nr_to_reclaim;
3536}
3537
3550
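/*
 * For kswapd, balance_pgdat() reclaims pages across the node until at
 * least one eligible zone meets its high watermark, handling watermark
 * boosts with a lighter-weight reclaim pass first. Returns the order
 * kswapd finished reclaiming at.
 */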
3551static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
3552{
3553 int i;
3554 unsigned long nr_soft_reclaimed;
3555 unsigned long nr_soft_scanned;
3556 unsigned long pflags;
3557 unsigned long nr_boost_reclaim;
3558 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
3559 bool boosted;
3560 struct zone *zone;
3561 struct scan_control sc = {
3562 .gfp_mask = GFP_KERNEL,
3563 .order = order,
3564 .may_unmap = 1,
3565 };
3566
3567 set_task_reclaim_state(current, &sc.reclaim_state);
3568 psi_memstall_enter(&pflags);
3569 __fs_reclaim_acquire();
3570
3571 count_vm_event(PAGEOUTRUN);
3572
3573
3574
3575
3576
3577
3578 nr_boost_reclaim = 0;
3579 for (i = 0; i <= highest_zoneidx; i++) {
3580 zone = pgdat->node_zones + i;
3581 if (!managed_zone(zone))
3582 continue;
3583
3584 nr_boost_reclaim += zone->watermark_boost;
3585 zone_boosts[i] = zone->watermark_boost;
3586 }
3587 boosted = nr_boost_reclaim;
3588
3589restart:
3590 sc.priority = DEF_PRIORITY;
3591 do {
3592 unsigned long nr_reclaimed = sc.nr_reclaimed;
3593 bool raise_priority = true;
3594 bool balanced;
3595 bool ret;
3596
3597 sc.reclaim_idx = highest_zoneidx;
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609 if (buffer_heads_over_limit) {
3610 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3611 zone = pgdat->node_zones + i;
3612 if (!managed_zone(zone))
3613 continue;
3614
3615 sc.reclaim_idx = i;
3616 break;
3617 }
3618 }
3619
3620
3621
3622
3623
3624
3625
3626
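/*
 * If the node is genuinely imbalanced, forget the boost-driven reclaim
 * target and restart with normal reclaim pressure; if it is balanced and
 * no boost remains, balancing is complete.
 */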
3627 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
3628 if (!balanced && nr_boost_reclaim) {
3629 nr_boost_reclaim = 0;
3630 goto restart;
3631 }
3632
3633
3634
3635
3636
3637
3638 if (!nr_boost_reclaim && balanced)
3639 goto out;
3640
3641
3642 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
3643 raise_priority = false;
3644
3645
3646
3647
3648
3649
3650
3651 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
3652 sc.may_swap = !nr_boost_reclaim;
3653
3654
3655
3656
3657
3658
3659
3660 age_active_anon(pgdat, &sc);
3661
3662
3663
3664
3665
3666 if (sc.priority < DEF_PRIORITY - 2)
3667 sc.may_writepage = 1;
3668
3669
3670 sc.nr_scanned = 0;
3671 nr_soft_scanned = 0;
3672 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3673 sc.gfp_mask, &nr_soft_scanned);
3674 sc.nr_reclaimed += nr_soft_reclaimed;
3675
3676
3677
3678
3679
3680
3681 if (kswapd_shrink_node(pgdat, &sc))
3682 raise_priority = false;
3683
3684
3685
3686
3687
3688
3689 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3690 allow_direct_reclaim(pgdat))
3691 wake_up_all(&pgdat->pfmemalloc_wait);
3692
3693
3694 __fs_reclaim_release();
3695 ret = try_to_freeze();
3696 __fs_reclaim_acquire();
3697 if (ret || kthread_should_stop())
3698 break;
3699
3700
3701
3702
3703
3704 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3705 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
3706
3707
3708
3709
3710
3711
3712 if (nr_boost_reclaim && !nr_reclaimed)
3713 break;
3714
3715 if (raise_priority || !nr_reclaimed)
3716 sc.priority--;
3717 } while (sc.priority >= 1);
3718
3719 if (!sc.nr_reclaimed)
3720 pgdat->kswapd_failures++;
3721
3722out:
3723
3724 if (boosted) {
3725 unsigned long flags;
3726
3727 for (i = 0; i <= highest_zoneidx; i++) {
3728 if (!zone_boosts[i])
3729 continue;
3730
3731
3732 zone = pgdat->node_zones + i;
3733 spin_lock_irqsave(&zone->lock, flags);
3734 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
3735 spin_unlock_irqrestore(&zone->lock, flags);
3736 }
3737
3738
3739
3740
3741
3742 wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
3743 }
3744
3745 snapshot_refaults(NULL, pgdat);
3746 __fs_reclaim_release();
3747 psi_memstall_leave(&pflags);
3748 set_task_reclaim_state(current, NULL);
3749
3750
3751
3752
3753
3754
3755
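/*
 * Return the order kswapd stopped reclaiming at, so the caller can decide
 * which order to sleep and wake for.
 */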
3756 return sc.order;
3757}
3758
3759
3760
3761
3762
3763
3764
3765
3766static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
3767 enum zone_type prev_highest_zoneidx)
3768{
3769 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
3770
3771 return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
3772}
3773
3774static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3775 unsigned int highest_zoneidx)
3776{
3777 long remaining = 0;
3778 DEFINE_WAIT(wait);
3779
3780 if (freezing(current) || kthread_should_stop())
3781 return;
3782
3783 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3784
3785
3786
3787
3788
3789
3790
3791
3792 if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
3793
3794
3795
3796
3797
3798
3799 reset_isolation_suitable(pgdat);
3800
3801
3802
3803
3804
3805 wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
3806
3807 remaining = schedule_timeout(HZ/10);
3808
3809
3810
3811
3812
3813
3814 if (remaining) {
3815 WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
3816 kswapd_highest_zoneidx(pgdat,
3817 highest_zoneidx));
3818
3819 if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
3820 WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
3821 }
3822
3823 finish_wait(&pgdat->kswapd_wait, &wait);
3824 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3825 }
3826
3827
3828
3829
3830
3831 if (!remaining &&
3832 prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
3833 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3844
3845 if (!kthread_should_stop())
3846 schedule();
3847
3848 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3849 } else {
3850 if (remaining)
3851 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3852 else
3853 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3854 }
3855 finish_wait(&pgdat->kswapd_wait, &wait);
3856}
3857
3870
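/*
 * The background pageout daemon, one kernel thread per NUMA node. It
 * trickles out pages so that some memory stays free even when nothing
 * else is freeing any, and sleeps whenever its node is balanced.
 */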
3871static int kswapd(void *p)
3872{
3873 unsigned int alloc_order, reclaim_order;
3874 unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
3875 pg_data_t *pgdat = (pg_data_t*)p;
3876 struct task_struct *tsk = current;
3877 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3878
3879 if (!cpumask_empty(cpumask))
3880 set_cpus_allowed_ptr(tsk, cpumask);
3881
3893
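/*
 * PF_MEMALLOC lets kswapd dip into reserves when a small allocation is
 * needed in order to free memory and keeps it from recursing into
 * reclaim; PF_SWAPWRITE allows writing to swap and PF_KSWAPD identifies
 * the thread to the rest of the reclaim code.
 */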
3894 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3895 set_freezable();
3896
3897 WRITE_ONCE(pgdat->kswapd_order, 0);
3898 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
3899 for ( ; ; ) {
3900 bool ret;
3901
3902 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
3903 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
3904 highest_zoneidx);
3905
3906kswapd_try_sleep:
3907 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3908 highest_zoneidx);
3909
3910
3911 alloc_order = READ_ONCE(pgdat->kswapd_order);
3912 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
3913 highest_zoneidx);
3914 WRITE_ONCE(pgdat->kswapd_order, 0);
3915 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
3916
3917 ret = try_to_freeze();
3918 if (kthread_should_stop())
3919 break;
3920
3921
3922
3923
3924
3925 if (ret)
3926 continue;
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
3937 alloc_order);
3938 reclaim_order = balance_pgdat(pgdat, alloc_order,
3939 highest_zoneidx);
3940 if (reclaim_order < alloc_order)
3941 goto kswapd_try_sleep;
3942 }
3943
3944 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3945
3946 return 0;
3947}
3948
3949
3950
3951
3952
3953
3954
3955
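/*
 * A zone is low on free memory or too fragmented for a high-order
 * allocation. Wake kswapd for the zone's pgdat if reclaim is warranted;
 * if kswapd has repeatedly failed or the node is already balanced, wake
 * kcompactd instead when only compaction is needed.
 */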
3956void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3957 enum zone_type highest_zoneidx)
3958{
3959 pg_data_t *pgdat;
3960 enum zone_type curr_idx;
3961
3962 if (!managed_zone(zone))
3963 return;
3964
3965 if (!cpuset_zone_allowed(zone, gfp_flags))
3966 return;
3967
3968 pgdat = zone->zone_pgdat;
3969 curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
3970
3971 if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
3972 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
3973
3974 if (READ_ONCE(pgdat->kswapd_order) < order)
3975 WRITE_ONCE(pgdat->kswapd_order, order);
3976
3977 if (!waitqueue_active(&pgdat->kswapd_wait))
3978 return;
3979
3980
3981 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3982 (pgdat_balanced(pgdat, order, highest_zoneidx) &&
3983 !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
3984
3985
3986
3987
3988
3989
3990
3991 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3992 wakeup_kcompactd(pgdat, order, highest_zoneidx);
3993 return;
3994 }
3995
3996 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
3997 gfp_flags);
3998 wake_up_interruptible(&pgdat->kswapd_wait);
3999}
4000
4001#ifdef CONFIG_HIBERNATION
4002
4003
4004
4005
4006
4007
4008
4009
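/*
 * Try to free `nr_to_reclaim' pages system-wide and return the number of
 * pages actually freed. Used by hibernation to shrink memory before the
 * suspend image is created.
 */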
4010unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
4011{
4012 struct scan_control sc = {
4013 .nr_to_reclaim = nr_to_reclaim,
4014 .gfp_mask = GFP_HIGHUSER_MOVABLE,
4015 .reclaim_idx = MAX_NR_ZONES - 1,
4016 .priority = DEF_PRIORITY,
4017 .may_writepage = 1,
4018 .may_unmap = 1,
4019 .may_swap = 1,
4020 .hibernation_mode = 1,
4021 };
4022 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4023 unsigned long nr_reclaimed;
4024 unsigned int noreclaim_flag;
4025
4026 fs_reclaim_acquire(sc.gfp_mask);
4027 noreclaim_flag = memalloc_noreclaim_save();
4028 set_task_reclaim_state(current, &sc.reclaim_state);
4029
4030 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4031
4032 set_task_reclaim_state(current, NULL);
4033 memalloc_noreclaim_restore(noreclaim_flag);
4034 fs_reclaim_release(sc.gfp_mask);
4035
4036 return nr_reclaimed;
4037}
4038#endif
4039
4040
4041
4042
4043
4044int kswapd_run(int nid)
4045{
4046 pg_data_t *pgdat = NODE_DATA(nid);
4047 int ret = 0;
4048
4049 if (pgdat->kswapd)
4050 return 0;
4051
4052 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
4053 if (IS_ERR(pgdat->kswapd)) {
4054
4055 BUG_ON(system_state < SYSTEM_RUNNING);
4056 pr_err("Failed to start kswapd on node %d\n", nid);
4057 ret = PTR_ERR(pgdat->kswapd);
4058 pgdat->kswapd = NULL;
4059 }
4060 return ret;
4061}
4062
4063
4064
4065
4066
4067void kswapd_stop(int nid)
4068{
4069 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4070
4071 if (kswapd) {
4072 kthread_stop(kswapd);
4073 NODE_DATA(nid)->kswapd = NULL;
4074 }
4075}
4076
4077static int __init kswapd_init(void)
4078{
4079 int nid;
4080
4081 swap_setup();
4082 for_each_node_state(nid, N_MEMORY)
4083 kswapd_run(nid);
4084 return 0;
4085}
4086
4087module_init(kswapd_init)
4088
4089#ifdef CONFIG_NUMA
4090
4091
4092
4093
4094
4095
4096int node_reclaim_mode __read_mostly;
4097
4098#define RECLAIM_WRITE (1<<0)
4099#define RECLAIM_UNMAP (1<<1)
4100
4101
4102
4103
4104
4105
4106#define NODE_RECLAIM_PRIORITY 4
4107
4108
4109
4110
4111
4112int sysctl_min_unmapped_ratio = 1;
4113
4114
4115
4116
4117
4118int sysctl_min_slab_ratio = 5;
4119
4120static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4121{
4122 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4123 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4124 node_page_state(pgdat, NR_ACTIVE_FILE);
4125
4126
4127
4128
4129
4130
4131 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4132}
4133
4134
4135static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4136{
4137 unsigned long nr_pagecache_reclaimable;
4138 unsigned long delta = 0;
4139
4140
4141
4142
4143
4144
4145
4146 if (node_reclaim_mode & RECLAIM_UNMAP)
4147 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4148 else
4149 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4150
4151
4152 if (!(node_reclaim_mode & RECLAIM_WRITE))
4153 delta += node_page_state(pgdat, NR_FILE_DIRTY);
4154
4155
4156 if (unlikely(delta > nr_pagecache_reclaimable))
4157 delta = nr_pagecache_reclaimable;
4158
4159 return nr_pagecache_reclaimable - delta;
4160}
4161
4162
4163
4164
4165static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4166{
4167
4168 const unsigned long nr_pages = 1 << order;
4169 struct task_struct *p = current;
4170 unsigned int noreclaim_flag;
4171 struct scan_control sc = {
4172 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4173 .gfp_mask = current_gfp_context(gfp_mask),
4174 .order = order,
4175 .priority = NODE_RECLAIM_PRIORITY,
4176 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4177 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4178 .may_swap = 1,
4179 .reclaim_idx = gfp_zone(gfp_mask),
4180 };
4181
4182 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4183 sc.gfp_mask);
4184
4185 cond_resched();
4186 fs_reclaim_acquire(sc.gfp_mask);
4187
4188
4189
4190
4191
4192 noreclaim_flag = memalloc_noreclaim_save();
4193 p->flags |= PF_SWAPWRITE;
4194 set_task_reclaim_state(p, &sc.reclaim_state);
4195
4196 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4197
4198
4199
4200
4201 do {
4202 shrink_node(pgdat, &sc);
4203 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4204 }
4205
4206 set_task_reclaim_state(p, NULL);
4207 current->flags &= ~PF_SWAPWRITE;
4208 memalloc_noreclaim_restore(noreclaim_flag);
4209 fs_reclaim_release(sc.gfp_mask);
4210
4211 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4212
4213 return sc.nr_reclaimed >= nr_pages;
4214}
4215
4216int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4217{
4218 int ret;
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4231 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
4232 pgdat->min_slab_pages)
4233 return NODE_RECLAIM_FULL;
4234
4235
4236
4237
4238 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4239 return NODE_RECLAIM_NOSCAN;
4240
4241
4242
4243
4244
4245
4246
4247 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4248 return NODE_RECLAIM_NOSCAN;
4249
4250 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4251 return NODE_RECLAIM_NOSCAN;
4252
4253 ret = __node_reclaim(pgdat, gfp_mask, order);
4254 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4255
4256 if (!ret)
4257 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4258
4259 return ret;
4260}
4261#endif
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
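/*
 * check_move_unevictable_pages - scan the pages in a pagevec and move any
 * that have become evictable from the unevictable LRU to the appropriate
 * evictable LRU list. Only intended for LRU pages.
 */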
4272void check_move_unevictable_pages(struct pagevec *pvec)
4273{
4274 struct lruvec *lruvec = NULL;
4275 int pgscanned = 0;
4276 int pgrescued = 0;
4277 int i;
4278
4279 for (i = 0; i < pvec->nr; i++) {
4280 struct page *page = pvec->pages[i];
4281 int nr_pages;
4282
4283 if (PageTransTail(page))
4284 continue;
4285
4286 nr_pages = thp_nr_pages(page);
4287 pgscanned += nr_pages;
4288
4289
4290 if (!TestClearPageLRU(page))
4291 continue;
4292
4293 lruvec = relock_page_lruvec_irq(page, lruvec);
4294 if (page_evictable(page) && PageUnevictable(page)) {
4295 enum lru_list lru = page_lru_base_type(page);
4296
4297 VM_BUG_ON_PAGE(PageActive(page), page);
4298 ClearPageUnevictable(page);
4299 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
4300 add_page_to_lru_list(page, lruvec, lru);
4301 pgrescued += nr_pages;
4302 }
4303 SetPageLRU(page);
4304 }
4305
4306 if (lruvec) {
4307 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4308 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4309 unlock_page_lruvec_irq(lruvec);
4310 } else if (pgscanned) {
4311 count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4312 }
4313}
4314EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4315