// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Page reclaim: scans the LRU lists and frees, writes back or demotes
 *  pages to relieve memory pressure.  Used by both kswapd and direct
 *  reclaim.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t	*nodemask;

	/*
	 * The memory cgroup that hit its limit and is the primary
	 * target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long	anon_cost;
	unsigned long	file_cost;

	/* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
	 * don't threaten to OOM.  If any cgroup is reclaimed at
	 * reduced force or passed over entirely due to its memory.low
	 * setting (memcg_low_skipped), and nothing is reclaimed as a
	 * result, then go back for one more cycle that reclaims the
	 * protected memory (memcg_low_reclaim) to avert OOM.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
	unsigned int cache_trim_mode:1;

	/* The file pages on the current node are dangerously low */
	unsigned int file_is_tiny:1;

	/* Always discard instead of demoting to lower tier memory */
	unsigned int no_demotion:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
};

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 200.  Higher means more swappy.
 */
int vm_swappiness = 60;

static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite */
	WARN_ON_ONCE(rs && task->reclaim_state);

	/* Check for the nulling of an already-nulled member */
	WARN_ON_ONCE(!rs && !task->reclaim_state);

	task->reclaim_state = rs;
}

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
static inline int shrinker_map_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
}

static inline int shrinker_defer_size(int nr_items)
{
	return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
}

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_rwsem));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg,
				    int map_size, int defer_size,
				    int old_map_size, int old_defer_size)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;
	int size = map_size + defer_size;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->nr_deferred = (atomic_long_t *)(new + 1);
		new->map = (void *)new->nr_deferred + defer_size;

		/* map: set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_map_size);
		memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
		/* nr_deferred: copy old values, clear all new values */
		memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
		memset((void *)new->nr_deferred + old_defer_size, 0,
		       defer_size - old_defer_size);

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int nid, size, ret = 0;
	int map_size, defer_size = 0;

	down_write(&shrinker_rwsem);
	map_size = shrinker_map_size(shrinker_nr_max);
	defer_size = shrinker_defer_size(shrinker_nr_max);
	size = map_size + defer_size;
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
		if (!info) {
			free_shrinker_info(memcg);
			ret = -ENOMEM;
			break;
		}
		info->nr_deferred = (atomic_long_t *)(info + 1);
		info->map = (void *)info->nr_deferred + defer_size;
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	up_write(&shrinker_rwsem);

	return ret;
}

static inline bool need_expand(int nr_max)
{
	return round_up(nr_max, BITS_PER_LONG) >
	       round_up(shrinker_nr_max, BITS_PER_LONG);
}

static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = new_id + 1;
	int map_size, defer_size = 0;
	int old_map_size, old_defer_size = 0;
	struct mem_cgroup *memcg;

	if (!need_expand(new_nr_max))
		goto out;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_rwsem);

	map_size = shrinker_map_size(new_nr_max);
	defer_size = shrinker_defer_size(new_nr_max);
	old_map_size = shrinker_map_size(shrinker_nr_max);
	old_defer_size = shrinker_defer_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
					       old_map_size, old_defer_size);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, info->map);
		rcu_read_unlock();
	}
}

static DEFINE_IDR(shrinker_idr);

static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	down_write(&shrinker_rwsem);
	/* This may call shrinker, so it must use down_read_trylock() */
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	up_write(&shrinker_rwsem);
	return ret;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_rwsem);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = shrinker_info_protected(memcg, nid);
	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = shrinker_info_protected(memcg, nid);
	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}

void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int i, nid;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent from concurrent shrinker_info expand */
	down_read(&shrinker_rwsem);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (i = 0; i < shrinker_nr_max; i++) {
			nr = atomic_long_read(&child_info->nr_deferred[i]);
			atomic_long_add(nr, &parent_info->nr_deferred[i]);
		}
	}
	up_read(&shrinker_rwsem);
}

static bool cgroup_reclaim(struct scan_control *sc)
{
	return sc->target_mem_cgroup;
}

/**
 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
static bool writeback_throttling_sane(struct scan_control *sc)
{
	if (!cgroup_reclaim(sc))
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}

static bool cgroup_reclaim(struct scan_control *sc)
{
	return false;
}

static bool writeback_throttling_sane(struct scan_control *sc)
{
	return true;
}
#endif

static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}


static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

static bool can_demote(int nid, struct scan_control *sc)
{
	if (!numa_demotion_enabled)
		return false;
	if (sc) {
		if (sc->no_demotion)
			return false;
		/* It is pointless to do demotion in memcg reclaim */
		if (cgroup_reclaim(sc))
			return false;
	}
	if (next_demotion_node(nid) == NUMA_NO_NODE)
		return false;

	return true;
}

static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
					  int nid,
					  struct scan_control *sc)
{
	if (memcg == NULL) {
		/*
		 * For non-memcg reclaim, is there
		 * space in any swap device?
		 */
		if (get_nr_swap_pages() > 0)
			return true;
	} else {
		/* Is the memcg below its swap limit? */
		if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
			return true;
	}

	/*
	 * The page can not be swapped.
	 *
	 * Can it be reclaimed from this node via demotion?
	 */
	return can_demote(nid, sc);
}

/*
 * This misses isolated pages which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

	return nr;
}

/**
 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
 */
static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
				     int zone_idx)
{
	unsigned long size = 0;
	int zid;

	for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];

		if (!managed_zone(zone))
			continue;

		if (!mem_cgroup_disabled())
			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
		else
			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
	}
	return size;
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int prealloc_shrinker(struct shrinker *shrinker)
{
	unsigned int size;
	int err;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		err = prealloc_memcg_shrinker(shrinker);
		if (err != -ENOSYS)
			return err;

		shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
	}

	size = sizeof(*shrinker->nr_deferred);
	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	return 0;
}

void free_prealloced_shrinker(struct shrinker *shrinker)
{
	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		down_write(&shrinker_rwsem);
		unregister_memcg_shrinker(shrinker);
		up_write(&shrinker_rwsem);
		return;
	}

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}

void register_shrinker_prepared(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	up_write(&shrinker_rwsem);
}

int register_shrinker(struct shrinker *shrinker)
{
	int err = prealloc_shrinker(shrinker);

	if (err)
		return err;
	register_shrinker_prepared(shrinker);
	return 0;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	if (!(shrinker->flags & SHRINKER_REGISTERED))
		return;

	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	shrinker->flags &= ~SHRINKER_REGISTERED;
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);
	up_write(&shrinker_rwsem);

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);

/**
 * synchronize_shrinkers - Wait for all running shrinkers to complete.
 *
 * This is equivalent to calling unregister_shrinker() and register_shrinker(),
 * but atomically and with less overhead.  Taking and dropping shrinker_rwsem
 * for write guarantees that all shrinker invocations that started before the
 * call have completed, similar to an RCU grace period.
 */
void synchronize_shrinkers(void)
{
	down_write(&shrinker_rwsem);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(synchronize_shrinkers);

#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}

#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int i;

	if (!mem_cgroup_online(memcg))
		return 0;

	if (!down_read_trylock(&shrinker_rwsem))
		return 0;

	info = shrinker_info_protected(memcg, nid);
	if (unlikely(!info))
		goto unlock;

	for_each_set_bit(i, info->map, shrinker_nr_max) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};
		struct shrinker *shrinker;

		shrinker = idr_find(&shrinker_idr, i);
		if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
			if (!shrinker)
				clear_bit(i, info->map);
			continue;
		}

		/* Call non-slab shrinkers even though kmem is disabled */
		if (!memcg_kmem_enabled() &&
		    !(shrinker->flags & SHRINKER_NONSLAB))
			continue;

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY) {
			clear_bit(i, info->map);
			/*
			 * After the shrinker reported that it had no objects to
			 * free, but before we cleared the corresponding bit in
			 * the memcg shrinker map, a new object might have been
			 * added.  To make sure the bit is set in this case, we
			 * invoke the shrinker one more time and re-set the bit
			 * if it reports that it is not empty anymore.  The
			 * memory barrier here pairs with the barrier in
			 * set_shrinker_bit():
			 *
			 * list_lru_add()     shrink_slab_memcg()
			 *   list_add_tail()    clear_bit()
			 *   <MB>               <MB>
			 *   set_bit()          do_shrink_slab()
			 */
			smp_mb__after_atomic();
			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY)
				ret = 0;
			else
				set_shrinker_bit(memcg, nid, i);
		}
		freed += ret;

		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}
unlock:
	up_read(&shrinker_rwsem);
	return freed;
}
#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif /* CONFIG_MEMCG */

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
				 struct mem_cgroup *memcg,
				 int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via "cgroup_disable=memory" boot parameter.  This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink.  This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	if (!down_read_trylock(&shrinker_rwsem))
		goto out;

	list_for_each_entry(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;
		/*
		 * Bail out if someone wants to register a new shrinker to
		 * prevent the registration from being stalled for long periods
		 * by parallel ongoing shrinking.
		 */
		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}

	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return freed;
}

static void drop_slab_node(int nid)
{
	unsigned long freed;
	int shift = 0;

	do {
		struct mem_cgroup *memcg = NULL;

		if (fatal_signal_pending(current))
			return;

		freed = 0;
		memcg = mem_cgroup_iter(NULL, NULL, NULL);
		do {
			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
	} while ((freed >> shift++) > 1);
}

void drop_slab(void)
{
	int nid;

	for_each_online_node(nid)
		drop_slab_node(nid);
}

static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache and optional buffer
	 * heads at page->private.
	 */
	int page_cache_pins = thp_nr_pages(page);
	return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}

static int may_write_to_inode(struct inode *inode)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!inode_write_congested(inode))
		return 1;
	if (inode_to_bdi(inode) == current->backing_dev_info)
		return 1;
	return 0;
}
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015static void handle_write_error(struct address_space *mapping,
1016 struct page *page, int error)
1017{
1018 lock_page(page);
1019 if (page_mapping(page) == mapping)
1020 mapping_set_error(mapping, error);
1021 unlock_page(page);
1022}
1023
1024static bool skip_throttle_noprogress(pg_data_t *pgdat)
1025{
1026 int reclaimable = 0, write_pending = 0;
1027 int i;
1028
1029
1030
1031
1032
1033 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
1034 return true;
1035
1036
1037
1038
1039
1040
1041 for (i = 0; i < MAX_NR_ZONES; i++) {
1042 struct zone *zone = pgdat->node_zones + i;
1043
1044 if (!populated_zone(zone))
1045 continue;
1046
1047 reclaimable += zone_reclaimable_pages(zone);
1048 write_pending += zone_page_state_snapshot(zone,
1049 NR_ZONE_WRITE_PENDING);
1050 }
1051 if (2 * write_pending <= reclaimable)
1052 return true;
1053
1054 return false;
1055}
1056
1057void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
1058{
1059 wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
1060 long timeout, ret;
1061 DEFINE_WAIT(wait);
1062
1063
1064
1065
1066
1067
1068 if (!current_is_kswapd() &&
1069 current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
1070 cond_resched();
1071 return;
1072 }
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084 switch(reason) {
1085 case VMSCAN_THROTTLE_WRITEBACK:
1086 timeout = HZ/10;
1087
1088 if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
1089 WRITE_ONCE(pgdat->nr_reclaim_start,
1090 node_page_state(pgdat, NR_THROTTLED_WRITTEN));
1091 }
1092
1093 break;
1094 case VMSCAN_THROTTLE_CONGESTED:
1095 fallthrough;
1096 case VMSCAN_THROTTLE_NOPROGRESS:
1097 if (skip_throttle_noprogress(pgdat)) {
1098 cond_resched();
1099 return;
1100 }
1101
1102 timeout = 1;
1103
1104 break;
1105 case VMSCAN_THROTTLE_ISOLATED:
1106 timeout = HZ/50;
1107 break;
1108 default:
1109 WARN_ON_ONCE(1);
1110 timeout = HZ;
1111 break;
1112 }
1113
1114 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
1115 ret = schedule_timeout(timeout);
1116 finish_wait(wqh, &wait);
1117
1118 if (reason == VMSCAN_THROTTLE_WRITEBACK)
1119 atomic_dec(&pgdat->nr_writeback_throttled);
1120
1121 trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
1122 jiffies_to_usecs(timeout - ret),
1123 reason);
1124}
1125
1126
1127
1128
1129
1130
1131void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
1132 int nr_throttled)
1133{
1134 unsigned long nr_written;
1135
1136 node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
1137
1138
1139
1140
1141
1142
1143
1144
1145 nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
1146 READ_ONCE(pgdat->nr_reclaim_start);
1147
1148 if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
1149 wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
1150}
1151
1152
1153typedef enum {
1154
1155 PAGE_KEEP,
1156
1157 PAGE_ACTIVATE,
1158
1159 PAGE_SUCCESS,
1160
1161 PAGE_CLEAN,
1162} pageout_t;
1163
1164
1165
1166
1167
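/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */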
1168static pageout_t pageout(struct page *page, struct address_space *mapping)
1169{
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186 if (!is_page_cache_freeable(page))
1187 return PAGE_KEEP;
1188 if (!mapping) {
1189
1190
1191
1192
1193 if (page_has_private(page)) {
1194 if (try_to_free_buffers(page)) {
1195 ClearPageDirty(page);
1196 pr_info("%s: orphaned page\n", __func__);
1197 return PAGE_CLEAN;
1198 }
1199 }
1200 return PAGE_KEEP;
1201 }
1202 if (mapping->a_ops->writepage == NULL)
1203 return PAGE_ACTIVATE;
1204 if (!may_write_to_inode(mapping->host))
1205 return PAGE_KEEP;
1206
1207 if (clear_page_dirty_for_io(page)) {
1208 int res;
1209 struct writeback_control wbc = {
1210 .sync_mode = WB_SYNC_NONE,
1211 .nr_to_write = SWAP_CLUSTER_MAX,
1212 .range_start = 0,
1213 .range_end = LLONG_MAX,
1214 .for_reclaim = 1,
1215 };
1216
1217 SetPageReclaim(page);
1218 res = mapping->a_ops->writepage(page, &wbc);
1219 if (res < 0)
1220 handle_write_error(mapping, page, res);
1221 if (res == AOP_WRITEPAGE_ACTIVATE) {
1222 ClearPageReclaim(page);
1223 return PAGE_ACTIVATE;
1224 }
1225
1226 if (!PageWriteback(page)) {
1227
1228 ClearPageReclaim(page);
1229 }
1230 trace_mm_vmscan_writepage(page);
1231 inc_node_page_state(page, NR_VMSCAN_WRITE);
1232 return PAGE_SUCCESS;
1233 }
1234
1235 return PAGE_CLEAN;
1236}
1237
1238
1239
1240
1241
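/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */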
1242static int __remove_mapping(struct address_space *mapping, struct page *page,
1243 bool reclaimed, struct mem_cgroup *target_memcg)
1244{
1245 int refcount;
1246 void *shadow = NULL;
1247
1248 BUG_ON(!PageLocked(page));
1249 BUG_ON(mapping != page_mapping(page));
1250
1251 if (!PageSwapCache(page))
1252 spin_lock(&mapping->host->i_lock);
1253 xa_lock_irq(&mapping->i_pages);
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279 refcount = 1 + compound_nr(page);
1280 if (!page_ref_freeze(page, refcount))
1281 goto cannot_free;
1282
1283 if (unlikely(PageDirty(page))) {
1284 page_ref_unfreeze(page, refcount);
1285 goto cannot_free;
1286 }
1287
1288 if (PageSwapCache(page)) {
1289 swp_entry_t swap = { .val = page_private(page) };
1290 mem_cgroup_swapout(page, swap);
1291 if (reclaimed && !mapping_exiting(mapping))
1292 shadow = workingset_eviction(page, target_memcg);
1293 __delete_from_swap_cache(page, swap, shadow);
1294 xa_unlock_irq(&mapping->i_pages);
1295 put_swap_page(page, swap);
1296 } else {
1297 void (*freepage)(struct page *);
1298
1299 freepage = mapping->a_ops->freepage;
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316 if (reclaimed && page_is_file_lru(page) &&
1317 !mapping_exiting(mapping) && !dax_mapping(mapping))
1318 shadow = workingset_eviction(page, target_memcg);
1319 __delete_from_page_cache(page, shadow);
1320 xa_unlock_irq(&mapping->i_pages);
1321 if (mapping_shrinkable(mapping))
1322 inode_add_lru(mapping->host);
1323 spin_unlock(&mapping->host->i_lock);
1324
1325 if (freepage != NULL)
1326 freepage(page);
1327 }
1328
1329 return 1;
1330
1331cannot_free:
1332 xa_unlock_irq(&mapping->i_pages);
1333 if (!PageSwapCache(page))
1334 spin_unlock(&mapping->host->i_lock);
1335 return 0;
1336}
1337
1338
1339
1340
1341
1342
1343
1344int remove_mapping(struct address_space *mapping, struct page *page)
1345{
1346 if (__remove_mapping(mapping, page, false, NULL)) {
1347
1348
1349
1350
1351
1352 page_ref_unfreeze(page, 1);
1353 return 1;
1354 }
1355 return 0;
1356}
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367void putback_lru_page(struct page *page)
1368{
1369 lru_cache_add(page);
1370 put_page(page);
1371}
1372
1373enum page_references {
1374 PAGEREF_RECLAIM,
1375 PAGEREF_RECLAIM_CLEAN,
1376 PAGEREF_KEEP,
1377 PAGEREF_ACTIVATE,
1378};
1379
1380static enum page_references page_check_references(struct page *page,
1381 struct scan_control *sc)
1382{
1383 int referenced_ptes, referenced_page;
1384 unsigned long vm_flags;
1385
1386 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
1387 &vm_flags);
1388 referenced_page = TestClearPageReferenced(page);
1389
1390
1391
1392
1393
1394 if (vm_flags & VM_LOCKED)
1395 return PAGEREF_RECLAIM;
1396
1397 if (referenced_ptes) {
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412 SetPageReferenced(page);
1413
1414 if (referenced_page || referenced_ptes > 1)
1415 return PAGEREF_ACTIVATE;
1416
1417
1418
1419
1420 if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
1421 return PAGEREF_ACTIVATE;
1422
1423 return PAGEREF_KEEP;
1424 }
1425
1426
1427 if (referenced_page && !PageSwapBacked(page))
1428 return PAGEREF_RECLAIM_CLEAN;
1429
1430 return PAGEREF_RECLAIM;
1431}
1432
1433
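/* Check if a page is dirty or under writeback */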
1434static void page_check_dirty_writeback(struct page *page,
1435 bool *dirty, bool *writeback)
1436{
1437 struct address_space *mapping;
1438
1439
1440
1441
1442
1443 if (!page_is_file_lru(page) ||
1444 (PageAnon(page) && !PageSwapBacked(page))) {
1445 *dirty = false;
1446 *writeback = false;
1447 return;
1448 }
1449
1450
1451 *dirty = PageDirty(page);
1452 *writeback = PageWriteback(page);
1453
1454
1455 if (!page_has_private(page))
1456 return;
1457
1458 mapping = page_mapping(page);
1459 if (mapping && mapping->a_ops->is_dirty_writeback)
1460 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1461}
1462
1463static struct page *alloc_demote_page(struct page *page, unsigned long node)
1464{
1465 struct migration_target_control mtc = {
1466
1467
1468
1469
1470
1471 .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
1472 __GFP_THISNODE | __GFP_NOWARN |
1473 __GFP_NOMEMALLOC | GFP_NOWAIT,
1474 .nid = node
1475 };
1476
1477 return alloc_migration_target(page, (unsigned long)&mtc);
1478}
1479
1480
1481
1482
1483
1484
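/*
 * Take pages on @demote_pages and attempt to demote them to another node.
 * Pages which are not demoted are left on @demote_pages.
 */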
1485static unsigned int demote_page_list(struct list_head *demote_pages,
1486 struct pglist_data *pgdat)
1487{
1488 int target_nid = next_demotion_node(pgdat->node_id);
1489 unsigned int nr_succeeded;
1490
1491 if (list_empty(demote_pages))
1492 return 0;
1493
1494 if (target_nid == NUMA_NO_NODE)
1495 return 0;
1496
1497
1498 migrate_pages(demote_pages, alloc_demote_page, NULL,
1499 target_nid, MIGRATE_ASYNC, MR_DEMOTION,
1500 &nr_succeeded);
1501
1502 if (current_is_kswapd())
1503 __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
1504 else
1505 __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
1506
1507 return nr_succeeded;
1508}
1509
1510
1511
1512
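/*
 * shrink_page_list() returns the number of reclaimed pages
 */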
1513static unsigned int shrink_page_list(struct list_head *page_list,
1514 struct pglist_data *pgdat,
1515 struct scan_control *sc,
1516 struct reclaim_stat *stat,
1517 bool ignore_references)
1518{
1519 LIST_HEAD(ret_pages);
1520 LIST_HEAD(free_pages);
1521 LIST_HEAD(demote_pages);
1522 unsigned int nr_reclaimed = 0;
1523 unsigned int pgactivate = 0;
1524 bool do_demote_pass;
1525
1526 memset(stat, 0, sizeof(*stat));
1527 cond_resched();
1528 do_demote_pass = can_demote(pgdat->node_id, sc);
1529
1530retry:
1531 while (!list_empty(page_list)) {
1532 struct address_space *mapping;
1533 struct page *page;
1534 enum page_references references = PAGEREF_RECLAIM;
1535 bool dirty, writeback, may_enter_fs;
1536 unsigned int nr_pages;
1537
1538 cond_resched();
1539
1540 page = lru_to_page(page_list);
1541 list_del(&page->lru);
1542
1543 if (!trylock_page(page))
1544 goto keep;
1545
1546 VM_BUG_ON_PAGE(PageActive(page), page);
1547
1548 nr_pages = compound_nr(page);
1549
1550
1551 sc->nr_scanned += nr_pages;
1552
1553 if (unlikely(!page_evictable(page)))
1554 goto activate_locked;
1555
1556 if (!sc->may_unmap && page_mapped(page))
1557 goto keep_locked;
1558
1559 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1560 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1561
1562
1563
1564
1565
1566
1567 page_check_dirty_writeback(page, &dirty, &writeback);
1568 if (dirty || writeback)
1569 stat->nr_dirty++;
1570
1571 if (dirty && !writeback)
1572 stat->nr_unqueued_dirty++;
1573
1574
1575
1576
1577
1578
1579
1580 mapping = page_mapping(page);
1581 if (((dirty || writeback) && mapping &&
1582 inode_write_congested(mapping->host)) ||
1583 (writeback && PageReclaim(page)))
1584 stat->nr_congested++;
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628 if (PageWriteback(page)) {
1629
1630 if (current_is_kswapd() &&
1631 PageReclaim(page) &&
1632 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1633 stat->nr_immediate++;
1634 goto activate_locked;
1635
1636
1637 } else if (writeback_throttling_sane(sc) ||
1638 !PageReclaim(page) || !may_enter_fs) {
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650 SetPageReclaim(page);
1651 stat->nr_writeback++;
1652 goto activate_locked;
1653
1654
1655 } else {
1656 unlock_page(page);
1657 wait_on_page_writeback(page);
1658
1659 list_add_tail(&page->lru, page_list);
1660 continue;
1661 }
1662 }
1663
1664 if (!ignore_references)
1665 references = page_check_references(page, sc);
1666
1667 switch (references) {
1668 case PAGEREF_ACTIVATE:
1669 goto activate_locked;
1670 case PAGEREF_KEEP:
1671 stat->nr_ref_keep += nr_pages;
1672 goto keep_locked;
1673 case PAGEREF_RECLAIM:
1674 case PAGEREF_RECLAIM_CLEAN:
1675 ;
1676 }
1677
1678
1679
1680
1681
1682 if (do_demote_pass &&
1683 (thp_migration_supported() || !PageTransHuge(page))) {
1684 list_add(&page->lru, &demote_pages);
1685 unlock_page(page);
1686 continue;
1687 }
1688
1689
1690
1691
1692
1693
1694 if (PageAnon(page) && PageSwapBacked(page)) {
1695 if (!PageSwapCache(page)) {
1696 if (!(sc->gfp_mask & __GFP_IO))
1697 goto keep_locked;
1698 if (page_maybe_dma_pinned(page))
1699 goto keep_locked;
1700 if (PageTransHuge(page)) {
1701
1702 if (!can_split_huge_page(page, NULL))
1703 goto activate_locked;
1704
1705
1706
1707
1708
1709 if (!compound_mapcount(page) &&
1710 split_huge_page_to_list(page,
1711 page_list))
1712 goto activate_locked;
1713 }
1714 if (!add_to_swap(page)) {
1715 if (!PageTransHuge(page))
1716 goto activate_locked_split;
1717
1718 if (split_huge_page_to_list(page,
1719 page_list))
1720 goto activate_locked;
1721#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1722 count_vm_event(THP_SWPOUT_FALLBACK);
1723#endif
1724 if (!add_to_swap(page))
1725 goto activate_locked_split;
1726 }
1727
1728 may_enter_fs = true;
1729
1730
1731 mapping = page_mapping(page);
1732 }
1733 } else if (unlikely(PageTransHuge(page))) {
1734
1735 if (split_huge_page_to_list(page, page_list))
1736 goto keep_locked;
1737 }
1738
1739
1740
1741
1742
1743
1744
1745
1746 if ((nr_pages > 1) && !PageTransHuge(page)) {
1747 sc->nr_scanned -= (nr_pages - 1);
1748 nr_pages = 1;
1749 }
1750
1751
1752
1753
1754
1755 if (page_mapped(page)) {
1756 enum ttu_flags flags = TTU_BATCH_FLUSH;
1757 bool was_swapbacked = PageSwapBacked(page);
1758
1759 if (unlikely(PageTransHuge(page)))
1760 flags |= TTU_SPLIT_HUGE_PMD;
1761
1762 try_to_unmap(page, flags);
1763 if (page_mapped(page)) {
1764 stat->nr_unmap_fail += nr_pages;
1765 if (!was_swapbacked && PageSwapBacked(page))
1766 stat->nr_lazyfree_fail += nr_pages;
1767 goto activate_locked;
1768 }
1769 }
1770
1771 if (PageDirty(page)) {
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782 if (page_is_file_lru(page) &&
1783 (!current_is_kswapd() || !PageReclaim(page) ||
1784 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1785
1786
1787
1788
1789
1790
1791 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1792 SetPageReclaim(page);
1793
1794 goto activate_locked;
1795 }
1796
1797 if (references == PAGEREF_RECLAIM_CLEAN)
1798 goto keep_locked;
1799 if (!may_enter_fs)
1800 goto keep_locked;
1801 if (!sc->may_writepage)
1802 goto keep_locked;
1803
1804
1805
1806
1807
1808
1809 try_to_unmap_flush_dirty();
1810 switch (pageout(page, mapping)) {
1811 case PAGE_KEEP:
1812 goto keep_locked;
1813 case PAGE_ACTIVATE:
1814 goto activate_locked;
1815 case PAGE_SUCCESS:
1816 stat->nr_pageout += thp_nr_pages(page);
1817
1818 if (PageWriteback(page))
1819 goto keep;
1820 if (PageDirty(page))
1821 goto keep;
1822
1823
1824
1825
1826
1827 if (!trylock_page(page))
1828 goto keep;
1829 if (PageDirty(page) || PageWriteback(page))
1830 goto keep_locked;
1831 mapping = page_mapping(page);
1832 fallthrough;
1833 case PAGE_CLEAN:
1834 ;
1835 }
1836 }
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859 if (page_has_private(page)) {
1860 if (!try_to_release_page(page, sc->gfp_mask))
1861 goto activate_locked;
1862 if (!mapping && page_count(page) == 1) {
1863 unlock_page(page);
1864 if (put_page_testzero(page))
1865 goto free_it;
1866 else {
1867
1868
1869
1870
1871
1872
1873
1874 nr_reclaimed++;
1875 continue;
1876 }
1877 }
1878 }
1879
1880 if (PageAnon(page) && !PageSwapBacked(page)) {
1881
1882 if (!page_ref_freeze(page, 1))
1883 goto keep_locked;
1884
1885
1886
1887
1888
1889
1890
1891
1892 count_vm_event(PGLAZYFREED);
1893 count_memcg_page_event(page, PGLAZYFREED);
1894 } else if (!mapping || !__remove_mapping(mapping, page, true,
1895 sc->target_mem_cgroup))
1896 goto keep_locked;
1897
1898 unlock_page(page);
1899free_it:
1900
1901
1902
1903
1904 nr_reclaimed += nr_pages;
1905
1906
1907
1908
1909
1910 if (unlikely(PageTransHuge(page)))
1911 destroy_compound_page(page);
1912 else
1913 list_add(&page->lru, &free_pages);
1914 continue;
1915
1916activate_locked_split:
1917
1918
1919
1920
1921 if (nr_pages > 1) {
1922 sc->nr_scanned -= (nr_pages - 1);
1923 nr_pages = 1;
1924 }
1925activate_locked:
1926
1927 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1928 PageMlocked(page)))
1929 try_to_free_swap(page);
1930 VM_BUG_ON_PAGE(PageActive(page), page);
1931 if (!PageMlocked(page)) {
1932 int type = page_is_file_lru(page);
1933 SetPageActive(page);
1934 stat->nr_activate[type] += nr_pages;
1935 count_memcg_page_event(page, PGACTIVATE);
1936 }
1937keep_locked:
1938 unlock_page(page);
1939keep:
1940 list_add(&page->lru, &ret_pages);
1941 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1942 }
1943
1944
1945
1946 nr_reclaimed += demote_page_list(&demote_pages, pgdat);
1947
1948 if (!list_empty(&demote_pages)) {
1949
1950 list_splice_init(&demote_pages, page_list);
1951 do_demote_pass = false;
1952 goto retry;
1953 }
1954
1955 pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1956
1957 mem_cgroup_uncharge_list(&free_pages);
1958 try_to_unmap_flush();
1959 free_unref_page_list(&free_pages);
1960
1961 list_splice(&ret_pages, page_list);
1962 count_vm_events(PGACTIVATE, pgactivate);
1963
1964 return nr_reclaimed;
1965}
1966
1967unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1968 struct list_head *page_list)
1969{
1970 struct scan_control sc = {
1971 .gfp_mask = GFP_KERNEL,
1972 .may_unmap = 1,
1973 };
1974 struct reclaim_stat stat;
1975 unsigned int nr_reclaimed;
1976 struct page *page, *next;
1977 LIST_HEAD(clean_pages);
1978 unsigned int noreclaim_flag;
1979
1980 list_for_each_entry_safe(page, next, page_list, lru) {
1981 if (!PageHuge(page) && page_is_file_lru(page) &&
1982 !PageDirty(page) && !__PageMovable(page) &&
1983 !PageUnevictable(page)) {
1984 ClearPageActive(page);
1985 list_move(&page->lru, &clean_pages);
1986 }
1987 }
1988
1989
1990
1991
1992
1993
1994
1995 noreclaim_flag = memalloc_noreclaim_save();
1996 nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1997 &stat, true);
1998 memalloc_noreclaim_restore(noreclaim_flag);
1999
2000 list_splice(&clean_pages, page_list);
2001 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2002 -(long)nr_reclaimed);
2003
2004
2005
2006
2007
2008
2009 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
2010 stat.nr_lazyfree_fail);
2011 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
2012 -(long)stat.nr_lazyfree_fail);
2013 return nr_reclaimed;
2014}
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
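/*
 * __isolate_lru_page_prepare() - Prepare page to be isolated
 * @page:	Page to isolate
 * @mode:	Isolation mode
 *
 * Determine whether the page can be isolated under @mode: unevictable
 * pages are skipped unless ISOLATE_UNEVICTABLE, dirty or writeback pages
 * are skipped for ISOLATE_ASYNC_MIGRATE unless they can be migrated
 * without blocking, and mapped pages are skipped for ISOLATE_UNMAPPED.
 *
 * Returns true if the page may be isolated, false otherwise.
 */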
2026bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
2027{
2028
2029 if (!PageLRU(page))
2030 return false;
2031
2032
2033 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
2034 return false;
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044 if (mode & ISOLATE_ASYNC_MIGRATE) {
2045
2046 if (PageWriteback(page))
2047 return false;
2048
2049 if (PageDirty(page)) {
2050 struct address_space *mapping;
2051 bool migrate_dirty;
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062 if (!trylock_page(page))
2063 return false;
2064
2065 mapping = page_mapping(page);
2066 migrate_dirty = !mapping || mapping->a_ops->migratepage;
2067 unlock_page(page);
2068 if (!migrate_dirty)
2069 return false;
2070 }
2071 }
2072
2073 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
2074 return false;
2075
2076 return true;
2077}
2078
2079
2080
2081
2082
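/*
 * Update LRU sizes after isolating pages. The LRU size updates must
 * be complete before mem_cgroup_update_lru_size due to a sanity check.
 */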
2083static __always_inline void update_lru_sizes(struct lruvec *lruvec,
2084 enum lru_list lru, unsigned long *nr_zone_taken)
2085{
2086 int zid;
2087
2088 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2089 if (!nr_zone_taken[zid])
2090 continue;
2091
2092 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
2093 }
2094
2095}
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
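/*
 * isolate_lru_pages() - Isolate pages from an (in)active list
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @lru:	LRU list id for isolating
 *
 * lruvec->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * Must be called with lruvec->lru_lock held.
 *
 * Returns the number of pages moved to @dst.
 */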
2118static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
2119 struct lruvec *lruvec, struct list_head *dst,
2120 unsigned long *nr_scanned, struct scan_control *sc,
2121 enum lru_list lru)
2122{
2123 struct list_head *src = &lruvec->lists[lru];
2124 unsigned long nr_taken = 0;
2125 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
2126 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
2127 unsigned long skipped = 0;
2128 unsigned long scan, total_scan, nr_pages;
2129 LIST_HEAD(pages_skipped);
2130 isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
2131
2132 total_scan = 0;
2133 scan = 0;
2134 while (scan < nr_to_scan && !list_empty(src)) {
2135 struct page *page;
2136
2137 page = lru_to_page(src);
2138 prefetchw_prev_lru_page(page, src, flags);
2139
2140 nr_pages = compound_nr(page);
2141 total_scan += nr_pages;
2142
2143 if (page_zonenum(page) > sc->reclaim_idx) {
2144 list_move(&page->lru, &pages_skipped);
2145 nr_skipped[page_zonenum(page)] += nr_pages;
2146 continue;
2147 }
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159 scan += nr_pages;
2160 if (!__isolate_lru_page_prepare(page, mode)) {
2161
2162 list_move(&page->lru, src);
2163 continue;
2164 }
2165
2166
2167
2168
2169
2170 if (unlikely(!get_page_unless_zero(page))) {
2171 list_move(&page->lru, src);
2172 continue;
2173 }
2174
2175 if (!TestClearPageLRU(page)) {
2176
2177 put_page(page);
2178 list_move(&page->lru, src);
2179 continue;
2180 }
2181
2182 nr_taken += nr_pages;
2183 nr_zone_taken[page_zonenum(page)] += nr_pages;
2184 list_move(&page->lru, dst);
2185 }
2186
2187
2188
2189
2190
2191
2192
2193
2194 if (!list_empty(&pages_skipped)) {
2195 int zid;
2196
2197 list_splice(&pages_skipped, src);
2198 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
2199 if (!nr_skipped[zid])
2200 continue;
2201
2202 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
2203 skipped += nr_skipped[zid];
2204 }
2205 }
2206 *nr_scanned = total_scan;
2207 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
2208 total_scan, skipped, nr_taken, mode, lru);
2209 update_lru_sizes(lruvec, lru, nr_zone_taken);
2210 return nr_taken;
2211}
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
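/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 * The caller must already hold a reference to the page.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 */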
2239int isolate_lru_page(struct page *page)
2240{
2241 struct folio *folio = page_folio(page);
2242 int ret = -EBUSY;
2243
2244 VM_BUG_ON_PAGE(!page_count(page), page);
2245 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
2246
2247 if (TestClearPageLRU(page)) {
2248 struct lruvec *lruvec;
2249
2250 get_page(page);
2251 lruvec = folio_lruvec_lock_irq(folio);
2252 del_page_from_lru_list(page, lruvec);
2253 unlock_page_lruvec_irq(lruvec);
2254 ret = 0;
2255 }
2256
2257 return ret;
2258}
2259
2260
2261
2262
2263
2264
2265
2266
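/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */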
2267static int too_many_isolated(struct pglist_data *pgdat, int file,
2268 struct scan_control *sc)
2269{
2270 unsigned long inactive, isolated;
2271 bool too_many;
2272
2273 if (current_is_kswapd())
2274 return 0;
2275
2276 if (!writeback_throttling_sane(sc))
2277 return 0;
2278
2279 if (file) {
2280 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
2281 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
2282 } else {
2283 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
2284 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
2285 }
2286
2287
2288
2289
2290
2291
2292 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
2293 inactive >>= 3;
2294
2295 too_many = isolated > inactive;
2296
2297
2298 if (!too_many)
2299 wake_throttle_isolated(pgdat);
2300
2301 return too_many;
2302}
2303
2304
2305
2306
2307
2308
2309
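/*
 * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
 * On return, @list is reused as a list of pages to be freed by the caller.
 *
 * Returns the number of pages moved to the given lruvec.
 */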
2310static unsigned int move_pages_to_lru(struct lruvec *lruvec,
2311 struct list_head *list)
2312{
2313 int nr_pages, nr_moved = 0;
2314 LIST_HEAD(pages_to_free);
2315 struct page *page;
2316
2317 while (!list_empty(list)) {
2318 page = lru_to_page(list);
2319 VM_BUG_ON_PAGE(PageLRU(page), page);
2320 list_del(&page->lru);
2321 if (unlikely(!page_evictable(page))) {
2322 spin_unlock_irq(&lruvec->lru_lock);
2323 putback_lru_page(page);
2324 spin_lock_irq(&lruvec->lru_lock);
2325 continue;
2326 }
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339 SetPageLRU(page);
2340
2341 if (unlikely(put_page_testzero(page))) {
2342 __clear_page_lru_flags(page);
2343
2344 if (unlikely(PageCompound(page))) {
2345 spin_unlock_irq(&lruvec->lru_lock);
2346 destroy_compound_page(page);
2347 spin_lock_irq(&lruvec->lru_lock);
2348 } else
2349 list_add(&page->lru, &pages_to_free);
2350
2351 continue;
2352 }
2353
2354
2355
2356
2357
2358 VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page);
2359 add_page_to_lru_list(page, lruvec);
2360 nr_pages = thp_nr_pages(page);
2361 nr_moved += nr_pages;
2362 if (PageActive(page))
2363 workingset_age_nonresident(lruvec, nr_pages);
2364 }
2365
2366
2367
2368
2369 list_splice(&pages_to_free, list);
2370
2371 return nr_moved;
2372}
2373
2374
2375
2376
2377
2378
2379
2380static int current_may_throttle(void)
2381{
2382 return !(current->flags & PF_LOCAL_THROTTLE) ||
2383 current->backing_dev_info == NULL ||
2384 bdi_write_congested(current->backing_dev_info);
2385}
2386
2387
2388
2389
2390
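/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the number
 * of reclaimed pages.
 */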
2391static unsigned long
2392shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
2393 struct scan_control *sc, enum lru_list lru)
2394{
2395 LIST_HEAD(page_list);
2396 unsigned long nr_scanned;
2397 unsigned int nr_reclaimed = 0;
2398 unsigned long nr_taken;
2399 struct reclaim_stat stat;
2400 bool file = is_file_lru(lru);
2401 enum vm_event_item item;
2402 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2403 bool stalled = false;
2404
2405 while (unlikely(too_many_isolated(pgdat, file, sc))) {
2406 if (stalled)
2407 return 0;
2408
2409
2410 stalled = true;
2411 reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
2412
2413
2414 if (fatal_signal_pending(current))
2415 return SWAP_CLUSTER_MAX;
2416 }
2417
2418 lru_add_drain();
2419
2420 spin_lock_irq(&lruvec->lru_lock);
2421
2422 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
2423 &nr_scanned, sc, lru);
2424
2425 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2426 item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
2427 if (!cgroup_reclaim(sc))
2428 __count_vm_events(item, nr_scanned);
2429 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
2430 __count_vm_events(PGSCAN_ANON + file, nr_scanned);
2431
2432 spin_unlock_irq(&lruvec->lru_lock);
2433
2434 if (nr_taken == 0)
2435 return 0;
2436
2437 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
2438
2439 spin_lock_irq(&lruvec->lru_lock);
2440 move_pages_to_lru(lruvec, &page_list);
2441
2442 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2443 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
2444 if (!cgroup_reclaim(sc))
2445 __count_vm_events(item, nr_reclaimed);
2446 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
2447 __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
2448 spin_unlock_irq(&lruvec->lru_lock);
2449
2450 lru_note_cost(lruvec, file, stat.nr_pageout);
2451 mem_cgroup_uncharge_list(&page_list);
2452 free_unref_page_list(&page_list);
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465 if (stat.nr_unqueued_dirty == nr_taken)
2466 wakeup_flusher_threads(WB_REASON_VMSCAN);
2467
2468 sc->nr.dirty += stat.nr_dirty;
2469 sc->nr.congested += stat.nr_congested;
2470 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2471 sc->nr.writeback += stat.nr_writeback;
2472 sc->nr.immediate += stat.nr_immediate;
2473 sc->nr.taken += nr_taken;
2474 if (file)
2475 sc->nr.file_taken += nr_taken;
2476
2477 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2478 nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2479 return nr_reclaimed;
2480}
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
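/*
 * shrink_active_list() moves pages from the active LRU to the inactive LRU.
 *
 * We move them the other way if the page is referenced by one or more
 * processes.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()), so
 * we should drop lru_lock around each page.  It's impossible to balance
 * this, so instead we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 */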
2499static void shrink_active_list(unsigned long nr_to_scan,
2500 struct lruvec *lruvec,
2501 struct scan_control *sc,
2502 enum lru_list lru)
2503{
2504 unsigned long nr_taken;
2505 unsigned long nr_scanned;
2506 unsigned long vm_flags;
2507 LIST_HEAD(l_hold);
2508 LIST_HEAD(l_active);
2509 LIST_HEAD(l_inactive);
2510 struct page *page;
2511 unsigned nr_deactivate, nr_activate;
2512 unsigned nr_rotated = 0;
2513 int file = is_file_lru(lru);
2514 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2515
2516 lru_add_drain();
2517
2518 spin_lock_irq(&lruvec->lru_lock);
2519
2520 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2521 &nr_scanned, sc, lru);
2522
2523 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2524
2525 if (!cgroup_reclaim(sc))
2526 __count_vm_events(PGREFILL, nr_scanned);
2527 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2528
2529 spin_unlock_irq(&lruvec->lru_lock);
2530
2531 while (!list_empty(&l_hold)) {
2532 cond_resched();
2533 page = lru_to_page(&l_hold);
2534 list_del(&page->lru);
2535
2536 if (unlikely(!page_evictable(page))) {
2537 putback_lru_page(page);
2538 continue;
2539 }
2540
2541 if (unlikely(buffer_heads_over_limit)) {
2542 if (page_has_private(page) && trylock_page(page)) {
2543 if (page_has_private(page))
2544 try_to_release_page(page, 0);
2545 unlock_page(page);
2546 }
2547 }
2548
2549 if (page_referenced(page, 0, sc->target_mem_cgroup,
2550 &vm_flags)) {
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560 if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
2561 nr_rotated += thp_nr_pages(page);
2562 list_add(&page->lru, &l_active);
2563 continue;
2564 }
2565 }
2566
2567 ClearPageActive(page);
2568 SetPageWorkingset(page);
2569 list_add(&page->lru, &l_inactive);
2570 }
2571
2572
2573
2574
2575 spin_lock_irq(&lruvec->lru_lock);
2576
2577 nr_activate = move_pages_to_lru(lruvec, &l_active);
2578 nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2579
2580 list_splice(&l_inactive, &l_active);
2581
2582 __count_vm_events(PGDEACTIVATE, nr_deactivate);
2583 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2584
2585 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2586 spin_unlock_irq(&lruvec->lru_lock);
2587
2588 mem_cgroup_uncharge_list(&l_active);
2589 free_unref_page_list(&l_active);
2590 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2591 nr_deactivate, nr_rotated, sc->priority, file);
2592}
2593
2594unsigned long reclaim_pages(struct list_head *page_list)
2595{
2596 int nid = NUMA_NO_NODE;
2597 unsigned int nr_reclaimed = 0;
2598 LIST_HEAD(node_page_list);
2599 struct reclaim_stat dummy_stat;
2600 struct page *page;
2601 unsigned int noreclaim_flag;
2602 struct scan_control sc = {
2603 .gfp_mask = GFP_KERNEL,
2604 .may_writepage = 1,
2605 .may_unmap = 1,
2606 .may_swap = 1,
2607 .no_demotion = 1,
2608 };
2609
2610 noreclaim_flag = memalloc_noreclaim_save();
2611
2612 while (!list_empty(page_list)) {
2613 page = lru_to_page(page_list);
2614 if (nid == NUMA_NO_NODE) {
2615 nid = page_to_nid(page);
2616 INIT_LIST_HEAD(&node_page_list);
2617 }
2618
2619 if (nid == page_to_nid(page)) {
2620 ClearPageActive(page);
2621 list_move(&page->lru, &node_page_list);
2622 continue;
2623 }
2624
2625 nr_reclaimed += shrink_page_list(&node_page_list,
2626 NODE_DATA(nid),
2627 &sc, &dummy_stat, false);
2628 while (!list_empty(&node_page_list)) {
2629 page = lru_to_page(&node_page_list);
2630 list_del(&page->lru);
2631 putback_lru_page(page);
2632 }
2633
2634 nid = NUMA_NO_NODE;
2635 }
2636
2637 if (!list_empty(&node_page_list)) {
2638 nr_reclaimed += shrink_page_list(&node_page_list,
2639 NODE_DATA(nid),
2640 &sc, &dummy_stat, false);
2641 while (!list_empty(&node_page_list)) {
2642 page = lru_to_page(&node_page_list);
2643 list_del(&page->lru);
2644 putback_lru_page(page);
2645 }
2646 }
2647
2648 memalloc_noreclaim_restore(noreclaim_flag);
2649
2650 return nr_reclaimed;
2651}
2652
2653static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2654 struct lruvec *lruvec, struct scan_control *sc)
2655{
2656 if (is_active_lru(lru)) {
2657 if (sc->may_deactivate & (1 << is_file_lru(lru)))
2658 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2659 else
2660 sc->skipped_deactivate = 1;
2661 return 0;
2662 }
2663
2664 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2665}
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
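/*
 * The inactive list should be small enough that the VM never has to
 * do too much work, but large enough that each inactive page has a
 * chance to be referenced again before it is reclaimed.
 *
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
 * on this LRU: for a list of up to 1GB the ratio is 1:1, and above that
 * it grows roughly with the square root of the list size
 * (int_sqrt(10 * gigabytes)).
 */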
2695static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2696{
2697 enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2698 unsigned long inactive, active;
2699 unsigned long inactive_ratio;
2700 unsigned long gb;
2701
2702 inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2703 active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2704
2705 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2706 if (gb)
2707 inactive_ratio = int_sqrt(10 * gb);
2708 else
2709 inactive_ratio = 1;
2710
2711 return inactive * inactive_ratio < active;
2712}
2713
2714enum scan_balance {
2715 SCAN_EQUAL,
2716 SCAN_FRACT,
2717 SCAN_ANON,
2718 SCAN_FILE,
2719};
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
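/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */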
2730static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2731 unsigned long *nr)
2732{
2733 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2734 struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2735 unsigned long anon_cost, file_cost, total_cost;
2736 int swappiness = mem_cgroup_swappiness(memcg);
2737 u64 fraction[ANON_AND_FILE];
2738 u64 denominator = 0;
2739 enum scan_balance scan_balance;
2740 unsigned long ap, fp;
2741 enum lru_list lru;
2742
2743
2744 if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
2745 scan_balance = SCAN_FILE;
2746 goto out;
2747 }
2748
2749
2750
2751
2752
2753
2754
2755
2756 if (cgroup_reclaim(sc) && !swappiness) {
2757 scan_balance = SCAN_FILE;
2758 goto out;
2759 }
2760
2761
2762
2763
2764
2765
2766 if (!sc->priority && swappiness) {
2767 scan_balance = SCAN_EQUAL;
2768 goto out;
2769 }
2770
2771
2772
2773
2774 if (sc->file_is_tiny) {
2775 scan_balance = SCAN_ANON;
2776 goto out;
2777 }
2778
2779
2780
2781
2782
2783 if (sc->cache_trim_mode) {
2784 scan_balance = SCAN_FILE;
2785 goto out;
2786 }
2787
2788 scan_balance = SCAN_FRACT;
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804 total_cost = sc->anon_cost + sc->file_cost;
2805 anon_cost = total_cost + sc->anon_cost;
2806 file_cost = total_cost + sc->file_cost;
2807 total_cost = anon_cost + file_cost;
2808
2809 ap = swappiness * (total_cost + 1);
2810 ap /= anon_cost + 1;
2811
2812 fp = (200 - swappiness) * (total_cost + 1);
2813 fp /= file_cost + 1;
2814
2815 fraction[0] = ap;
2816 fraction[1] = fp;
2817 denominator = ap + fp;
2818out:
2819 for_each_evictable_lru(lru) {
2820 int file = is_file_lru(lru);
2821 unsigned long lruvec_size;
2822 unsigned long low, min;
2823 unsigned long scan;
2824
2825 lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2826 mem_cgroup_protection(sc->target_mem_cgroup, memcg,
2827 &min, &low);
2828
2829 if (min || low) {
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859 unsigned long cgroup_size = mem_cgroup_size(memcg);
2860 unsigned long protection;
2861
2862
2863 if (!sc->memcg_low_reclaim && low > min) {
2864 protection = low;
2865 sc->memcg_low_skipped = 1;
2866 } else {
2867 protection = min;
2868 }
2869
2870
2871 cgroup_size = max(cgroup_size, protection);
2872
2873 scan = lruvec_size - lruvec_size * protection /
2874 (cgroup_size + 1);
2875
2876
2877
2878
2879
2880
2881 scan = max(scan, SWAP_CLUSTER_MAX);
2882 } else {
2883 scan = lruvec_size;
2884 }
2885
2886 scan >>= sc->priority;
2887
2888
2889
2890
2891
2892 if (!scan && !mem_cgroup_online(memcg))
2893 scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2894
2895 switch (scan_balance) {
2896 case SCAN_EQUAL:
2897
2898 break;
2899 case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 * Make sure we don't miss the last page on
			 * the offlined memcgs because of a round-off
			 * error.
			 */
2907 scan = mem_cgroup_online(memcg) ?
2908 div64_u64(scan * fraction[file], denominator) :
2909 DIV64_U64_ROUND_UP(scan * fraction[file],
2910 denominator);
2911 break;
2912 case SCAN_FILE:
2913 case SCAN_ANON:
2914
2915 if ((scan_balance == SCAN_FILE) != file)
2916 scan = 0;
2917 break;
2918 default:
2919
2920 BUG();
2921 }
2922
2923 nr[lru] = scan;
2924 }
2925}
2926
/*
 * Anonymous LRU management is a waste if there is ultimately no way to
 * reclaim the memory: no swap space and no possibility of demotion.
 */
2931static bool can_age_anon_pages(struct pglist_data *pgdat,
2932 struct scan_control *sc)
2933{
2934
2935 if (total_swap_pages > 0)
2936 return true;
2937
2938
2939 return can_demote(pgdat->node_id, sc);
2940}
2941
2942static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2943{
2944 unsigned long nr[NR_LRU_LISTS];
2945 unsigned long targets[NR_LRU_LISTS];
2946 unsigned long nr_to_scan;
2947 enum lru_list lru;
2948 unsigned long nr_reclaimed = 0;
2949 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2950 struct blk_plug plug;
2951 bool scan_adjusted;
2952
2953 get_scan_count(lruvec, sc, nr);
2954
2955
2956 memcpy(targets, nr, sizeof(nr));
2957
	/*
	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
	 * event that can occur when there is little memory pressure e.g.
	 * multiple streaming readers/writers. Hence, we do not abort scanning
	 * when the requested number of pages are reclaimed when scanning at
	 * DEF_PRIORITY on the assumption that the fact we are direct
	 * reclaiming implies that kswapd is not keeping up and it is best to
	 * do a batch of work at once. For memcg reclaim one check is made to
	 * abort proportional reclaim if either the file or anon lru has
	 * already dropped to zero at the first pass.
	 */
2969 scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
2970 sc->priority == DEF_PRIORITY);
2971
2972 blk_start_plug(&plug);
2973 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2974 nr[LRU_INACTIVE_FILE]) {
2975 unsigned long nr_anon, nr_file, percentage;
2976 unsigned long nr_scanned;
2977
2978 for_each_evictable_lru(lru) {
2979 if (nr[lru]) {
2980 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2981 nr[lru] -= nr_to_scan;
2982
2983 nr_reclaimed += shrink_list(lru, nr_to_scan,
2984 lruvec, sc);
2985 }
2986 }
2987
2988 cond_resched();
2989
2990 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2991 continue;
2992
		/*
		 * For kswapd and memcg, reclaim at least the number of pages
		 * requested. Once that target has been met, stop the plain
		 * round-robin scanning: work out how far the smaller LRU type
		 * has been scanned and trim the larger one's remaining target
		 * proportionally below.
		 */
3000 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
3001 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
3002
		/*
		 * It's just vindictive to attack the larger once the smaller
		 * has gone to zero.  And once either type is exhausted,
		 * proportional adjustment is meaningless anyway.
		 */
3009 if (!nr_file || !nr_anon)
3010 break;
3011
3012 if (nr_file > nr_anon) {
3013 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
3014 targets[LRU_ACTIVE_ANON] + 1;
3015 lru = LRU_BASE;
3016 percentage = nr_anon * 100 / scan_target;
3017 } else {
3018 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
3019 targets[LRU_ACTIVE_FILE] + 1;
3020 lru = LRU_FILE;
3021 percentage = nr_file * 100 / scan_target;
3022 }
3023
		/* Stop scanning the smaller of the two LRU types */
3025 nr[lru] = 0;
3026 nr[lru + LRU_ACTIVE] = 0;
3027
		/*
		 * Recalculate the other LRU type's scan count based on its
		 * original target and the amount already scanned, so that
		 * both types end up scanned in roughly equal proportion.
		 */
3032 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
3033 nr_scanned = targets[lru] - nr[lru];
3034 nr[lru] = targets[lru] * (100 - percentage) / 100;
3035 nr[lru] -= min(nr[lru], nr_scanned);
3036
3037 lru += LRU_ACTIVE;
3038 nr_scanned = targets[lru] - nr[lru];
3039 nr[lru] = targets[lru] * (100 - percentage) / 100;
3040 nr[lru] -= min(nr[lru], nr_scanned);
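		/*
		 * Worked example (illustrative numbers, not from the source):
		 * with original targets of 200 anon and 1000 file pages, if
		 * the reclaim goal is met with 100 anon and 600 file pages
		 * still unscanned, anon is the smaller list and its scanning
		 * stops ~51% done.  The file target is rescaled to
		 * 1000 * 51% = 510; since 400 file pages were already scanned
		 * only ~110 remain, leaving both lists scanned in roughly the
		 * same proportion.
		 */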
3041
3042 scan_adjusted = true;
3043 }
3044 blk_finish_plug(&plug);
3045 sc->nr_reclaimed += nr_reclaimed;
3046
	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
3051 if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
3052 inactive_is_low(lruvec, LRU_INACTIVE_ANON))
3053 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3054 sc, LRU_ACTIVE_ANON);
3055}
3056
/* Use reclaim/compaction for costly allocs or under significant memory pressure */
3058static bool in_reclaim_compaction(struct scan_control *sc)
3059{
3060 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
3061 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
3062 sc->priority < DEF_PRIORITY - 2))
3063 return true;
3064
3065 return false;
3066}
3067
/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_pages() that it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
3075static inline bool should_continue_reclaim(struct pglist_data *pgdat,
3076 unsigned long nr_reclaimed,
3077 struct scan_control *sc)
3078{
3079 unsigned long pages_for_compaction;
3080 unsigned long inactive_lru_pages;
3081 int z;
3082
3083
3084 if (!in_reclaim_compaction(sc))
3085 return false;
3086
	/*
	 * Stop if we failed to reclaim any pages from the last round of
	 * scanning.  Continuing would risk looping without making progress
	 * towards the compaction gap; the caller can fall back to
	 * compaction or fail the allocation instead.
	 */
3097 if (!nr_reclaimed)
3098 return false;
3099
3100
3101 for (z = 0; z <= sc->reclaim_idx; z++) {
3102 struct zone *zone = &pgdat->node_zones[z];
3103 if (!managed_zone(zone))
3104 continue;
3105
3106 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
3107 case COMPACT_SUCCESS:
3108 case COMPACT_CONTINUE:
3109 return false;
3110 default:
3111
3112 ;
3113 }
3114 }
3115
	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are large enough, continue reclaiming.
	 */
3120 pages_for_compaction = compact_gap(sc->order);
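	/*
	 * Added note: compact_gap() asks for roughly twice the allocation
	 * size in free/inactive pages (assumption based on its current
	 * definition), so compaction has both migration sources and free
	 * targets to work with before reclaim stops.
	 */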
3121 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
3122 if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
3123 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
3124
3125 return inactive_lru_pages > pages_for_compaction;
3126}
3127
3128static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
3129{
3130 struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
3131 struct mem_cgroup *memcg;
3132
3133 memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
3134 do {
3135 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3136 unsigned long reclaimed;
3137 unsigned long scanned;
3138
		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
3145 cond_resched();
3146
3147 mem_cgroup_calculate_protection(target_memcg, memcg);
3148
3149 if (mem_cgroup_below_min(memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
3154 continue;
3155 } else if (mem_cgroup_below_low(memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply of
			 * reclaimable memory from other cgroups.
			 */
3162 if (!sc->memcg_low_reclaim) {
3163 sc->memcg_low_skipped = 1;
3164 continue;
3165 }
3166 memcg_memory_event(memcg, MEMCG_LOW);
3167 }
3168
3169 reclaimed = sc->nr_reclaimed;
3170 scanned = sc->nr_scanned;
3171
3172 shrink_lruvec(lruvec, sc);
3173
3174 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
3175 sc->priority);
3176
3177
3178 vmpressure(sc->gfp_mask, memcg, false,
3179 sc->nr_scanned - scanned,
3180 sc->nr_reclaimed - reclaimed);
3181
3182 } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
3183}
3184
3185static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
3186{
3187 struct reclaim_state *reclaim_state = current->reclaim_state;
3188 unsigned long nr_reclaimed, nr_scanned;
3189 struct lruvec *target_lruvec;
3190 bool reclaimable = false;
3191 unsigned long file;
3192
3193 target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
3194
3195again:
	/*
	 * Flush the memory cgroup stats, so that we read accurate per-memcg
	 * lruvec stats for heuristics.
	 */
3200 mem_cgroup_flush_stats();
3201
3202 memset(&sc->nr, 0, sizeof(sc->nr));
3203
3204 nr_reclaimed = sc->nr_reclaimed;
3205 nr_scanned = sc->nr_scanned;
3206
	/*
	 * Determine the scan balance between anon and file LRUs.
	 */
3210 spin_lock_irq(&target_lruvec->lru_lock);
3211 sc->anon_cost = target_lruvec->anon_cost;
3212 sc->file_cost = target_lruvec->file_cost;
3213 spin_unlock_irq(&target_lruvec->lru_lock);
3214
	/*
	 * Target desirable inactive:active list ratios for the anon
	 * and file LRU lists.
	 */
3219 if (!sc->force_deactivate) {
3220 unsigned long refaults;
3221
3222 refaults = lruvec_page_state(target_lruvec,
3223 WORKINGSET_ACTIVATE_ANON);
3224 if (refaults != target_lruvec->refaults[0] ||
3225 inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
3226 sc->may_deactivate |= DEACTIVATE_ANON;
3227 else
3228 sc->may_deactivate &= ~DEACTIVATE_ANON;
3229
		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
3235 refaults = lruvec_page_state(target_lruvec,
3236 WORKINGSET_ACTIVATE_FILE);
3237 if (refaults != target_lruvec->refaults[1] ||
3238 inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
3239 sc->may_deactivate |= DEACTIVATE_FILE;
3240 else
3241 sc->may_deactivate &= ~DEACTIVATE_FILE;
3242 } else
3243 sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
3244
	/*
	 * If we have plenty of inactive file pages that aren't
	 * thrashing, try to reclaim those first before touching
	 * anonymous pages.
	 */
3250 file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
3251 if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
3252 sc->cache_trim_mode = 1;
3253 else
3254 sc->cache_trim_mode = 0;
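	/*
	 * Illustrative numbers (added commentary): at DEF_PRIORITY (12),
	 * "file >> sc->priority" above is non-zero once the node has at
	 * least 4096 inactive file pages - 16MB with 4KB pages - so cache
	 * trimming only stays on while a meaningful amount of non-thrashing
	 * page cache remains.
	 */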
3255
	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes the sole source of page cache
	 * pressure, with no effect on the anon LRU.
	 */
3265 if (!cgroup_reclaim(sc)) {
3266 unsigned long total_high_wmark = 0;
3267 unsigned long free, anon;
3268 int z;
3269
3270 free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
3271 file = node_page_state(pgdat, NR_ACTIVE_FILE) +
3272 node_page_state(pgdat, NR_INACTIVE_FILE);
3273
3274 for (z = 0; z < MAX_NR_ZONES; z++) {
3275 struct zone *zone = &pgdat->node_zones[z];
3276 if (!managed_zone(zone))
3277 continue;
3278
3279 total_high_wmark += high_wmark_pages(zone);
3280 }
3281
		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
3287 anon = node_page_state(pgdat, NR_INACTIVE_ANON);
3288
3289 sc->file_is_tiny =
3290 file + free <= total_high_wmark &&
3291 !(sc->may_deactivate & DEACTIVATE_ANON) &&
3292 anon >> sc->priority;
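		/*
		 * Illustrative example (added commentary): on a node whose
		 * zones' high watermarks sum to ~256MB, file_is_tiny only
		 * becomes true once free memory plus remaining page cache
		 * drops below that ~256MB while enough anonymous memory
		 * (anon >> priority) is still available to scan.
		 */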
3293 }
3294
3295 shrink_node_memcgs(pgdat, sc);
3296
3297 if (reclaim_state) {
3298 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
3299 reclaim_state->reclaimed_slab = 0;
3300 }
3301
3302
3303 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
3304 sc->nr_scanned - nr_scanned,
3305 sc->nr_reclaimed - nr_reclaimed);
3306
3307 if (sc->nr_reclaimed - nr_reclaimed)
3308 reclaimable = true;
3309
3310 if (current_is_kswapd()) {
		/*
		 * If reclaim is isolating dirty pages under writeback,
		 * it implies that the long-lived page allocation rate
		 * is exceeding the page laundering rate. Either the
		 * global limits are not being effective at throttling
		 * processes due to the page distribution throughout
		 * zones or there is heavy usage of a slow backing
		 * device. The only option is to throttle from reclaim
		 * context which is not ideal as there is no guarantee
		 * the dirtying process is throttled in the same way
		 * balance_dirty_pages() manages.
		 *
		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
		 * count the number of pages under writeback and marked
		 * for immediate reclaim, and stall if any are
		 * encountered in the nr.immediate check below.
		 */
3328 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
3329 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
3330
3331
3332 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
3333 set_bit(PGDAT_DIRTY, &pgdat->flags);
3334
		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr.immediate), it
		 * implies that pages are cycling through the LRU
		 * faster than they are written, so forcibly stall
		 * until some writeback completes.
		 */
3342 if (sc->nr.immediate)
3343 reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
3344 }
3345
	/*
	 * Tag a node/memcg as congested if all the dirty pages were marked
	 * for writeback and immediate reclaim (counted in nr.congested).
	 *
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling in reclaim_throttle().
	 */
3353 if ((current_is_kswapd() ||
3354 (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
3355 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
3356 set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
3357
	/*
	 * Stall direct reclaim for IO completions if the target lruvec is
	 * marked congested.  Allow kswapd to continue until it starts
	 * encountering unqueued dirty pages or cycling through the LRU
	 * too quickly.
	 */
3364 if (!current_is_kswapd() && current_may_throttle() &&
3365 !sc->hibernation_mode &&
3366 test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
3367 reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
3368
3369 if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
3370 sc))
3371 goto again;
3372
	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
3379 if (reclaimable)
3380 pgdat->kswapd_failures = 0;
3381}
3382
/*
 * Returns true if compaction should go ahead for a costly-order request, or
 * the allocation would already succeed without compaction. Return false if we
 * should reclaim first.
 */
3388static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
3389{
3390 unsigned long watermark;
3391 enum compact_result suitable;
3392
3393 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
3394 if (suitable == COMPACT_SUCCESS)
3395
3396 return true;
3397 if (suitable == COMPACT_SKIPPED)
3398
3399 return false;
3400
	/*
	 * Compaction could run, but would benefit from more free memory.
	 * Keep a buffer of compact_gap() pages above the high watermark so
	 * that compaction has both migration sources and free targets to
	 * work with, and so that concurrent allocators do not immediately
	 * consume what reclaim frees.  If free pages are already above that
	 * level, skip further reclaim and let compaction proceed.
	 */
3410 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
3411
3412 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
3413}
3414
3415static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
3416{
	/*
	 * If reclaim efficiency is better than 1/8th (12.5%) of pages
	 * scanned, wake any tasks throttled on NOPROGRESS.
	 */
3421 if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
3422 wait_queue_head_t *wqh;
3423
3424 wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
3425 if (waitqueue_active(wqh))
3426 wake_up(wqh);
3427
3428 return;
3429 }
3430
	/*
	 * Do not throttle kswapd or cgroup reclaim on NOPROGRESS; they are
	 * throttled on VMSCAN_THROTTLE_WRITEBACK instead if too many pages
	 * are under writeback and marked for immediate reclaim at the tail
	 * of the LRU.
	 */
3437 if (current_is_kswapd() || cgroup_reclaim(sc))
3438 return;
3439
3440
3441 if (sc->priority == 1 && !sc->nr_reclaimed)
3442 reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
3443}
3444
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
3453static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
3454{
3455 struct zoneref *z;
3456 struct zone *zone;
3457 unsigned long nr_soft_reclaimed;
3458 unsigned long nr_soft_scanned;
3459 gfp_t orig_mask;
3460 pg_data_t *last_pgdat = NULL;
3461 pg_data_t *first_pgdat = NULL;
3462
	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads.
	 */
3468 orig_mask = sc->gfp_mask;
3469 if (buffer_heads_over_limit) {
3470 sc->gfp_mask |= __GFP_HIGHMEM;
3471 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
3472 }
3473
3474 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3475 sc->reclaim_idx, sc->nodemask) {
3476
3477
3478
3479
3480 if (!cgroup_reclaim(sc)) {
3481 if (!cpuset_zone_allowed(zone,
3482 GFP_KERNEL | __GFP_HARDWALL))
3483 continue;
3484
			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
3494 if (IS_ENABLED(CONFIG_COMPACTION) &&
3495 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
3496 compaction_ready(zone, sc)) {
3497 sc->compaction_ready = true;
3498 continue;
3499 }
3500
3501
3502
3503
3504
3505
3506
3507 if (zone->zone_pgdat == last_pgdat)
3508 continue;
3509
			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global reclaim only.
			 */
3516 nr_soft_scanned = 0;
3517 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
3518 sc->order, sc->gfp_mask,
3519 &nr_soft_scanned);
3520 sc->nr_reclaimed += nr_soft_reclaimed;
3521 sc->nr_scanned += nr_soft_scanned;
3522
3523 }
3524
3525 if (!first_pgdat)
3526 first_pgdat = zone->zone_pgdat;
3527
3528
3529 if (zone->zone_pgdat == last_pgdat)
3530 continue;
3531 last_pgdat = zone->zone_pgdat;
3532 shrink_node(zone->zone_pgdat, sc);
3533 }
3534
3535 if (first_pgdat)
3536 consider_reclaim_throttle(first_pgdat, sc);
3537
3538
3539
3540
3541
3542 sc->gfp_mask = orig_mask;
3543}
3544
3545static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
3546{
3547 struct lruvec *target_lruvec;
3548 unsigned long refaults;
3549
3550 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
3551 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
3552 target_lruvec->refaults[0] = refaults;
3553 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
3554 target_lruvec->refaults[1] = refaults;
3555}
3556
/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
3573static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3574 struct scan_control *sc)
3575{
3576 int initial_priority = sc->priority;
3577 pg_data_t *last_pgdat;
3578 struct zoneref *z;
3579 struct zone *zone;
3580retry:
3581 delayacct_freepages_start();
3582
3583 if (!cgroup_reclaim(sc))
3584 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3585
3586 do {
3587 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3588 sc->priority);
3589 sc->nr_scanned = 0;
3590 shrink_zones(zonelist, sc);
3591
3592 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3593 break;
3594
3595 if (sc->compaction_ready)
3596 break;
3597
		/*
		 * If we're getting trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
3602 if (sc->priority < DEF_PRIORITY - 2)
3603 sc->may_writepage = 1;
3604 } while (--sc->priority >= 0);
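	/*
	 * Added note: each pass of the loop above effectively halves the
	 * scan divisor - get_scan_count() targets lruvec_size >> priority
	 * pages - so starting at DEF_PRIORITY (12) reclaim first looks at
	 * 1/4096th of each list and only approaches a full scan as the
	 * priority drops toward zero.
	 */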
3605
3606 last_pgdat = NULL;
3607 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3608 sc->nodemask) {
3609 if (zone->zone_pgdat == last_pgdat)
3610 continue;
3611 last_pgdat = zone->zone_pgdat;
3612
3613 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3614
3615 if (cgroup_reclaim(sc)) {
3616 struct lruvec *lruvec;
3617
3618 lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
3619 zone->zone_pgdat);
3620 clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3621 }
3622 }
3623
3624 delayacct_freepages_end();
3625
3626 if (sc->nr_reclaimed)
3627 return sc->nr_reclaimed;
3628
3629
3630 if (sc->compaction_ready)
3631 return 1;
3632
	/*
	 * We make inactive:active ratio decisions based on the node's
	 * composition of memory, but a restrictive reclaim_idx or a
	 * memory.low cgroup setting can exempt large amounts of
	 * memory from reclaim. Neither of which are very common, so
	 * instead of doing costly eligibility calculations of the
	 * entire cgroup subtree up front, we assume the estimates are
	 * good, and retry with forcible deactivation if that fails.
	 */
3642 if (sc->skipped_deactivate) {
3643 sc->priority = initial_priority;
3644 sc->force_deactivate = 1;
3645 sc->skipped_deactivate = 0;
3646 goto retry;
3647 }
3648
3649
3650 if (sc->memcg_low_skipped) {
3651 sc->priority = initial_priority;
3652 sc->force_deactivate = 0;
3653 sc->memcg_low_reclaim = 1;
3654 sc->memcg_low_skipped = 0;
3655 goto retry;
3656 }
3657
3658 return 0;
3659}
3660
3661static bool allow_direct_reclaim(pg_data_t *pgdat)
3662{
3663 struct zone *zone;
3664 unsigned long pfmemalloc_reserve = 0;
3665 unsigned long free_pages = 0;
3666 int i;
3667 bool wmark_ok;
3668
3669 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3670 return true;
3671
3672 for (i = 0; i <= ZONE_NORMAL; i++) {
3673 zone = &pgdat->node_zones[i];
3674 if (!managed_zone(zone))
3675 continue;
3676
3677 if (!zone_reclaimable_pages(zone))
3678 continue;
3679
3680 pfmemalloc_reserve += min_wmark_pages(zone);
3681 free_pages += zone_page_state(zone, NR_FREE_PAGES);
3682 }
3683
3684
3685 if (!pfmemalloc_reserve)
3686 return true;
3687
3688 wmark_ok = free_pages > pfmemalloc_reserve / 2;
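	/*
	 * Illustrative numbers (added commentary): if the min watermarks of
	 * the lower zones sum to a 40MB pfmemalloc reserve, direct reclaim
	 * may proceed while those zones still hold more than 20MB of free
	 * pages; below that, allocators are throttled and kswapd is woken
	 * to refill the reserves.
	 */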
3689
	/* kswapd must be awake if processes are being throttled */
3691 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3692 if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
3693 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
3694
3695 wake_up_interruptible(&pgdat->kswapd_wait);
3696 }
3697
3698 return wmark_ok;
3699}
3700
/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
3710static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3711 nodemask_t *nodemask)
3712{
3713 struct zoneref *z;
3714 struct zone *zone;
3715 pg_data_t *pgdat = NULL;
3716
	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make
	 * forward progress. kjournald for example may enter direct reclaim
	 * while committing a transaction where throttling it could force
	 * other processes to block on log_wait_commit().
	 */
3724 if (current->flags & PF_KTHREAD)
3725 goto out;
3726
3727
3728
3729
3730
3731 if (fatal_signal_pending(current))
3732 goto out;
3733
	/*
	 * Check if the pfmemalloc reserves are ok by finding the first node
	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
	 * GFP_KERNEL will be required for allocating network buffers when
	 * swapping over the network so ZONE_HIGHMEM is unusable.
	 *
	 * Throttling is based on the first usable node and throttled
	 * processes wait on a queue until kswapd makes progress and wakes
	 * them. A process throttled on a node where reclaim cannot happen
	 * simply sleeps for the timeout and retries.
	 */
3748 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3749 gfp_zone(gfp_mask), nodemask) {
3750 if (zone_idx(zone) > ZONE_NORMAL)
3751 continue;
3752
3753
3754 pgdat = zone->zone_pgdat;
3755 if (allow_direct_reclaim(pgdat))
3756 goto out;
3757 break;
3758 }
3759
3760
3761 if (!pgdat)
3762 goto out;
3763
3764
3765 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3766
	/*
	 * If the caller cannot enter the filesystem, it's possible that it
	 * is due to the caller holding an FS lock or performing a journal
	 * transaction in the reclaim context. This is what the FS code does
	 * for ext4, but there is no guarantee that the FS will not deadlock
	 * if reclaim cannot make progress, so such callers only sleep with
	 * a timeout; everyone else sleeps killably until direct reclaim is
	 * allowed again.
	 */
3775 if (!(gfp_mask & __GFP_FS))
3776 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3777 allow_direct_reclaim(pgdat), HZ);
3778 else
3779
3780 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3781 allow_direct_reclaim(pgdat));
3782
3783 if (fatal_signal_pending(current))
3784 return true;
3785
3786out:
3787 return false;
3788}
3789
3790unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3791 gfp_t gfp_mask, nodemask_t *nodemask)
3792{
3793 unsigned long nr_reclaimed;
3794 struct scan_control sc = {
3795 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3796 .gfp_mask = current_gfp_context(gfp_mask),
3797 .reclaim_idx = gfp_zone(gfp_mask),
3798 .order = order,
3799 .nodemask = nodemask,
3800 .priority = DEF_PRIORITY,
3801 .may_writepage = !laptop_mode,
3802 .may_unmap = 1,
3803 .may_swap = 1,
3804 };
3805
3806
3807
3808
3809
3810 BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3811 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3812 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3813
	/*
	 * Do not enter reclaim if fatal signal was delivered while throttled.
	 * 1 is returned so that the page allocator does not OOM kill at this
	 * point.
	 */
3819 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3820 return 1;
3821
3822 set_task_reclaim_state(current, &sc.reclaim_state);
3823 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
3824
3825 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3826
3827 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3828 set_task_reclaim_state(current, NULL);
3829
3830 return nr_reclaimed;
3831}
3832
3833#ifdef CONFIG_MEMCG
3834
3835
3836unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3837 gfp_t gfp_mask, bool noswap,
3838 pg_data_t *pgdat,
3839 unsigned long *nr_scanned)
3840{
3841 struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3842 struct scan_control sc = {
3843 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3844 .target_mem_cgroup = memcg,
3845 .may_writepage = !laptop_mode,
3846 .may_unmap = 1,
3847 .reclaim_idx = MAX_NR_ZONES - 1,
3848 .may_swap = !noswap,
3849 };
3850
3851 WARN_ON_ONCE(!current->reclaim_state);
3852
3853 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3854 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3855
3856 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3857 sc.gfp_mask);
3858
3859
3860
3861
3862
3863
3864
3865
3866 shrink_lruvec(lruvec, &sc);
3867
3868 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3869
3870 *nr_scanned = sc.nr_scanned;
3871
3872 return sc.nr_reclaimed;
3873}
3874
3875unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3876 unsigned long nr_pages,
3877 gfp_t gfp_mask,
3878 bool may_swap)
3879{
3880 unsigned long nr_reclaimed;
3881 unsigned int noreclaim_flag;
3882 struct scan_control sc = {
3883 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3884 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3885 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3886 .reclaim_idx = MAX_NR_ZONES - 1,
3887 .target_mem_cgroup = memcg,
3888 .priority = DEF_PRIORITY,
3889 .may_writepage = !laptop_mode,
3890 .may_unmap = 1,
3891 .may_swap = may_swap,
3892 };
3893
3894
3895
3896
3897
3898 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3899
3900 set_task_reclaim_state(current, &sc.reclaim_state);
3901 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
3902 noreclaim_flag = memalloc_noreclaim_save();
3903
3904 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3905
3906 memalloc_noreclaim_restore(noreclaim_flag);
3907 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3908 set_task_reclaim_state(current, NULL);
3909
3910 return nr_reclaimed;
3911}
3912#endif
3913
3914static void age_active_anon(struct pglist_data *pgdat,
3915 struct scan_control *sc)
3916{
3917 struct mem_cgroup *memcg;
3918 struct lruvec *lruvec;
3919
3920 if (!can_age_anon_pages(pgdat, sc))
3921 return;
3922
3923 lruvec = mem_cgroup_lruvec(NULL, pgdat);
3924 if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
3925 return;
3926
3927 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3928 do {
3929 lruvec = mem_cgroup_lruvec(memcg, pgdat);
3930 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3931 sc, LRU_ACTIVE_ANON);
3932 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3933 } while (memcg);
3934}
3935
3936static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
3937{
3938 int i;
3939 struct zone *zone;
3940
	/*
	 * Check for watermark boosts top-down as the higher zones
	 * are more likely to be boosted. Both watermarks and boosts
	 * should not be checked at the same time as reclaim would
	 * start prematurely when there is no boosting and a lower
	 * zone is balanced.
	 */
3948 for (i = highest_zoneidx; i >= 0; i--) {
3949 zone = pgdat->node_zones + i;
3950 if (!managed_zone(zone))
3951 continue;
3952
3953 if (zone->watermark_boost)
3954 return true;
3955 }
3956
3957 return false;
3958}
3959
3960
3961
3962
3963
3964static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
3965{
3966 int i;
3967 unsigned long mark = -1;
3968 struct zone *zone;
3969
3970
3971
3972
3973
3974 for (i = 0; i <= highest_zoneidx; i++) {
3975 zone = pgdat->node_zones + i;
3976
3977 if (!managed_zone(zone))
3978 continue;
3979
3980 mark = high_wmark_pages(zone);
3981 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
3982 return true;
3983 }
3984
3985
3986
3987
3988
3989
3990 if (mark == -1)
3991 return true;
3992
3993 return false;
3994}
3995
3996
3997static void clear_pgdat_congested(pg_data_t *pgdat)
3998{
3999 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
4000
4001 clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
4002 clear_bit(PGDAT_DIRTY, &pgdat->flags);
4003 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
4004}
4005
/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
4012static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
4013 int highest_zoneidx)
4014{
	/*
	 * The throttled processes are normally woken up in balance_pgdat() as
	 * soon as allow_direct_reclaim() is true. But there is a potential
	 * race between when kswapd checks the watermarks and a process gets
	 * throttled. There is also a potential race if processes get
	 * throttled, kswapd wakes, a large process exits thereby balancing
	 * the zones, which causes kswapd to exit balance_pgdat() before
	 * reaching the wait in throttle_direct_reclaim(), leaving the
	 * processes throttled for ever. Waking them here is better than
	 * leaving them asleep without any wakeup guarantee.
	 */
4028 if (waitqueue_active(&pgdat->pfmemalloc_wait))
4029 wake_up_all(&pgdat->pfmemalloc_wait);
4030
4031
4032 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
4033 return true;
4034
4035 if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
4036 clear_pgdat_congested(pgdat);
4037 return true;
4038 }
4039
4040 return false;
4041}
4042
/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
4051static bool kswapd_shrink_node(pg_data_t *pgdat,
4052 struct scan_control *sc)
4053{
4054 struct zone *zone;
4055 int z;
4056
	/* Reclaim a number of pages proportional to the number of zones */
4058 sc->nr_to_reclaim = 0;
4059 for (z = 0; z <= sc->reclaim_idx; z++) {
4060 zone = pgdat->node_zones + z;
4061 if (!managed_zone(zone))
4062 continue;
4063
4064 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
4065 }
4066
4067
4068
4069
4070
4071 shrink_node(pgdat, sc);
4072
	/*
	 * Fragmentation may mean that the system cannot be rebalanced for
	 * high-order allocations. If twice the allocation size has been
	 * reclaimed then recheck watermarks only at order-0 to prevent
	 * excessive reclaim. Assume that a process requesting a high-order
	 * allocation can direct reclaim/compact itself.
	 */
4080 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
4081 sc->order = 0;
4082
4083 return sc->nr_scanned >= sc->nr_to_reclaim;
4084}
4085
4086
4087static inline void
4088update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
4089{
4090 int i;
4091 struct zone *zone;
4092
4093 for (i = 0; i <= highest_zoneidx; i++) {
4094 zone = pgdat->node_zones + i;
4095
4096 if (!managed_zone(zone))
4097 continue;
4098
4099 if (active)
4100 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
4101 else
4102 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
4103 }
4104}
4105
4106static inline void
4107set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
4108{
4109 update_reclaim_active(pgdat, highest_zoneidx, true);
4110}
4111
4112static inline void
4113clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
4114{
4115 update_reclaim_active(pgdat, highest_zoneidx, false);
4116}
4117
/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * is eligible for reclaim irrespective of which zone it came from, as the
 * aim is to balance the node as a whole.
 */
4131static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
4132{
4133 int i;
4134 unsigned long nr_soft_reclaimed;
4135 unsigned long nr_soft_scanned;
4136 unsigned long pflags;
4137 unsigned long nr_boost_reclaim;
4138 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
4139 bool boosted;
4140 struct zone *zone;
4141 struct scan_control sc = {
4142 .gfp_mask = GFP_KERNEL,
4143 .order = order,
4144 .may_unmap = 1,
4145 };
4146
4147 set_task_reclaim_state(current, &sc.reclaim_state);
4148 psi_memstall_enter(&pflags);
4149 __fs_reclaim_acquire(_THIS_IP_);
4150
4151 count_vm_event(PAGEOUTRUN);
4152
4153
4154
4155
4156
4157
4158 nr_boost_reclaim = 0;
4159 for (i = 0; i <= highest_zoneidx; i++) {
4160 zone = pgdat->node_zones + i;
4161 if (!managed_zone(zone))
4162 continue;
4163
4164 nr_boost_reclaim += zone->watermark_boost;
4165 zone_boosts[i] = zone->watermark_boost;
4166 }
4167 boosted = nr_boost_reclaim;
4168
4169restart:
4170 set_reclaim_active(pgdat, highest_zoneidx);
4171 sc.priority = DEF_PRIORITY;
4172 do {
4173 unsigned long nr_reclaimed = sc.nr_reclaimed;
4174 bool raise_priority = true;
4175 bool balanced;
4176 bool ret;
4177
4178 sc.reclaim_idx = highest_zoneidx;
4179
		/*
		 * If the number of buffer_heads exceeds the maximum allowed
		 * then consider reclaiming from all zones. This has a dual
		 * purpose -- on 64-bit systems it is expected that
		 * buffer_heads are stripped during active rotation. On 32-bit
		 * systems, highmem pages can pin lowmem memory and shrinking
		 * buffers can relieve lowmem pressure. Reclaiming may be
		 * needed to relieve that pressure, so pick the highest
		 * managed zone as the reclaim index below.
		 */
4190 if (buffer_heads_over_limit) {
4191 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
4192 zone = pgdat->node_zones + i;
4193 if (!managed_zone(zone))
4194 continue;
4195
4196 sc.reclaim_idx = i;
4197 break;
4198 }
4199 }
4200
4201
4202
4203
4204
4205
4206
4207
4208 balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
4209 if (!balanced && nr_boost_reclaim) {
4210 nr_boost_reclaim = 0;
4211 goto restart;
4212 }
4213
		/*
		 * If boosting is not active then only reclaim if there are no
		 * eligible zones. Note that sc.reclaim_idx is not used as
		 * buffer_heads_over_limit may have adjusted it.
		 */
4219 if (!nr_boost_reclaim && balanced)
4220 goto out;
4221
4222
4223 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
4224 raise_priority = false;
4225
		/*
		 * Do not writeback or swap pages for boosted reclaim. The
		 * intent is to relieve pressure not issue sub-optimal IO
		 * from reclaim context. If no pages are reclaimed, the
		 * reclaim will be aborted.
		 */
4232 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
4233 sc.may_swap = !nr_boost_reclaim;
4234
		/*
		 * Do some background aging of the anon list, to give
		 * pages a chance to be referenced before reclaiming. All
		 * pages are rotated regardless of classzone as this is
		 * about consistent aging.
		 */
4241 age_active_anon(pgdat, &sc);
4242
4243
4244
4245
4246
4247 if (sc.priority < DEF_PRIORITY - 2)
4248 sc.may_writepage = 1;
4249
4250
4251 sc.nr_scanned = 0;
4252 nr_soft_scanned = 0;
4253 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
4254 sc.gfp_mask, &nr_soft_scanned);
4255 sc.nr_reclaimed += nr_soft_reclaimed;
4256
4257
4258
4259
4260
4261
4262 if (kswapd_shrink_node(pgdat, &sc))
4263 raise_priority = false;
4264
4265
4266
4267
4268
4269
4270 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
4271 allow_direct_reclaim(pgdat))
4272 wake_up_all(&pgdat->pfmemalloc_wait);
4273
4274
4275 __fs_reclaim_release(_THIS_IP_);
4276 ret = try_to_freeze();
4277 __fs_reclaim_acquire(_THIS_IP_);
4278 if (ret || kthread_should_stop())
4279 break;
4280
4281
4282
4283
4284
4285 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
4286 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
4287
		/*
		 * If reclaim made no progress for a boost, stop reclaim as
		 * IO cannot be queued and it could be an infinite loop in
		 * extreme circumstances.
		 */
4293 if (nr_boost_reclaim && !nr_reclaimed)
4294 break;
4295
4296 if (raise_priority || !nr_reclaimed)
4297 sc.priority--;
4298 } while (sc.priority >= 1);
4299
4300 if (!sc.nr_reclaimed)
4301 pgdat->kswapd_failures++;
4302
4303out:
4304 clear_reclaim_active(pgdat, highest_zoneidx);
4305
4306
4307 if (boosted) {
4308 unsigned long flags;
4309
4310 for (i = 0; i <= highest_zoneidx; i++) {
4311 if (!zone_boosts[i])
4312 continue;
4313
4314
4315 zone = pgdat->node_zones + i;
4316 spin_lock_irqsave(&zone->lock, flags);
4317 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
4318 spin_unlock_irqrestore(&zone->lock, flags);
4319 }
4320
4321
4322
4323
4324
4325 wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
4326 }
4327
4328 snapshot_refaults(NULL, pgdat);
4329 __fs_reclaim_release(_THIS_IP_);
4330 psi_memstall_leave(&pflags);
4331 set_task_reclaim_state(current, NULL);
4332
4333
4334
4335
4336
4337
4338
4339 return sc.order;
4340}
4341
/*
 * pgdat->kswapd_highest_zoneidx is the highest zone index to be reclaimed by
 * kswapd, as passed in by the waker. If the value is the MAX_NR_ZONES
 * sentinel, either kswapd is running for the first time or it could not
 * sleep after the previous reclaim attempt (the node is still unbalanced);
 * in that case return the zone index of the previous kswapd reclaim cycle.
 */
4349static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
4350 enum zone_type prev_highest_zoneidx)
4351{
4352 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
4353
4354 return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
4355}
4356
4357static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
4358 unsigned int highest_zoneidx)
4359{
4360 long remaining = 0;
4361 DEFINE_WAIT(wait);
4362
4363 if (freezing(current) || kthread_should_stop())
4364 return;
4365
4366 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
4367
	/*
	 * Try to sleep for a short interval. Note that kcompactd will only be
	 * woken if it is possible to sleep for a short interval. This is
	 * deliberate on the assumption that if reclaim cannot keep an
	 * eligible zone balanced that it's also unlikely that compaction will
	 * succeed.
	 */
4375 if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
4376
4377
4378
4379
4380
4381
4382 reset_isolation_suitable(pgdat);
4383
4384
4385
4386
4387
4388 wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
4389
4390 remaining = schedule_timeout(HZ/10);
4391
4392
4393
4394
4395
4396
4397 if (remaining) {
4398 WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
4399 kswapd_highest_zoneidx(pgdat,
4400 highest_zoneidx));
4401
4402 if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
4403 WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
4404 }
4405
4406 finish_wait(&pgdat->kswapd_wait, &wait);
4407 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
4408 }
4409
4410
4411
4412
4413
4414 if (!remaining &&
4415 prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
4416 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
4417
		/*
		 * vmstat counters are not perfectly accurate and the
		 * estimated value for counters such as NR_FREE_PAGES can
		 * deviate from the true value by nr_online_cpus * threshold.
		 * To avoid breaching watermarks unnoticed while under
		 * pressure, the tighter pressure threshold is used while
		 * kswapd is awake and the normal threshold restored while
		 * it sleeps.
		 */
4426 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
4427
4428 if (!kthread_should_stop())
4429 schedule();
4430
4431 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
4432 } else {
4433 if (remaining)
4434 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
4435 else
4436 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
4437 }
4438 finish_wait(&pgdat->kswapd_wait, &wait);
4439}
4440
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
4454static int kswapd(void *p)
4455{
4456 unsigned int alloc_order, reclaim_order;
4457 unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
4458 pg_data_t *pgdat = (pg_data_t *)p;
4459 struct task_struct *tsk = current;
4460 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
4461
4462 if (!cpumask_empty(cpumask))
4463 set_cpus_allowed_ptr(tsk, cpumask);
4464
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
4477 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
4478 set_freezable();
4479
4480 WRITE_ONCE(pgdat->kswapd_order, 0);
4481 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
4482 atomic_set(&pgdat->nr_writeback_throttled, 0);
4483 for ( ; ; ) {
4484 bool ret;
4485
4486 alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
4487 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
4488 highest_zoneidx);
4489
4490kswapd_try_sleep:
4491 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
4492 highest_zoneidx);
4493
4494
4495 alloc_order = READ_ONCE(pgdat->kswapd_order);
4496 highest_zoneidx = kswapd_highest_zoneidx(pgdat,
4497 highest_zoneidx);
4498 WRITE_ONCE(pgdat->kswapd_order, 0);
4499 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
4500
4501 ret = try_to_freeze();
4502 if (kthread_should_stop())
4503 break;
4504
4505
4506
4507
4508
4509 if (ret)
4510 continue;
4511
		/*
		 * Reclaim begins at the requested order but if a high-order
		 * reclaim fails then kswapd falls back to reclaiming for
		 * order-0. If that happens, kswapd will consider sleeping
		 * for the order it finished reclaiming at (reclaim_order)
		 * but kcompactd is woken to compact for the original
		 * request (alloc_order).
		 */
4520 trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
4521 alloc_order);
4522 reclaim_order = balance_pgdat(pgdat, alloc_order,
4523 highest_zoneidx);
4524 if (reclaim_order < alloc_order)
4525 goto kswapd_try_sleep;
4526 }
4527
4528 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
4529
4530 return 0;
4531}
4532
/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the
 * zone's pgdat.  It will wake up kcompactd after reclaiming memory.  If
 * kswapd reclaim has failed or is not needed, still wake up kcompactd if
 * only compaction is needed.
 */
4540void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
4541 enum zone_type highest_zoneidx)
4542{
4543 pg_data_t *pgdat;
4544 enum zone_type curr_idx;
4545
4546 if (!managed_zone(zone))
4547 return;
4548
4549 if (!cpuset_zone_allowed(zone, gfp_flags))
4550 return;
4551
4552 pgdat = zone->zone_pgdat;
4553 curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
4554
4555 if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
4556 WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
4557
4558 if (READ_ONCE(pgdat->kswapd_order) < order)
4559 WRITE_ONCE(pgdat->kswapd_order, order);
4560
4561 if (!waitqueue_active(&pgdat->kswapd_wait))
4562 return;
4563
4564
4565 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
4566 (pgdat_balanced(pgdat, order, highest_zoneidx) &&
4567 !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
		/*
		 * There may be plenty of free memory available, but it's too
		 * fragmented for high-order allocations.  Wake up kcompactd
		 * and rely on compaction_suitable() to determine if it's
		 * needed.  If it fails, it will defer subsequent attempts to
		 * ratelimit its work.
		 */
4575 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
4576 wakeup_kcompactd(pgdat, order, highest_zoneidx);
4577 return;
4578 }
4579
4580 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
4581 gfp_flags);
4582 wake_up_interruptible(&pgdat->kswapd_wait);
4583}
4584
4585#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number
 * of freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
4594unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
4595{
4596 struct scan_control sc = {
4597 .nr_to_reclaim = nr_to_reclaim,
4598 .gfp_mask = GFP_HIGHUSER_MOVABLE,
4599 .reclaim_idx = MAX_NR_ZONES - 1,
4600 .priority = DEF_PRIORITY,
4601 .may_writepage = 1,
4602 .may_unmap = 1,
4603 .may_swap = 1,
4604 .hibernation_mode = 1,
4605 };
4606 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4607 unsigned long nr_reclaimed;
4608 unsigned int noreclaim_flag;
4609
4610 fs_reclaim_acquire(sc.gfp_mask);
4611 noreclaim_flag = memalloc_noreclaim_save();
4612 set_task_reclaim_state(current, &sc.reclaim_state);
4613
4614 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4615
4616 set_task_reclaim_state(current, NULL);
4617 memalloc_noreclaim_restore(noreclaim_flag);
4618 fs_reclaim_release(sc.gfp_mask);
4619
4620 return nr_reclaimed;
4621}
4622#endif
4623
4624
4625
4626
4627
4628void kswapd_run(int nid)
4629{
4630 pg_data_t *pgdat = NODE_DATA(nid);
4631
4632 if (pgdat->kswapd)
4633 return;
4634
4635 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
4636 if (IS_ERR(pgdat->kswapd)) {
4637
4638 BUG_ON(system_state < SYSTEM_RUNNING);
4639 pr_err("Failed to start kswapd on node %d\n", nid);
4640 pgdat->kswapd = NULL;
4641 }
4642}
4643
4644
4645
4646
4647
4648void kswapd_stop(int nid)
4649{
4650 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4651
4652 if (kswapd) {
4653 kthread_stop(kswapd);
4654 NODE_DATA(nid)->kswapd = NULL;
4655 }
4656}
4657
4658static int __init kswapd_init(void)
4659{
4660 int nid;
4661
4662 swap_setup();
4663 for_each_node_state(nid, N_MEMORY)
4664 kswapd_run(nid);
4665 return 0;
4666}
4667
4668module_init(kswapd_init)
4669
4670#ifdef CONFIG_NUMA
/*
 * Node reclaim mode
 *
 * If non-zero call node_reclaim when the number of free pages falls below
 * the watermarks.
 */
4677int node_reclaim_mode __read_mostly;
4678
/*
 * Priority for NODE_RECLAIM. This determines the fraction of pages
 * of a node considered in a single reclaim pass (at priority 4 that is
 * 1/16th of the LRU lists).
 */
4684#define NODE_RECLAIM_PRIORITY 4
4685
4686
4687
4688
4689
4690int sysctl_min_unmapped_ratio = 1;
4691
4692
4693
4694
4695
4696int sysctl_min_slab_ratio = 5;
4697
4698static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4699{
4700 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4701 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4702 node_page_state(pgdat, NR_ACTIVE_FILE);
	/*
	 * It's possible for there to be more file mapped pages than
	 * accounted for by the pages on the file LRU lists because
	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
	 */
4709 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4710}
4711
4712
4713static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4714{
4715 unsigned long nr_pagecache_reclaimable;
4716 unsigned long delta = 0;
4717
4718
4719
4720
4721
4722
4723
4724 if (node_reclaim_mode & RECLAIM_UNMAP)
4725 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4726 else
4727 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4728
4729
4730 if (!(node_reclaim_mode & RECLAIM_WRITE))
4731 delta += node_page_state(pgdat, NR_FILE_DIRTY);
4732
4733
4734 if (unlikely(delta > nr_pagecache_reclaimable))
4735 delta = nr_pagecache_reclaimable;
4736
4737 return nr_pagecache_reclaimable - delta;
4738}
4739
4740
4741
4742
4743static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4744{
4745
4746 const unsigned long nr_pages = 1 << order;
4747 struct task_struct *p = current;
4748 unsigned int noreclaim_flag;
4749 struct scan_control sc = {
4750 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4751 .gfp_mask = current_gfp_context(gfp_mask),
4752 .order = order,
4753 .priority = NODE_RECLAIM_PRIORITY,
4754 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4755 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4756 .may_swap = 1,
4757 .reclaim_idx = gfp_zone(gfp_mask),
4758 };
4759 unsigned long pflags;
4760
4761 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4762 sc.gfp_mask);
4763
4764 cond_resched();
4765 psi_memstall_enter(&pflags);
4766 fs_reclaim_acquire(sc.gfp_mask);
4767
4768
4769
4770
4771
4772 noreclaim_flag = memalloc_noreclaim_save();
4773 p->flags |= PF_SWAPWRITE;
4774 set_task_reclaim_state(p, &sc.reclaim_state);
4775
4776 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4777
4778
4779
4780
4781 do {
4782 shrink_node(pgdat, &sc);
4783 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4784 }
4785
4786 set_task_reclaim_state(p, NULL);
4787 current->flags &= ~PF_SWAPWRITE;
4788 memalloc_noreclaim_restore(noreclaim_flag);
4789 fs_reclaim_release(sc.gfp_mask);
4790 psi_memstall_leave(&pflags);
4791
4792 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4793
4794 return sc.nr_reclaimed >= nr_pages;
4795}
4796
4797int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4798{
4799 int ret;
4800
	/*
	 * Node reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the node is overallocated. So we do not reclaim
	 * if less than a specified percentage of the node is used by
	 * unmapped file backed pages.
	 */
4811 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4812 node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
4813 pgdat->min_slab_pages)
4814 return NODE_RECLAIM_FULL;
4815
4816
4817
4818
4819 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4820 return NODE_RECLAIM_NOSCAN;
4821
	/*
	 * Only run node reclaim on the local node or on nodes that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
4828 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4829 return NODE_RECLAIM_NOSCAN;
4830
4831 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4832 return NODE_RECLAIM_NOSCAN;
4833
4834 ret = __node_reclaim(pgdat, gfp_mask, order);
4835 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4836
4837 if (!ret)
4838 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4839
4840 return ret;
4841}
4842#endif
4843
/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * appropriate zone lru list
 * @pvec: pagevec with lru pages to check
 *
 * Checks pages for evictability, if an evictable page is in the unevictable
 * lru list, moves it to the appropriate evictable lru list. This function
 * should be only used for lru pages.
 */
4853void check_move_unevictable_pages(struct pagevec *pvec)
4854{
4855 struct lruvec *lruvec = NULL;
4856 int pgscanned = 0;
4857 int pgrescued = 0;
4858 int i;
4859
4860 for (i = 0; i < pvec->nr; i++) {
4861 struct page *page = pvec->pages[i];
4862 struct folio *folio = page_folio(page);
4863 int nr_pages;
4864
4865 if (PageTransTail(page))
4866 continue;
4867
4868 nr_pages = thp_nr_pages(page);
4869 pgscanned += nr_pages;
4870
4871
4872 if (!TestClearPageLRU(page))
4873 continue;
4874
4875 lruvec = folio_lruvec_relock_irq(folio, lruvec);
4876 if (page_evictable(page) && PageUnevictable(page)) {
4877 del_page_from_lru_list(page, lruvec);
4878 ClearPageUnevictable(page);
4879 add_page_to_lru_list(page, lruvec);
4880 pgrescued += nr_pages;
4881 }
4882 SetPageLRU(page);
4883 }
4884
4885 if (lruvec) {
4886 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4887 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4888 unlock_page_lruvec_irq(lruvec);
4889 } else if (pgscanned) {
4890 count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4891 }
4892}
4893EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4894