1
2
3
4
5
6
7
8
9
10
11
12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15#include <linux/mm.h>
16#include <linux/sched/mm.h>
17#include <linux/module.h>
18#include <linux/gfp.h>
19#include <linux/kernel_stat.h>
20#include <linux/swap.h>
21#include <linux/pagemap.h>
22#include <linux/init.h>
23#include <linux/highmem.h>
24#include <linux/vmpressure.h>
25#include <linux/vmstat.h>
26#include <linux/file.h>
27#include <linux/writeback.h>
28#include <linux/blkdev.h>
29#include <linux/buffer_head.h>
30
31#include <linux/mm_inline.h>
32#include <linux/backing-dev.h>
33#include <linux/rmap.h>
34#include <linux/topology.h>
35#include <linux/cpu.h>
36#include <linux/cpuset.h>
37#include <linux/compaction.h>
38#include <linux/notifier.h>
39#include <linux/rwsem.h>
40#include <linux/delay.h>
41#include <linux/kthread.h>
42#include <linux/freezer.h>
43#include <linux/memcontrol.h>
44#include <linux/migrate.h>
45#include <linux/delayacct.h>
46#include <linux/sysctl.h>
47#include <linux/oom.h>
48#include <linux/pagevec.h>
49#include <linux/prefetch.h>
50#include <linux/printk.h>
51#include <linux/dax.h>
52#include <linux/psi.h>
53
54#include <asm/tlbflush.h>
55#include <asm/div64.h>
56
57#include <linux/swapops.h>
58#include <linux/balloon_compaction.h>
59
60#include "internal.h"
61
62#define CREATE_TRACE_POINTS
63#include <trace/events/vmscan.h>
64
struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and is the primary
	 * target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long anon_cost;
	unsigned long file_cost;

	/* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
	 * don't threaten to OOM. If any cgroup is reclaimed at
	 * reduced force or passed over entirely due to its memory.low
	 * setting (memcg_low_skipped), and nothing is reclaimed as a
	 * result, then go back for one more cycle that reclaims the
	 * protected memory (memcg_low_reclaim) to avert OOM.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
	unsigned int cache_trim_mode:1;

	/* The file pages on the current node are dangerously low */
	unsigned int file_is_tiny:1;

	/* Always discard instead of demoting to lower tier memory */
	unsigned int no_demotion:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
};
159
#ifdef ARCH_HAS_PREFETCHW
/*
 * Prefetch (for write) the LRU entry that precedes @_page while walking
 * an LRU list backwards, unless we have already reached the list head
 * @_base.  No-op on architectures without prefetchw support.
 */
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
173
/*
 * vm_swappiness: relative willingness to reclaim anonymous pages (swap)
 * versus file pages.  Higher means more swappy; default is 60.
 * NOTE(review): conventionally tunable via sysctl in the 0..200 range —
 * confirm against the sysctl table that registers it.
 */
int vm_swappiness = 60;
178
/* Install (or clear, when @rs is NULL) @task's reclaim_state pointer. */
static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite */
	WARN_ON_ONCE(rs && task->reclaim_state);

	/* Check for the nulling of an already-nulled state */
	WARN_ON_ONCE(!rs && !task->reclaim_state);

	task->reclaim_state = rs;
}
190
/* All registered shrinkers; walks and (un)registration serialize on the rwsem. */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
193
#ifdef CONFIG_MEMCG
/* Number of shrinker ids the per-memcg maps currently accommodate */
static int shrinker_nr_max;
196
197
198static inline int shrinker_map_size(int nr_items)
199{
200 return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
201}
202
203static inline int shrinker_defer_size(int nr_items)
204{
205 return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
206}
207
/*
 * Fetch @memcg's shrinker_info for @nid without rcu_read_lock(): the
 * caller must hold shrinker_rwsem, which the lockdep expression encodes.
 */
static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_rwsem));
}
214
/*
 * Grow @memcg's per-node shrinker_info to @map_size bitmap bytes plus
 * @defer_size deferred-counter bytes, preserving the old contents.
 * The old info is freed after an RCU grace period so lockless readers
 * (set_shrinker_bit()) stay safe.  Caller holds shrinker_rwsem for write.
 */
static int expand_one_shrinker_info(struct mem_cgroup *memcg,
				    int map_size, int defer_size,
				    int old_map_size, int old_defer_size)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;
	int size = map_size + defer_size;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg: nothing allocated to expand. */
		if (!old)
			return 0;

		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		/* The nr_deferred array precedes the bitmap in the allocation. */
		new->nr_deferred = (atomic_long_t *)(new + 1);
		new->map = (void *)new->nr_deferred + defer_size;

		/* map: set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_map_size);
		memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
		/* nr_deferred: copy old values, clear all new values */
		memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
		memset((void *)new->nr_deferred + old_defer_size, 0,
		       defer_size - old_defer_size);

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}
252
/*
 * Free @memcg's per-node shrinker_info.
 * NOTE(review): frees without waiting for an RCU grace period, so the
 * caller must guarantee no concurrent lockless readers — presumably
 * only invoked on memcg teardown; confirm against callers.
 */
void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}
266
/*
 * Allocate per-node shrinker_info (deferred counters followed by the
 * shrinker bitmap) for a new @memcg, sized for the current
 * shrinker_nr_max.  On allocation failure, already-allocated nodes are
 * torn down via free_shrinker_info() and -ENOMEM is returned.
 */
int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int nid, size, ret = 0;
	int map_size, defer_size = 0;

	/* Hold the rwsem so shrinker_nr_max cannot grow underneath us. */
	down_write(&shrinker_rwsem);
	map_size = shrinker_map_size(shrinker_nr_max);
	defer_size = shrinker_defer_size(shrinker_nr_max);
	size = map_size + defer_size;
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
		if (!info) {
			free_shrinker_info(memcg);
			ret = -ENOMEM;
			break;
		}
		info->nr_deferred = (atomic_long_t *)(info + 1);
		info->map = (void *)info->nr_deferred + defer_size;
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	up_write(&shrinker_rwsem);

	return ret;
}
292
/*
 * The maps grow in BITS_PER_LONG chunks, so an expansion is only needed
 * when @nr_max crosses into a chunk beyond the current shrinker_nr_max.
 */
static inline bool need_expand(int nr_max)
{
	return round_up(nr_max, BITS_PER_LONG) >
	       round_up(shrinker_nr_max, BITS_PER_LONG);
}
298
/*
 * Grow every memcg's shrinker_info so that shrinker id @new_id fits.
 * Caller holds shrinker_rwsem for write.  On success shrinker_nr_max is
 * raised to cover the new id.
 */
static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = new_id + 1;
	int map_size, defer_size = 0;
	int old_map_size, old_defer_size = 0;
	struct mem_cgroup *memcg;

	/* Still fits within the current chunked allocation? */
	if (!need_expand(new_nr_max))
		goto out;

	/* No memcgs have been set up yet: only record the new maximum. */
	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_rwsem);

	map_size = shrinker_map_size(new_nr_max);
	defer_size = shrinker_defer_size(new_nr_max);
	old_map_size = shrinker_map_size(shrinker_nr_max);
	old_defer_size = shrinker_defer_size(shrinker_nr_max);

	/* Walk the whole memcg hierarchy, expanding each cgroup's info. */
	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
					       old_map_size, old_defer_size);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}
335
/*
 * Mark shrinker @shrinker_id as having objects to reclaim for @memcg on
 * node @nid, so shrink_slab_memcg() will invoke it.  Root memcg uses the
 * global shrinker path and needs no bit.
 */
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		/* Pairs with smp_mb__after_atomic() in shrink_slab_memcg() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, info->map);
		rcu_read_unlock();
	}
}
349
/* Maps shrinker id -> struct shrinker for memcg-aware shrinkers */
static DEFINE_IDR(shrinker_idr);
351
/*
 * Assign @shrinker an id in shrinker_idr and grow the per-memcg maps if
 * the new id does not fit yet.  Returns 0 on success, -ENOSYS when memcg
 * is disabled (caller falls back to the non-memcg path), or -ENOMEM.
 */
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	down_write(&shrinker_rwsem);
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		/* Undo the idr allocation if the maps cannot be grown. */
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	up_write(&shrinker_rwsem);
	return ret;
}
377
/* Return @shrinker's id to the idr.  Caller holds shrinker_rwsem for write. */
static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_rwsem);

	idr_remove(&shrinker_idr, id);
}
388
389static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
390 struct mem_cgroup *memcg)
391{
392 struct shrinker_info *info;
393
394 info = shrinker_info_protected(memcg, nid);
395 return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
396}
397
398static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
399 struct mem_cgroup *memcg)
400{
401 struct shrinker_info *info;
402
403 info = shrinker_info_protected(memcg, nid);
404 return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
405}
406
/*
 * Move a dying @memcg's deferred shrinker work into its parent (or the
 * root memcg) so the pending work is not lost.
 */
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int i, nid;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent the maps from being resized while we walk them. */
	down_read(&shrinker_rwsem);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (i = 0; i < shrinker_nr_max; i++) {
			nr = atomic_long_read(&child_info->nr_deferred[i]);
			atomic_long_add(nr, &parent_info->nr_deferred[i]);
		}
	}
	up_read(&shrinker_rwsem);
}
430
431static bool cgroup_reclaim(struct scan_control *sc)
432{
433 return sc->target_mem_cgroup;
434}
435
/*
 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
static bool writeback_throttling_sane(struct scan_control *sc)
{
	if (!cgroup_reclaim(sc))
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
459#else
/* !CONFIG_MEMCG: no per-memcg shrinker infrastructure exists. */
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}

/* Without memcg, all reclaim is global. */
static bool cgroup_reclaim(struct scan_control *sc)
{
	return false;
}

/* Global reclaim always has functional dirty throttling. */
static bool writeback_throttling_sane(struct scan_control *sc)
{
	return true;
}
490#endif
491
492static long xchg_nr_deferred(struct shrinker *shrinker,
493 struct shrink_control *sc)
494{
495 int nid = sc->nid;
496
497 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
498 nid = 0;
499
500 if (sc->memcg &&
501 (shrinker->flags & SHRINKER_MEMCG_AWARE))
502 return xchg_nr_deferred_memcg(nid, shrinker,
503 sc->memcg);
504
505 return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
506}
507
508
509static long add_nr_deferred(long nr, struct shrinker *shrinker,
510 struct shrink_control *sc)
511{
512 int nid = sc->nid;
513
514 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
515 nid = 0;
516
517 if (sc->memcg &&
518 (shrinker->flags & SHRINKER_MEMCG_AWARE))
519 return add_nr_deferred_memcg(nr, nid, shrinker,
520 sc->memcg);
521
522 return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
523}
524
525static bool can_demote(int nid, struct scan_control *sc)
526{
527 if (!numa_demotion_enabled)
528 return false;
529 if (sc) {
530 if (sc->no_demotion)
531 return false;
532
533 if (cgroup_reclaim(sc))
534 return false;
535 }
536 if (next_demotion_node(nid) == NUMA_NO_NODE)
537 return false;
538
539 return true;
540}
541
542static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
543 int nid,
544 struct scan_control *sc)
545{
546 if (memcg == NULL) {
547
548
549
550
551 if (get_nr_swap_pages() > 0)
552 return true;
553 } else {
554
555 if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
556 return true;
557 }
558
559
560
561
562
563
564 return can_demote(nid, sc);
565}
566
567
568
569
570
571
572unsigned long zone_reclaimable_pages(struct zone *zone)
573{
574 unsigned long nr;
575
576 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
577 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
578 if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
579 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
580 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
581
582 return nr;
583}
584
585
586
587
588
589
590
591static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
592 int zone_idx)
593{
594 unsigned long size = 0;
595 int zid;
596
597 for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
598 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
599
600 if (!managed_zone(zone))
601 continue;
602
603 if (!mem_cgroup_disabled())
604 size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
605 else
606 size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
607 }
608 return size;
609}
610
611
612
613
614int prealloc_shrinker(struct shrinker *shrinker)
615{
616 unsigned int size;
617 int err;
618
619 if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
620 err = prealloc_memcg_shrinker(shrinker);
621 if (err != -ENOSYS)
622 return err;
623
624 shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
625 }
626
627 size = sizeof(*shrinker->nr_deferred);
628 if (shrinker->flags & SHRINKER_NUMA_AWARE)
629 size *= nr_node_ids;
630
631 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
632 if (!shrinker->nr_deferred)
633 return -ENOMEM;
634
635 return 0;
636}
637
/* Undo prealloc_shrinker() for a shrinker that was never registered. */
void free_prealloced_shrinker(struct shrinker *shrinker)
{
	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		down_write(&shrinker_rwsem);
		unregister_memcg_shrinker(shrinker);
		up_write(&shrinker_rwsem);
		return;
	}

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
650
/* Publish a preallocated @shrinker on shrinker_list and mark it live. */
void register_shrinker_prepared(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	up_write(&shrinker_rwsem);
}
658
/*
 * register_shrinker - allocate bookkeeping for @shrinker and make it
 * visible to reclaim.  Returns 0 on success or a negative errno.
 */
int register_shrinker(struct shrinker *shrinker)
{
	int err = prealloc_shrinker(shrinker);

	if (err)
		return err;
	register_shrinker_prepared(shrinker);
	return 0;
}
EXPORT_SYMBOL(register_shrinker);
669
/*
 * unregister_shrinker - remove @shrinker from the shrinker list and
 * free its bookkeeping.  No-op if the shrinker was never registered.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	if (!(shrinker->flags & SHRINKER_REGISTERED))
		return;

	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	shrinker->flags &= ~SHRINKER_REGISTERED;
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);
	up_write(&shrinker_rwsem);

	/* Safe to free once no shrink_slab() walk can see the shrinker. */
	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);
689
/* Default number of objects asked from a shrinker per scan_objects() call */
#define SHRINK_BATCH 128

/*
 * Run one shrinker: compute the scan target from its freeable count,
 * the reclaim @priority, and previously deferred work, then call
 * ->scan_objects() in batches.  Unused scan work is re-deferred for the
 * next invocation.  Returns the number of objects freed (or SHRINK_EMPTY
 * passed through from ->count_objects()).
 */
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
				    struct shrinker *shrinker, int priority)
{
	unsigned long freed = 0;
	unsigned long long delta;
	long total_scan;
	long freeable;
	long nr;
	long new_nr;
	long batch_size = shrinker->batch ? shrinker->batch
					  : SHRINK_BATCH;
	long scanned = 0, next_deferred;

	freeable = shrinker->count_objects(shrinker, shrinkctl);
	if (freeable == 0 || freeable == SHRINK_EMPTY)
		return freeable;

	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = xchg_nr_deferred(shrinker, shrinkctl);

	if (shrinker->seeks) {
		/* Scale the target by seek cost: freeable * 4 / (seeks << priority) */
		delta = freeable >> priority;
		delta *= 4;
		do_div(delta, shrinker->seeks);
	} else {
		/*
		 * These objects don't require any IO to create. Trim
		 * them aggressively under memory pressure to keep
		 * them from causing refetches in the IO caches.
		 */
		delta = freeable / 2;
	}

	total_scan = nr >> priority;
	total_scan += delta;
	total_scan = min(total_scan, (2 * freeable));

	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
				   freeable, delta, total_scan, priority);

	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations although there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan). If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much as
	 * possible.
	 */
	while (total_scan >= batch_size ||
	       total_scan >= freeable) {
		unsigned long ret;
		unsigned long nr_to_scan = min(batch_size, total_scan);

		shrinkctl->nr_to_scan = nr_to_scan;
		shrinkctl->nr_scanned = nr_to_scan;
		ret = shrinker->scan_objects(shrinker, shrinkctl);
		if (ret == SHRINK_STOP)
			break;
		freed += ret;

		count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
		total_scan -= shrinkctl->nr_scanned;
		scanned += shrinkctl->nr_scanned;

		cond_resched();
	}

	/*
	 * The deferred work is increased by any new work (delta) that wasn't
	 * done, decreased by old deferred work that was done now.
	 *
	 * And it is capped to two times of the freeable items.
	 */
	next_deferred = max_t(long, (nr + delta - scanned), 0);
	next_deferred = min(next_deferred, (2 * freeable));

	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
	new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);

	trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
	return freed;
}
788
#ifdef CONFIG_MEMCG
/*
 * Run the memcg-aware shrinkers whose bit is set in @memcg's per-node
 * shrinker map.  Returns the number of reclaimed objects (forced to at
 * least 1 if we bailed out early on rwsem contention after freeing
 * something, so the caller knows progress was made).
 */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	struct shrinker_info *info;
	unsigned long ret, freed = 0;
	int i;

	if (!mem_cgroup_online(memcg))
		return 0;

	/* trylock: never stall a pending shrinker (un)registration. */
	if (!down_read_trylock(&shrinker_rwsem))
		return 0;

	info = shrinker_info_protected(memcg, nid);
	if (unlikely(!info))
		goto unlock;

	for_each_set_bit(i, info->map, shrinker_nr_max) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};
		struct shrinker *shrinker;

		shrinker = idr_find(&shrinker_idr, i);
		if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
			if (!shrinker)
				clear_bit(i, info->map);
			continue;
		}

		/* Call non-slab shrinkers even though kmem is disabled */
		if (!memcg_kmem_enabled() &&
		    !(shrinker->flags & SHRINKER_NONSLAB))
			continue;

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY) {
			clear_bit(i, info->map);
			/*
			 * After the shrinker reported that it had no objects to
			 * free, but before we cleared the corresponding bit in
			 * the memcg shrinker map, a new object might have been
			 * added. To make sure, we have the bit set in this
			 * case, we invoke the shrinker one more time and reset
			 * the bit if it reports that it is not empty anymore.
			 * The memory barrier here pairs with the barrier in
			 * set_shrinker_bit():
			 *
			 * list_lru_add()     shrink_slab_memcg()
			 *   list_add_tail()    clear_bit()
			 *   <MB>               <MB>
			 *   set_bit()          do_shrink_slab()
			 */
			smp_mb__after_atomic();
			ret = do_shrink_slab(&sc, shrinker, priority);
			if (ret == SHRINK_EMPTY)
				ret = 0;
			else
				set_shrinker_bit(memcg, nid, i);
		}
		freed += ret;

		/* Yield to a waiting writer (register/unregister). */
		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}
unlock:
	up_read(&shrinker_rwsem);
	return freed;
}
#else
/* !CONFIG_MEMCG: there are no memcg-aware shrinkers to run. */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
			struct mem_cgroup *memcg, int priority)
{
	return 0;
}
#endif
870
/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by
 * priority in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
				 struct mem_cgroup *memcg,
				 int priority)
{
	unsigned long ret, freed = 0;
	struct shrinker *shrinker;

	/*
	 * The root memcg might be allocated even though memcg is disabled
	 * via "cgroup_disable=memory" boot parameter.  This could make
	 * mem_cgroup_is_root() return false, then just run memcg slab
	 * shrink, but skip global shrink.  This may result in premature
	 * oom.
	 */
	if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
		return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

	if (!down_read_trylock(&shrinker_rwsem))
		goto out;

	list_for_each_entry(shrinker, &shrinker_list, list) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};

		ret = do_shrink_slab(&sc, shrinker, priority);
		if (ret == SHRINK_EMPTY)
			ret = 0;
		freed += ret;
		/*
		 * Bail out if someone wants to register a new shrinker to
		 * prevent the registration from being stalled for long periods
		 * by parallel ongoing shrinking.
		 */
		if (rwsem_is_contended(&shrinker_rwsem)) {
			freed = freed ? : 1;
			break;
		}
	}

	up_read(&shrinker_rwsem);
out:
	cond_resched();
	return freed;
}
938
/*
 * Shrink all slab caches on @nid at maximum pressure (priority 0),
 * iterating over every memcg.  Loops until the amount freed per pass,
 * measured against an exponentially growing threshold, tails off.
 */
void drop_slab_node(int nid)
{
	unsigned long freed;
	int shift = 0;

	do {
		struct mem_cgroup *memcg = NULL;

		if (fatal_signal_pending(current))
			return;

		freed = 0;
		memcg = mem_cgroup_iter(NULL, NULL, NULL);
		do {
			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
	} while ((freed >> shift++) > 1);
}
957
/* Drop reclaimable slab objects on every online node. */
void drop_slab(void)
{
	int nid;

	for_each_online_node(nid)
		drop_slab_node(nid);
}
965
static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache itself (one pin per
	 * subpage for compound pages) and optional buffer heads at
	 * page->private.
	 */
	int cache_pins = thp_nr_pages(page);

	return page_count(page) == 1 + cache_pins + page_has_private(page);
}
976
977static int may_write_to_inode(struct inode *inode)
978{
979 if (current->flags & PF_SWAPWRITE)
980 return 1;
981 if (!inode_write_congested(inode))
982 return 1;
983 if (inode_to_bdi(inode) == current->backing_dev_info)
984 return 1;
985 return 0;
986}
987
/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}
1008
/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
1020
/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking.  To prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in __generic_file_write_iter() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling. This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (page_has_private(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				pr_info("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_inode(mapping->host))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		trace_mm_vmscan_writepage(page);
		inc_node_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}
1094
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page,
			    bool reclaimed, struct mem_cgroup *target_memcg)
{
	int refcount;
	void *shadow = NULL;

	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	xa_lock_irq(&mapping->i_pages);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_refcount.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under the i_pages lock, then this ordering is not required.
	 */
	refcount = 1 + compound_nr(page);
	if (!page_ref_freeze(page, refcount))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_ref_unfreeze(page, refcount);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		mem_cgroup_swapout(page, swap);
		if (reclaimed && !mapping_exiting(mapping))
			shadow = workingset_eviction(page, target_memcg);
		__delete_from_swap_cache(page, swap, shadow);
		xa_unlock_irq(&mapping->i_pages);
		put_swap_page(page, swap);
	} else {
		void (*freepage)(struct page *);

		freepage = mapping->a_ops->freepage;
		/*
		 * Remember a shadow entry for reclaimed file cache in
		 * order to detect refaults, thus thrashing, later on.
		 *
		 * But don't store shadows in an address space that is
		 * already exiting.  This is not just an optimization,
		 * inode reclaim needs to empty out the radix tree or
		 * the nodes are lost.  Don't plant shadows behind its
		 * back.
		 *
		 * We also don't store shadows for DAX mappings because the
		 * only page cache pages found in these are zero pages
		 * covering holes, and because we don't want to mix DAX
		 * exceptional entries and shadow exceptional entries in the
		 * same address_space.
		 */
		if (reclaimed && page_is_file_lru(page) &&
		    !mapping_exiting(mapping) && !dax_mapping(mapping))
			shadow = workingset_eviction(page, target_memcg);
		__delete_from_page_cache(page, shadow);
		xa_unlock_irq(&mapping->i_pages);

		if (freepage != NULL)
			freepage(page);
	}

	return 1;

cannot_free:
	xa_unlock_irq(&mapping->i_pages);
	return 0;
}
1187
1188
1189
1190
1191
1192
1193
1194int remove_mapping(struct address_space *mapping, struct page *page)
1195{
1196 if (__remove_mapping(mapping, page, false, NULL)) {
1197
1198
1199
1200
1201
1202 page_ref_unfreeze(page, 1);
1203 return 1;
1204 }
1205 return 0;
1206}
1207
/**
 * putback_lru_page - put previously isolated page onto appropriate lru list
 * @page: page to be put back to appropriate lru list
 *
 * Returns a previously isolated @page to the LRU via the pagevec cache
 * and drops the reference taken at isolation time.
 */
void putback_lru_page(struct page *page)
{
	lru_cache_add(page);
	put_page(page);
}
1222
/* Verdict of page_check_references() for an inactive page */
enum page_references {
	PAGEREF_RECLAIM,	/* try to reclaim the page */
	PAGEREF_RECLAIM_CLEAN,	/* reclaim only if the page is clean */
	PAGEREF_KEEP,		/* keep on the inactive list */
	PAGEREF_ACTIVATE,	/* move to the active list */
};
1229
/* Decide the fate of an inactive page based on its recent references. */
static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
					  &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	/*
	 * Mlock lost the isolation race with us.  Let try_to_unmap()
	 * move the page to the unevictable list.
	 */
	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
		SetPageReferenced(page);

		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;

		/*
		 * Activate file-backed executable pages after first usage.
		 */
		if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
			return PAGEREF_ACTIVATE;

		return PAGEREF_KEEP;
	}

	/* Reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
}
1282
/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
				       bool *dirty, bool *writeback)
{
	struct address_space *mapping;

	/*
	 * Anonymous pages are not handled by flushers and must be written
	 * from reclaim context. Do not stall reclaim based on them
	 */
	if (!page_is_file_lru(page) ||
	    (PageAnon(page) && !PageSwapBacked(page))) {
		*dirty = false;
		*writeback = false;
		return;
	}

	/* By default assume that the page flags are accurate */
	*dirty = PageDirty(page);
	*writeback = PageWriteback(page);

	/* Verify dirty/writeback state if the filesystem supports it */
	if (!page_has_private(page))
		return;

	mapping = page_mapping(page);
	if (mapping && mapping->a_ops->is_dirty_writeback)
		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}
1312
/* Migration target allocator for demotion: allocate on the target node. */
static struct page *alloc_demote_page(struct page *page, unsigned long node)
{
	struct migration_target_control mtc = {
		/*
		 * Allocate from 'node', but fail quickly and quietly.
		 * When this happens, 'page' will likely just be discarded
		 * instead of migrated.
		 */
		.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
			    __GFP_THISNODE | __GFP_NOWARN |
			    __GFP_NOMEMALLOC | GFP_NOWAIT,
		.nid = node
	};

	return alloc_migration_target(page, (unsigned long)&mtc);
}
1329
/*
 * Take pages on @demote_pages and attempt to demote them to another
 * node.  Pages which are not demoted are left on @demote_pages.
 * Returns the number of pages successfully demoted.
 */
static unsigned int demote_page_list(struct list_head *demote_pages,
				     struct pglist_data *pgdat)
{
	int target_nid = next_demotion_node(pgdat->node_id);
	unsigned int nr_succeeded;
	int err;

	if (list_empty(demote_pages))
		return 0;

	if (target_nid == NUMA_NO_NODE)
		return 0;

	/* Demotion ignores all cpuset and mempolicy settings */
	err = migrate_pages(demote_pages, alloc_demote_page, NULL,
			    target_nid, MIGRATE_ASYNC, MR_DEMOTION,
			    &nr_succeeded);

	if (current_is_kswapd())
		__count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
	else
		__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);

	return nr_succeeded;
}
1360
1361
1362
1363
1364static unsigned int shrink_page_list(struct list_head *page_list,
1365 struct pglist_data *pgdat,
1366 struct scan_control *sc,
1367 struct reclaim_stat *stat,
1368 bool ignore_references)
1369{
1370 LIST_HEAD(ret_pages);
1371 LIST_HEAD(free_pages);
1372 LIST_HEAD(demote_pages);
1373 unsigned int nr_reclaimed = 0;
1374 unsigned int pgactivate = 0;
1375 bool do_demote_pass;
1376
1377 memset(stat, 0, sizeof(*stat));
1378 cond_resched();
1379 do_demote_pass = can_demote(pgdat->node_id, sc);
1380
1381retry:
1382 while (!list_empty(page_list)) {
1383 struct address_space *mapping;
1384 struct page *page;
1385 enum page_references references = PAGEREF_RECLAIM;
1386 bool dirty, writeback, may_enter_fs;
1387 unsigned int nr_pages;
1388
1389 cond_resched();
1390
1391 page = lru_to_page(page_list);
1392 list_del(&page->lru);
1393
1394 if (!trylock_page(page))
1395 goto keep;
1396
1397 VM_BUG_ON_PAGE(PageActive(page), page);
1398
1399 nr_pages = compound_nr(page);
1400
1401
1402 sc->nr_scanned += nr_pages;
1403
1404 if (unlikely(!page_evictable(page)))
1405 goto activate_locked;
1406
1407 if (!sc->may_unmap && page_mapped(page))
1408 goto keep_locked;
1409
1410 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1411 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1412
1413
1414
1415
1416
1417
1418
1419 page_check_dirty_writeback(page, &dirty, &writeback);
1420 if (dirty || writeback)
1421 stat->nr_dirty++;
1422
1423 if (dirty && !writeback)
1424 stat->nr_unqueued_dirty++;
1425
1426
1427
1428
1429
1430
1431
1432 mapping = page_mapping(page);
1433 if (((dirty || writeback) && mapping &&
1434 inode_write_congested(mapping->host)) ||
1435 (writeback && PageReclaim(page)))
1436 stat->nr_congested++;
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480 if (PageWriteback(page)) {
1481
1482 if (current_is_kswapd() &&
1483 PageReclaim(page) &&
1484 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1485 stat->nr_immediate++;
1486 goto activate_locked;
1487
1488
1489 } else if (writeback_throttling_sane(sc) ||
1490 !PageReclaim(page) || !may_enter_fs) {
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502 SetPageReclaim(page);
1503 stat->nr_writeback++;
1504 goto activate_locked;
1505
1506
1507 } else {
1508 unlock_page(page);
1509 wait_on_page_writeback(page);
1510
1511 list_add_tail(&page->lru, page_list);
1512 continue;
1513 }
1514 }
1515
1516 if (!ignore_references)
1517 references = page_check_references(page, sc);
1518
1519 switch (references) {
1520 case PAGEREF_ACTIVATE:
1521 goto activate_locked;
1522 case PAGEREF_KEEP:
1523 stat->nr_ref_keep += nr_pages;
1524 goto keep_locked;
1525 case PAGEREF_RECLAIM:
1526 case PAGEREF_RECLAIM_CLEAN:
1527 ;
1528 }
1529
1530
1531
1532
1533
1534 if (do_demote_pass &&
1535 (thp_migration_supported() || !PageTransHuge(page))) {
1536 list_add(&page->lru, &demote_pages);
1537 unlock_page(page);
1538 continue;
1539 }
1540
1541
1542
1543
1544
1545
1546 if (PageAnon(page) && PageSwapBacked(page)) {
1547 if (!PageSwapCache(page)) {
1548 if (!(sc->gfp_mask & __GFP_IO))
1549 goto keep_locked;
1550 if (page_maybe_dma_pinned(page))
1551 goto keep_locked;
1552 if (PageTransHuge(page)) {
1553
1554 if (!can_split_huge_page(page, NULL))
1555 goto activate_locked;
1556
1557
1558
1559
1560
1561 if (!compound_mapcount(page) &&
1562 split_huge_page_to_list(page,
1563 page_list))
1564 goto activate_locked;
1565 }
1566 if (!add_to_swap(page)) {
1567 if (!PageTransHuge(page))
1568 goto activate_locked_split;
1569
1570 if (split_huge_page_to_list(page,
1571 page_list))
1572 goto activate_locked;
1573#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1574 count_vm_event(THP_SWPOUT_FALLBACK);
1575#endif
1576 if (!add_to_swap(page))
1577 goto activate_locked_split;
1578 }
1579
1580 may_enter_fs = true;
1581
1582
1583 mapping = page_mapping(page);
1584 }
1585 } else if (unlikely(PageTransHuge(page))) {
1586
1587 if (split_huge_page_to_list(page, page_list))
1588 goto keep_locked;
1589 }
1590
1591
1592
1593
1594
1595
1596
1597
1598 if ((nr_pages > 1) && !PageTransHuge(page)) {
1599 sc->nr_scanned -= (nr_pages - 1);
1600 nr_pages = 1;
1601 }
1602
1603
1604
1605
1606
1607 if (page_mapped(page)) {
1608 enum ttu_flags flags = TTU_BATCH_FLUSH;
1609 bool was_swapbacked = PageSwapBacked(page);
1610
1611 if (unlikely(PageTransHuge(page)))
1612 flags |= TTU_SPLIT_HUGE_PMD;
1613
1614 try_to_unmap(page, flags);
1615 if (page_mapped(page)) {
1616 stat->nr_unmap_fail += nr_pages;
1617 if (!was_swapbacked && PageSwapBacked(page))
1618 stat->nr_lazyfree_fail += nr_pages;
1619 goto activate_locked;
1620 }
1621 }
1622
1623 if (PageDirty(page)) {
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634 if (page_is_file_lru(page) &&
1635 (!current_is_kswapd() || !PageReclaim(page) ||
1636 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1637
1638
1639
1640
1641
1642
1643 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1644 SetPageReclaim(page);
1645
1646 goto activate_locked;
1647 }
1648
1649 if (references == PAGEREF_RECLAIM_CLEAN)
1650 goto keep_locked;
1651 if (!may_enter_fs)
1652 goto keep_locked;
1653 if (!sc->may_writepage)
1654 goto keep_locked;
1655
1656
1657
1658
1659
1660
1661 try_to_unmap_flush_dirty();
1662 switch (pageout(page, mapping)) {
1663 case PAGE_KEEP:
1664 goto keep_locked;
1665 case PAGE_ACTIVATE:
1666 goto activate_locked;
1667 case PAGE_SUCCESS:
1668 stat->nr_pageout += thp_nr_pages(page);
1669
1670 if (PageWriteback(page))
1671 goto keep;
1672 if (PageDirty(page))
1673 goto keep;
1674
1675
1676
1677
1678
1679 if (!trylock_page(page))
1680 goto keep;
1681 if (PageDirty(page) || PageWriteback(page))
1682 goto keep_locked;
1683 mapping = page_mapping(page);
1684 fallthrough;
1685 case PAGE_CLEAN:
1686 ;
1687 }
1688 }
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711 if (page_has_private(page)) {
1712 if (!try_to_release_page(page, sc->gfp_mask))
1713 goto activate_locked;
1714 if (!mapping && page_count(page) == 1) {
1715 unlock_page(page);
1716 if (put_page_testzero(page))
1717 goto free_it;
1718 else {
1719
1720
1721
1722
1723
1724
1725
1726 nr_reclaimed++;
1727 continue;
1728 }
1729 }
1730 }
1731
1732 if (PageAnon(page) && !PageSwapBacked(page)) {
1733
1734 if (!page_ref_freeze(page, 1))
1735 goto keep_locked;
1736
1737
1738
1739
1740
1741
1742
1743
1744 count_vm_event(PGLAZYFREED);
1745 count_memcg_page_event(page, PGLAZYFREED);
1746 } else if (!mapping || !__remove_mapping(mapping, page, true,
1747 sc->target_mem_cgroup))
1748 goto keep_locked;
1749
1750 unlock_page(page);
1751free_it:
1752
1753
1754
1755
1756 nr_reclaimed += nr_pages;
1757
1758
1759
1760
1761
1762 if (unlikely(PageTransHuge(page)))
1763 destroy_compound_page(page);
1764 else
1765 list_add(&page->lru, &free_pages);
1766 continue;
1767
1768activate_locked_split:
1769
1770
1771
1772
1773 if (nr_pages > 1) {
1774 sc->nr_scanned -= (nr_pages - 1);
1775 nr_pages = 1;
1776 }
1777activate_locked:
1778
1779 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1780 PageMlocked(page)))
1781 try_to_free_swap(page);
1782 VM_BUG_ON_PAGE(PageActive(page), page);
1783 if (!PageMlocked(page)) {
1784 int type = page_is_file_lru(page);
1785 SetPageActive(page);
1786 stat->nr_activate[type] += nr_pages;
1787 count_memcg_page_event(page, PGACTIVATE);
1788 }
1789keep_locked:
1790 unlock_page(page);
1791keep:
1792 list_add(&page->lru, &ret_pages);
1793 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1794 }
1795
1796
1797
1798 nr_reclaimed += demote_page_list(&demote_pages, pgdat);
1799
1800 if (!list_empty(&demote_pages)) {
1801
1802 list_splice_init(&demote_pages, page_list);
1803 do_demote_pass = false;
1804 goto retry;
1805 }
1806
1807 pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1808
1809 mem_cgroup_uncharge_list(&free_pages);
1810 try_to_unmap_flush();
1811 free_unref_page_list(&free_pages);
1812
1813 list_splice(&ret_pages, page_list);
1814 count_vm_events(PGACTIVATE, pgactivate);
1815
1816 return nr_reclaimed;
1817}
1818
/*
 * reclaim_clean_pages_from_list - reclaim clean file pages from a list
 * @zone: zone the pages were isolated from (supplies the node)
 * @page_list: list of isolated pages; survivors are put back on it
 *
 * Attempt to immediately reclaim the clean, unmapped-friendly file pages
 * on @page_list.  Only non-huge, non-dirty, non-movable, evictable file
 * pages are tried; everything else is left untouched on @page_list.
 *
 * Returns the number of pages reclaimed.
 */
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
					    struct list_head *page_list)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_unmap = 1,
	};
	struct reclaim_stat stat;
	unsigned int nr_reclaimed;
	struct page *page, *next;
	LIST_HEAD(clean_pages);
	unsigned int noreclaim_flag;

	/* Pick out the clean file pages that can be reclaimed cheaply. */
	list_for_each_entry_safe(page, next, page_list, lru) {
		if (!PageHuge(page) && page_is_file_lru(page) &&
		    !PageDirty(page) && !__PageMovable(page) &&
		    !PageUnevictable(page)) {
			ClearPageActive(page);
			list_move(&page->lru, &clean_pages);
		}
	}

	/*
	 * We should be safe here since we are only dealing with file pages
	 * and we are not kswapd; disable direct reclaim recursion while
	 * calling into shrink_page_list().
	 */
	noreclaim_flag = memalloc_noreclaim_save();
	nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
					&stat, true);
	memalloc_noreclaim_restore(noreclaim_flag);

	/* Return the survivors to the caller's list. */
	list_splice(&clean_pages, page_list);
	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
			    -(long)nr_reclaimed);
	/*
	 * Since lazyfree pages are isolated from file LRU from the beginning,
	 * they will rotate back to anonymous LRU in the end if it failed to
	 * discard so isolated count will be mismatched.
	 * Compensate the isolated count for both LRU lists.
	 */
	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
			    stat.nr_lazyfree_fail);
	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
			    -(long)stat.nr_lazyfree_fail);
	return nr_reclaimed;
}
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
1879{
1880
1881 if (!PageLRU(page))
1882 return false;
1883
1884
1885 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1886 return false;
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896 if (mode & ISOLATE_ASYNC_MIGRATE) {
1897
1898 if (PageWriteback(page))
1899 return false;
1900
1901 if (PageDirty(page)) {
1902 struct address_space *mapping;
1903 bool migrate_dirty;
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914 if (!trylock_page(page))
1915 return false;
1916
1917 mapping = page_mapping(page);
1918 migrate_dirty = !mapping || mapping->a_ops->migratepage;
1919 unlock_page(page);
1920 if (!migrate_dirty)
1921 return false;
1922 }
1923 }
1924
1925 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1926 return false;
1927
1928 return true;
1929}
1930
1931
1932
1933
1934
1935static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1936 enum lru_list lru, unsigned long *nr_zone_taken)
1937{
1938 int zid;
1939
1940 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1941 if (!nr_zone_taken[zid])
1942 continue;
1943
1944 update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1945 }
1946
1947}
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
/*
 * isolate_lru_pages - isolate pages from an LRU list onto a private list
 * @nr_to_scan: maximum number of (base) pages to take
 * @lruvec: the lruvec to take pages from
 * @dst: destination list for isolated pages
 * @nr_scanned: out: number of base pages examined (including skipped)
 * @sc: scan control (supplies reclaim_idx and may_unmap)
 * @lru: which LRU list of @lruvec to scan
 *
 * Caller must hold lruvec->lru_lock.  Pages from zones above
 * sc->reclaim_idx are set aside and spliced back to the head of the
 * source list afterwards.  Returns the number of base pages taken.
 */
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
		struct lruvec *lruvec, struct list_head *dst,
		unsigned long *nr_scanned, struct scan_control *sc,
		enum lru_list lru)
{
	struct list_head *src = &lruvec->lists[lru];
	unsigned long nr_taken = 0;
	unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
	unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
	unsigned long skipped = 0;
	unsigned long scan, total_scan, nr_pages;
	LIST_HEAD(pages_skipped);
	isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);

	total_scan = 0;
	scan = 0;
	while (scan < nr_to_scan && !list_empty(src)) {
		struct page *page;

		page = lru_to_page(src);
		prefetchw_prev_lru_page(page, src, flags);

		/* A compound page counts as all of its base pages. */
		nr_pages = compound_nr(page);
		total_scan += nr_pages;

		/* Page belongs to a zone we are not reclaiming from. */
		if (page_zonenum(page) > sc->reclaim_idx) {
			list_move(&page->lru, &pages_skipped);
			nr_skipped[page_zonenum(page)] += nr_pages;
			continue;
		}

		/*
		 * Do not count skipped pages because that makes the function
		 * return with no isolated pages if the LRU mostly contains
		 * ineligible pages.  Only eligible pages are charged against
		 * @nr_to_scan.
		 */
		scan += nr_pages;
		if (!__isolate_lru_page_prepare(page, mode)) {
			/* It is being freed elsewhere */
			list_move(&page->lru, src);
			continue;
		}
		/*
		 * Be careful not to clear PageLRU until after we're
		 * sure the page is not being freed elsewhere -- the
		 * page release code relies on it.
		 */
		if (unlikely(!get_page_unless_zero(page))) {
			list_move(&page->lru, src);
			continue;
		}

		if (!TestClearPageLRU(page)) {
			/* Another thread is already isolating this page. */
			put_page(page);
			list_move(&page->lru, src);
			continue;
		}

		nr_taken += nr_pages;
		nr_zone_taken[page_zonenum(page)] += nr_pages;
		list_move(&page->lru, dst);
	}

	/*
	 * Splice any skipped pages to the start of the LRU list. Note that
	 * this disrupts the LRU order when reclaiming for lower zones but
	 * we cannot splice to the tail.  If we did then the SWAP_CLUSTER_MAX
	 * scanning would soon rescan the same pages to skip and waste lots
	 * of cpu cycles.
	 */
	if (!list_empty(&pages_skipped)) {
		int zid;

		list_splice(&pages_skipped, src);
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			if (!nr_skipped[zid])
				continue;

			__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
			skipped += nr_skipped[zid];
		}
	}
	*nr_scanned = total_scan;
	trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
				    total_scan, skipped, nr_taken, mode, lru);
	update_lru_sizes(lruvec, lru, nr_zone_taken);
	return nr_taken;
}
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
/*
 * isolate_lru_page - try to isolate a single page from its LRU list
 * @page: page to isolate (caller must already hold a reference)
 *
 * Returns 0 on success (the page now carries an extra reference owned by
 * the caller and is off its LRU list), or -EBUSY if the page was not on
 * an LRU list (or lost the race to another isolator).
 */
int isolate_lru_page(struct page *page)
{
	int ret = -EBUSY;

	VM_BUG_ON_PAGE(!page_count(page), page);
	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");

	if (TestClearPageLRU(page)) {
		struct lruvec *lruvec;

		/* Winning TestClearPageLRU gives us the LRU's reference. */
		get_page(page);
		lruvec = lock_page_lruvec_irq(page);
		del_page_from_lru_list(page, lruvec);
		unlock_page_lruvec_irq(lruvec);
		ret = 0;
	}

	return ret;
}
2110
2111
2112
2113
2114
2115
2116
2117
2118static int too_many_isolated(struct pglist_data *pgdat, int file,
2119 struct scan_control *sc)
2120{
2121 unsigned long inactive, isolated;
2122
2123 if (current_is_kswapd())
2124 return 0;
2125
2126 if (!writeback_throttling_sane(sc))
2127 return 0;
2128
2129 if (file) {
2130 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
2131 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
2132 } else {
2133 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
2134 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
2135 }
2136
2137
2138
2139
2140
2141
2142 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
2143 inactive >>= 3;
2144
2145 return isolated > inactive;
2146}
2147
2148
2149
2150
2151
2152
2153
/*
 * move_pages_to_lru - put isolated pages back onto their LRU lists
 * @lruvec: lruvec whose lru_lock the caller holds
 * @list: list of isolated pages; on return it holds pages to be freed
 *
 * Caller must hold lruvec->lru_lock, which is dropped and re-taken around
 * operations that cannot run under it (putback of unevictable pages,
 * destroying compound pages).  Returns the number of base pages actually
 * moved onto LRU lists.
 */
static unsigned int move_pages_to_lru(struct lruvec *lruvec,
				      struct list_head *list)
{
	int nr_pages, nr_moved = 0;
	LIST_HEAD(pages_to_free);
	struct page *page;

	while (!list_empty(list)) {
		page = lru_to_page(list);
		VM_BUG_ON_PAGE(PageLRU(page), page);
		list_del(&page->lru);
		if (unlikely(!page_evictable(page))) {
			/* putback_lru_page() may sleep/relock: drop the lock. */
			spin_unlock_irq(&lruvec->lru_lock);
			putback_lru_page(page);
			spin_lock_irq(&lruvec->lru_lock);
			continue;
		}

		/*
		 * The SetPageLRU needs to be kept here for list integrity.
		 * Otherwise:
		 *   #0 move_pages_to_lru             #1 release_pages
		 *   if !put_page_testzero
		 *				      if (put_page_testzero())
		 *				        !PageLRU //skip lru_lock
		 *     SetPageLRU()
		 *     list_add(&page->lru,)
		 *                                        list_add(&page->lru,)
		 */
		SetPageLRU(page);

		if (unlikely(put_page_testzero(page))) {
			/* We held the last reference: free the page. */
			__clear_page_lru_flags(page);

			if (unlikely(PageCompound(page))) {
				/* Compound teardown cannot run under lru_lock. */
				spin_unlock_irq(&lruvec->lru_lock);
				destroy_compound_page(page);
				spin_lock_irq(&lruvec->lru_lock);
			} else
				list_add(&page->lru, &pages_to_free);

			continue;
		}

		/*
		 * All pages were isolated from the same lruvec (and isolation
		 * inhibits memcg migration).
		 */
		VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
		add_page_to_lru_list(page, lruvec);
		nr_pages = thp_nr_pages(page);
		nr_moved += nr_pages;
		if (PageActive(page))
			workingset_age_nonresident(lruvec, nr_pages);
	}

	/*
	 * To save our caller's stack, now use input list for pages to free.
	 */
	list_splice(&pages_to_free, list);

	return nr_moved;
}
2217
2218
2219
2220
2221
2222
2223
2224static int current_may_throttle(void)
2225{
2226 return !(current->flags & PF_LOCAL_THROTTLE) ||
2227 current->backing_dev_info == NULL ||
2228 bdi_write_congested(current->backing_dev_info);
2229}
2230
2231
2232
2233
2234
/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the
 * number of (base) pages reclaimed from the given inactive LRU list.
 * Direct reclaimers may stall in too_many_isolated() before isolating.
 */
static unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
		     struct scan_control *sc, enum lru_list lru)
{
	LIST_HEAD(page_list);
	unsigned long nr_scanned;
	unsigned int nr_reclaimed = 0;
	unsigned long nr_taken;
	struct reclaim_stat stat;
	bool file = is_file_lru(lru);
	enum vm_event_item item;
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	bool stalled = false;

	while (unlikely(too_many_isolated(pgdat, file, sc))) {
		if (stalled)
			return 0;

		/* wait a bit for the reclaimer. */
		msleep(100);
		stalled = true;

		/* We are about to die and free our memory. Return now. */
		if (fatal_signal_pending(current))
			return SWAP_CLUSTER_MAX;
	}

	lru_add_drain();

	spin_lock_irq(&lruvec->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
				     &nr_scanned, sc, lru);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
	if (!cgroup_reclaim(sc))
		__count_vm_events(item, nr_scanned);
	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
	__count_vm_events(PGSCAN_ANON + file, nr_scanned);

	spin_unlock_irq(&lruvec->lru_lock);

	if (nr_taken == 0)
		return 0;

	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);

	spin_lock_irq(&lruvec->lru_lock);
	move_pages_to_lru(lruvec, &page_list);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
	item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
	if (!cgroup_reclaim(sc))
		__count_vm_events(item, nr_reclaimed);
	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
	__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
	spin_unlock_irq(&lruvec->lru_lock);

	lru_note_cost(lruvec, file, stat.nr_pageout);
	mem_cgroup_uncharge_list(&page_list);
	free_unref_page_list(&page_list);

	/*
	 * If dirty pages are scanned that are not queued for IO, it
	 * implies that flushers are not doing their job. This can
	 * happen when memory pressure pushes dirty pages to the end of
	 * the LRU before the dirty limits are breached and the dirty
	 * data has expired. It can also happen when the proportion of
	 * dirty pages grows not through writes but through memory
	 * pressure reclaiming all the clean cache. And in some cases,
	 * the flushers simply cannot keep up with the allocation
	 * rate. Nudge the flusher threads in case they are asleep.
	 */
	if (stat.nr_unqueued_dirty == nr_taken)
		wakeup_flusher_threads(WB_REASON_VMSCAN);

	sc->nr.dirty += stat.nr_dirty;
	sc->nr.congested += stat.nr_congested;
	sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
	sc->nr.writeback += stat.nr_writeback;
	sc->nr.immediate += stat.nr_immediate;
	sc->nr.taken += nr_taken;
	if (file)
		sc->nr.file_taken += nr_taken;

	trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
			nr_scanned, nr_reclaimed, &stat, sc->priority, file);
	return nr_reclaimed;
}
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
/*
 * shrink_active_list - deactivate (or rotate) pages on an active LRU list
 *
 * Isolates up to @nr_to_scan pages from the active @lru of @lruvec,
 * rotates recently-referenced executable file pages back to the active
 * list, and moves the rest to the inactive list.  Pages that drop to a
 * zero refcount along the way are freed.
 */
static void shrink_active_list(unsigned long nr_to_scan,
			       struct lruvec *lruvec,
			       struct scan_control *sc,
			       enum lru_list lru)
{
	unsigned long nr_taken;
	unsigned long nr_scanned;
	unsigned long vm_flags;
	LIST_HEAD(l_hold);	/* The pages which were snipped off */
	LIST_HEAD(l_active);
	LIST_HEAD(l_inactive);
	struct page *page;
	unsigned nr_deactivate, nr_activate;
	unsigned nr_rotated = 0;
	int file = is_file_lru(lru);
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	lru_add_drain();

	spin_lock_irq(&lruvec->lru_lock);

	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
				     &nr_scanned, sc, lru);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);

	if (!cgroup_reclaim(sc))
		__count_vm_events(PGREFILL, nr_scanned);
	__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);

	spin_unlock_irq(&lruvec->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			putback_lru_page(page);
			continue;
		}

		/* Release buffers when buffer heads are over their limit. */
		if (unlikely(buffer_heads_over_limit)) {
			if (page_has_private(page) && trylock_page(page)) {
				if (page_has_private(page))
					try_to_release_page(page, 0);
				unlock_page(page);
			}
		}

		if (page_referenced(page, 0, sc->target_mem_cgroup,
				    &vm_flags)) {
			/*
			 * Identify referenced, file-backed active pages and
			 * give them one more trip around the active list. So
			 * that executable code get better chances to stay in
			 * memory under moderate memory pressure.  Anon pages
			 * are not likely to be evicted by use-once streaming
			 * IO, plus JVM can create lots of anon VM_EXEC pages,
			 * so we ignore them here.
			 */
			if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
				nr_rotated += thp_nr_pages(page);
				list_add(&page->lru, &l_active);
				continue;
			}
		}

		ClearPageActive(page);	/* we are de-activating */
		SetPageWorkingset(page);
		list_add(&page->lru, &l_inactive);
	}

	/*
	 * Move pages back to the lru list.
	 */
	spin_lock_irq(&lruvec->lru_lock);

	nr_activate = move_pages_to_lru(lruvec, &l_active);
	nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
	/* Keep all free pages in l_active list */
	list_splice(&l_inactive, &l_active);

	__count_vm_events(PGDEACTIVATE, nr_deactivate);
	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);

	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
	spin_unlock_irq(&lruvec->lru_lock);

	mem_cgroup_uncharge_list(&l_active);
	free_unref_page_list(&l_active);
	trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
			nr_deactivate, nr_rotated, sc->priority, file);
}
2437
2438unsigned long reclaim_pages(struct list_head *page_list)
2439{
2440 int nid = NUMA_NO_NODE;
2441 unsigned int nr_reclaimed = 0;
2442 LIST_HEAD(node_page_list);
2443 struct reclaim_stat dummy_stat;
2444 struct page *page;
2445 unsigned int noreclaim_flag;
2446 struct scan_control sc = {
2447 .gfp_mask = GFP_KERNEL,
2448 .may_writepage = 1,
2449 .may_unmap = 1,
2450 .may_swap = 1,
2451 .no_demotion = 1,
2452 };
2453
2454 noreclaim_flag = memalloc_noreclaim_save();
2455
2456 while (!list_empty(page_list)) {
2457 page = lru_to_page(page_list);
2458 if (nid == NUMA_NO_NODE) {
2459 nid = page_to_nid(page);
2460 INIT_LIST_HEAD(&node_page_list);
2461 }
2462
2463 if (nid == page_to_nid(page)) {
2464 ClearPageActive(page);
2465 list_move(&page->lru, &node_page_list);
2466 continue;
2467 }
2468
2469 nr_reclaimed += shrink_page_list(&node_page_list,
2470 NODE_DATA(nid),
2471 &sc, &dummy_stat, false);
2472 while (!list_empty(&node_page_list)) {
2473 page = lru_to_page(&node_page_list);
2474 list_del(&page->lru);
2475 putback_lru_page(page);
2476 }
2477
2478 nid = NUMA_NO_NODE;
2479 }
2480
2481 if (!list_empty(&node_page_list)) {
2482 nr_reclaimed += shrink_page_list(&node_page_list,
2483 NODE_DATA(nid),
2484 &sc, &dummy_stat, false);
2485 while (!list_empty(&node_page_list)) {
2486 page = lru_to_page(&node_page_list);
2487 list_del(&page->lru);
2488 putback_lru_page(page);
2489 }
2490 }
2491
2492 memalloc_noreclaim_restore(noreclaim_flag);
2493
2494 return nr_reclaimed;
2495}
2496
2497static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2498 struct lruvec *lruvec, struct scan_control *sc)
2499{
2500 if (is_active_lru(lru)) {
2501 if (sc->may_deactivate & (1 << is_file_lru(lru)))
2502 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2503 else
2504 sc->skipped_deactivate = 1;
2505 return 0;
2506 }
2507
2508 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2509}
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2540{
2541 enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2542 unsigned long inactive, active;
2543 unsigned long inactive_ratio;
2544 unsigned long gb;
2545
2546 inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2547 active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2548
2549 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2550 if (gb)
2551 inactive_ratio = int_sqrt(10 * gb);
2552 else
2553 inactive_ratio = 1;
2554
2555 return inactive * inactive_ratio < active;
2556}
2557
/*
 * How get_scan_count() balances scanning between the anon and file LRUs.
 */
enum scan_balance {
	SCAN_EQUAL,	/* scan both lists equally, ignoring cost */
	SCAN_FRACT,	/* scan proportionally to the recorded costs */
	SCAN_ANON,	/* scan only the anonymous LRUs */
	SCAN_FILE,	/* scan only the file LRUs */
};
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
			   unsigned long *nr)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	unsigned long anon_cost, file_cost, total_cost;
	int swappiness = mem_cgroup_swappiness(memcg);
	u64 fraction[ANON_AND_FILE];
	u64 denominator = 0;	/* gcc */
	enum scan_balance scan_balance;
	unsigned long ap, fp;
	enum lru_list lru;

	/* If we have no swap space, do not bother scanning anon pages. */
	if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Global reclaim will swap to prevent OOM even with no
	 * swappiness, but memcg users want to use this knob to
	 * disable swapping for individual groups completely when
	 * using the memory controller's swap limit feature would be
	 * too expensive.
	 */
	if (cgroup_reclaim(sc) && !swappiness) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && swappiness) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	/*
	 * If the system is almost out of file pages, force-scan anon.
	 */
	if (sc->file_is_tiny) {
		scan_balance = SCAN_ANON;
		goto out;
	}

	/*
	 * If there is enough inactive page cache, we do not reclaim
	 * anything from the anonymous working right now.
	 */
	if (sc->cache_trim_mode) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;
	/*
	 * Calculate the pressure balance between anon and file pages.
	 *
	 * The amount of pressure we put on each LRU is inversely
	 * proportional to the cost of reclaiming each list, as
	 * determined by the share of pages that are refaulting, times
	 * the relative IO cost of bringing back a swapped out
	 * anonymous page vs reloading a filesystem page (swappiness).
	 *
	 * Although we limit that influence to ensure no list gets
	 * left behind completely: at least a third of the pressure is
	 * applied, before swappiness.
	 *
	 * With swappiness at 100, anon and file have equal IO cost.
	 */
	total_cost = sc->anon_cost + sc->file_cost;
	anon_cost = total_cost + sc->anon_cost;
	file_cost = total_cost + sc->file_cost;
	total_cost = anon_cost + file_cost;

	ap = swappiness * (total_cost + 1);
	ap /= anon_cost + 1;

	fp = (200 - swappiness) * (total_cost + 1);
	fp /= file_cost + 1;

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp;
out:
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long lruvec_size;
		unsigned long low, min;
		unsigned long scan;

		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
		mem_cgroup_protection(sc->target_mem_cgroup, memcg,
				      &min, &low);

		if (min || low) {
			/*
			 * Scale a cgroup's reclaim pressure by proportioning
			 * its current usage to its memory.low or memory.min
			 * setting.
			 *
			 * This is important, as otherwise scanning aggression
			 * becomes extremely binary -- from nothing as we
			 * approach the memory protection threshold, to totally
			 * nominal as we exceed it.  This results in requiring
			 * setting extremely liberal protection thresholds. It
			 * also means we simply get no protection at all if we
			 * set it too low, which is not ideal.
			 *
			 * If there is any protection in place, we reduce scan
			 * pressure by how much of the total memory used is
			 * within protection thresholds.
			 *
			 * There is one special case: in the first reclaim pass,
			 * we skip over all groups that are within their low
			 * protection. If that fails to reclaim enough pages to
			 * satisfy the reclaim goal, we come back and override
			 * the best-effort low protection. However, we still
			 * ideally want to honor how well-behaved groups are in
			 * that case instead of simply punishing them all
			 * equally. As such, we reclaim them based on how much
			 * memory they are using, reducing the scan pressure
			 * again by how much of the total memory used is under
			 * hard protection.
			 */
			unsigned long cgroup_size = mem_cgroup_size(memcg);
			unsigned long protection;

			/* memory.low scaling, make sure we retry before OOM */
			if (!sc->memcg_low_reclaim && low > min) {
				protection = low;
				sc->memcg_low_skipped = 1;
			} else {
				protection = min;
			}

			/* Avoid TOCTOU with earlier protection check */
			cgroup_size = max(cgroup_size, protection);

			scan = lruvec_size - lruvec_size * protection /
				(cgroup_size + 1);

			/*
			 * Minimally target SWAP_CLUSTER_MAX pages to keep
			 * reclaim moving forwards, avoiding decrementing
			 * sc->priority further than desirable.
			 */
			scan = max(scan, SWAP_CLUSTER_MAX);
		} else {
			scan = lruvec_size;
		}

		scan >>= sc->priority;

		/*
		 * If the cgroup's already been deleted, make sure to
		 * scrape out the remaining cache.
		 */
		if (!scan && !mem_cgroup_online(memcg))
			scan = min(lruvec_size, SWAP_CLUSTER_MAX);

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 * Make sure we don't miss the last page on
			 * the offlined memory cgroups because of a
			 * round-off error.
			 */
			scan = mem_cgroup_online(memcg) ?
			       div64_u64(scan * fraction[file], denominator) :
			       DIV64_U64_ROUND_UP(scan * fraction[file],
						  denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}

		nr[lru] = scan;
	}
}
2770
2771
2772
2773
2774
2775static bool can_age_anon_pages(struct pglist_data *pgdat,
2776 struct scan_control *sc)
2777{
2778
2779 if (total_swap_pages > 0)
2780 return true;
2781
2782
2783 return can_demote(pgdat->node_id, sc);
2784}
2785
/*
 * shrink_lruvec - reclaim pages from all evictable LRU lists of a lruvec
 *
 * Uses get_scan_count() to compute per-list scan targets, then scans in
 * SWAP_CLUSTER_MAX batches.  Once the reclaim goal is met, the remaining
 * targets are rescaled so both anon and file lists end at the same
 * scan percentage (proportional reclaim), except for direct reclaim at
 * DEF_PRIORITY, which stops as soon as the goal is met.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
	unsigned long targets[NR_LRU_LISTS];
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
	struct blk_plug plug;
	bool scan_adjusted;

	get_scan_count(lruvec, sc, nr);

	/* Record the original scan target for proportional adjustments later */
	memcpy(targets, nr, sizeof(nr));

	/*
	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
	 * event that can occur when there is little memory pressure e.g.
	 * multiple streaming readers/writers. Hence, we do not abort scanning
	 * when the requested number of pages are reclaimed when scanning at
	 * DEF_PRIORITY on the assumption that the fact we are direct
	 * reclaiming implies that kswapd is not keeping up and it is best to
	 * do a batch of work at once. For memcg reclaim one check is made to
	 * abort proportional reclaim if either the file or anon lru has
	 * already been exhausted. Hence, the proportional adjustment is
	 * pre-disabled in that case.
	 */
	scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
			 sc->priority == DEF_PRIORITY);

	blk_start_plug(&plug);
	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
					nr[LRU_INACTIVE_FILE]) {
		unsigned long nr_anon, nr_file, percentage;
		unsigned long nr_scanned;

		for_each_evictable_lru(lru) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;

				nr_reclaimed += shrink_list(lru, nr_to_scan,
							    lruvec, sc);
			}
		}

		cond_resched();

		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
			continue;

		/*
		 * For kswapd and memcg, reclaim at least the number of pages
		 * requested. Ensure that the anon and file LRUs are scanned
		 * proportionally what was requested by get_scan_count(). We
		 * stop reclaiming one LRU and reduce the amount scanning
		 * proportional to the original scan target.
		 */
		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];

		/*
		 * It's just vindictive to attack the larger once the smaller
		 * has gone to zero.  And given the way we stop scanning the
		 * smaller below, this makes sure that we only make one nudge
		 * towards proportionality once we've got nr_to_reclaim.
		 */
		if (!nr_file || !nr_anon)
			break;

		if (nr_file > nr_anon) {
			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
						targets[LRU_ACTIVE_ANON] + 1;
			lru = LRU_BASE;
			percentage = nr_anon * 100 / scan_target;
		} else {
			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
						targets[LRU_ACTIVE_FILE] + 1;
			lru = LRU_FILE;
			percentage = nr_file * 100 / scan_target;
		}

		/* Stop scanning the smaller of the LRU */
		nr[lru] = 0;
		nr[lru + LRU_ACTIVE] = 0;

		/*
		 * Recalculate the other LRU scan count based on its original
		 * scan target and the percentage scanning already complete
		 */
		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		lru += LRU_ACTIVE;
		nr_scanned = targets[lru] - nr[lru];
		nr[lru] = targets[lru] * (100 - percentage) / 100;
		nr[lru] -= min(nr[lru], nr_scanned);

		scan_adjusted = true;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;

	/*
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
	if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
	    inactive_is_low(lruvec, LRU_INACTIVE_ANON))
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);
}
2900
2901
2902static bool in_reclaim_compaction(struct scan_control *sc)
2903{
2904 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2905 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2906 sc->priority < DEF_PRIORITY - 2))
2907 return true;
2908
2909 return false;
2910}
2911
2912
2913
2914
2915
2916
2917
2918
/*
 * Reclaim/compaction is used for high-order allocation requests. It reclaims
 * order-0 pages before compacting the zone. should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when the page allocator
 * calls try_to_compact_pages() that it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
					unsigned long nr_reclaimed,
					struct scan_control *sc)
{
	unsigned long pages_for_compaction;
	unsigned long inactive_lru_pages;
	int z;

	/* If not in reclaim/compaction mode, stop */
	if (!in_reclaim_compaction(sc))
		return false;

	/*
	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
	 * number of pages that were scanned. This will return to the caller
	 * with the risk reclaim/compaction and the resulting allocation attempt
	 * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
	 * allocations through requiring that the full LRU list has been scanned
	 * first, by assuming that zero delta of sc->nr_scanned means full LRU
	 * scan, but that approximation was wrong, and there were corner cases
	 * where always a non-zero delta of scanned pages was returned.
	 */
	if (!nr_reclaimed)
		return false;

	/* If compaction would go ahead or the allocation would succeed, stop */
	for (z = 0; z <= sc->reclaim_idx; z++) {
		struct zone *zone = &pgdat->node_zones[z];
		if (!managed_zone(zone))
			continue;

		switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
		case COMPACT_SUCCESS:
		case COMPACT_CONTINUE:
			return false;
		default:
			/* check next zone */
			;
		}
	}

	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are large enough, continue reclaiming
	 */
	pages_for_compaction = compact_gap(sc->order);
	inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
	if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
		inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);

	return inactive_lru_pages > pages_for_compaction;
}
2971
/*
 * shrink_node_memcgs - walk the memcg hierarchy below the reclaim target
 * and shrink each cgroup's lruvec and slab caches on this node, honoring
 * memory.min (always skip) and memory.low (skip on the first pass).
 */
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
{
	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
	do {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
		unsigned long reclaimed;
		unsigned long scanned;

		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
		cond_resched();

		mem_cgroup_calculate_protection(target_memcg, memcg);

		if (mem_cgroup_below_min(memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
			continue;
		} else if (mem_cgroup_below_low(memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
			if (!sc->memcg_low_reclaim) {
				sc->memcg_low_skipped = 1;
				continue;
			}
			memcg_memory_event(memcg, MEMCG_LOW);
		}

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

		shrink_lruvec(lruvec, sc);

		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
			    sc->priority);

		/* Record the group's reclaim efficiency */
		vmpressure(sc->gfp_mask, memcg, false,
			   sc->nr_scanned - scanned,
			   sc->nr_reclaimed - reclaimed);

	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
3028
/*
 * shrink_node - reclaim pages from one NUMA node
 *
 * The top-level per-node reclaim loop: snapshots reclaim cost and refault
 * information to decide deactivation and balancing policy, shrinks all
 * memcg lruvecs on the node, records writeback/dirty/congestion state for
 * throttling, and repeats while should_continue_reclaim() says more work
 * would help a pending compaction.
 */
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long nr_reclaimed, nr_scanned;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
	/*
	 * Flush the memory cgroup stats, so that we read accurate per-memcg
	 * lruvec stats for heuristics.
	 */
	mem_cgroup_flush_stats();

	memset(&sc->nr, 0, sizeof(sc->nr));

	nr_reclaimed = sc->nr_reclaimed;
	nr_scanned = sc->nr_scanned;

	/*
	 * Determine the scan balance between anon and file LRUs.
	 */
	spin_lock_irq(&target_lruvec->lru_lock);
	sc->anon_cost = target_lruvec->anon_cost;
	sc->file_cost = target_lruvec->file_cost;
	spin_unlock_irq(&target_lruvec->lru_lock);

	/*
	 * Target desirable inactive:active list ratios for the anon
	 * and file LRU lists.
	 */
	if (!sc->force_deactivate) {
		unsigned long refaults;

		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_ANON);
		if (refaults != target_lruvec->refaults[0] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			sc->may_deactivate |= DEACTIVATE_ANON;
		else
			sc->may_deactivate &= ~DEACTIVATE_ANON;

		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_FILE);
		if (refaults != target_lruvec->refaults[1] ||
		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			sc->may_deactivate |= DEACTIVATE_FILE;
		else
			sc->may_deactivate &= ~DEACTIVATE_FILE;
	} else
		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

	/*
	 * If we have plenty of inactive file pages that aren't
	 * thrashing, try to reclaim those first before touching
	 * anonymous pages.
	 */
	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
		sc->cache_trim_mode = 1;
	else
		sc->cache_trim_mode = 0;

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
		unsigned long free, anon;
		int z;

		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			   node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];
			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
		anon = node_page_state(pgdat, NR_INACTIVE_ANON);

		sc->file_is_tiny =
			file + free <= total_high_wmark &&
			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			anon >> sc->priority;
	}

	shrink_node_memcgs(pgdat, sc);

	if (reclaim_state) {
		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
		reclaim_state->reclaimed_slab = 0;
	}

	/* Record the subtree's reclaim efficiency */
	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
		   sc->nr_scanned - nr_scanned,
		   sc->nr_reclaimed - nr_reclaimed);

	if (sc->nr_reclaimed - nr_reclaimed)
		reclaimable = true;

	if (current_is_kswapd()) {
		/*
		 * If reclaim is isolating dirty pages under writeback,
		 * it implies that the long-lived page allocation rate
		 * is exceeding the page laundering rate. Either the
		 * global limits are not being effective at throttling
		 * processes due to the page distribution throughout
		 * zones or there is heavy usage of a slow backing
		 * device. The only option is to throttle from reclaim
		 * context which is not ideal as there is no guarantee
		 * the dirtying process is throttled in the same way
		 * balance_dirty_pages() manages.
		 *
		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
		 * count the number of pages under pages flagged for
		 * immediate reclaim and stall if any are encountered
		 * in the nr_immediate check below.
		 */
		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
			set_bit(PGDAT_WRITEBACK, &pgdat->flags);

		/* Allow kswapd to start writing pages during reclaim.*/
		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
			set_bit(PGDAT_DIRTY, &pgdat->flags);

		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it
		 * implies that pages are cycling through the LRU
		 * faster than they are written so also forcibly stall.
		 */
		if (sc->nr.immediate)
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	/*
	 * Tag a node/memcg as congested if all the dirty pages
	 * scanned were backed by a congested BDI and
	 * wait_iff_congested will stall.
	 *
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling in wait_iff_congested().
	 */
	if ((current_is_kswapd() ||
	     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
		set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

	/*
	 * Stall direct reclaim for IO completions if underlying BDIs
	 * and node is congested. Allow kswapd to continue until it
	 * starts encountering unqueued dirty pages or cycling through
	 * the LRU too quickly.
	 */
	if (!current_is_kswapd() && current_may_throttle() &&
	    !sc->hibernation_mode &&
	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
		wait_iff_congested(BLK_RW_ASYNC, HZ/10);

	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
				    sc))
		goto again;

	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;
}
3226
3227
3228
3229
3230
3231
/*
 * Returns true if compaction should go ahead for a costly-order request, or
 * the allocation would already succeed without compaction. In either case
 * reclaim for this zone can be skipped.
 */
static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
	unsigned long watermark;
	enum compact_result suitable;

	suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
	if (suitable == COMPACT_SUCCESS)
		/* Allocation should succeed already. Don't reclaim. */
		return true;
	if (suitable == COMPACT_SKIPPED)
		/* Compaction cannot yet proceed. Do reclaim. */
		return false;

	/*
	 * Compaction is already possible, but it takes time to run and there
	 * are potentially other callers using the pages just freed. So reclaim
	 * enough of a buffer (compact_gap() over the high watermark) to give
	 * compaction a reasonable chance of completing and allocating the
	 * page. Note that the watermark is checked at order-0: the buffer
	 * need not be contiguous, compaction will create that.
	 */
	watermark = high_wmark_pages(zone) + compact_gap(sc->order);

	return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
3258
3259
3260
3261
3262
3263
3264
3265
3266
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	gfp_t orig_mask;
	pg_data_t *last_pgdat = NULL;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads.
	 * The original gfp_mask is restored on exit (see below).
	 */
	orig_mask = sc->gfp_mask;
	if (buffer_heads_over_limit) {
		sc->gfp_mask |= __GFP_HIGHMEM;
		sc->reclaim_idx = gfp_zone(sc->gfp_mask);
	}

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					sc->reclaim_idx, sc->nodemask) {
		/*
		 * Take care memory controller reclaiming has small influence
		 * to global LRU: cpuset limits, compaction readiness and
		 * soft limit reclaim apply only to global reclaim.
		 */
		if (!cgroup_reclaim(sc)) {
			if (!cpuset_zone_allowed(zone,
						 GFP_KERNEL | __GFP_HARDWALL))
				continue;

			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
			if (IS_ENABLED(CONFIG_COMPACTION) &&
			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
			    compaction_ready(zone, sc)) {
				sc->compaction_ready = true;
				continue;
			}

			/*
			 * Shrink each node in the zonelist once. If the
			 * zonelist is ordered by zone (not the default) then a
			 * node may be shrunk multiple times but in that case
			 * the user prefers lower zones being preserved.
			 */
			if (zone->zone_pgdat == last_pgdat)
				continue;

			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global memory pressure
			 * and balancing, not for a memcg's limit.
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
						sc->order, sc->gfp_mask,
						&nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
		}

		/* See comment about the same check for global reclaim above */
		if (zone->zone_pgdat == last_pgdat)
			continue;
		last_pgdat = zone->zone_pgdat;
		shrink_node(zone->zone_pgdat, sc);
	}

	/*
	 * Restore to original mask to avoid the impact on the caller if we
	 * promoted it to __GFP_HIGHMEM.
	 */
	sc->gfp_mask = orig_mask;
}
3351
3352static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
3353{
3354 struct lruvec *target_lruvec;
3355 unsigned long refaults;
3356
3357 target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
3358 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
3359 target_lruvec->refaults[0] = refaults;
3360 refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
3361 target_lruvec->refaults[1] = refaults;
3362}
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					  struct scan_control *sc)
{
	int initial_priority = sc->priority;
	pg_data_t *last_pgdat;
	struct zoneref *z;
	struct zone *zone;
retry:
	delayacct_freepages_start();

	/* Only count allocstall for global (non-cgroup) reclaim */
	if (!cgroup_reclaim(sc))
		__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);

	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;
		shrink_zones(zonelist, sc);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			break;

		/* Compaction can take over; stop reclaiming */
		if (sc->compaction_ready)
			break;

		/*
		 * If we're getting trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
		if (sc->priority < DEF_PRIORITY - 2)
			sc->may_writepage = 1;
	} while (--sc->priority >= 0);

	/* Snapshot refaults and clear congestion on every node we scanned */
	last_pgdat = NULL;
	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
					sc->nodemask) {
		if (zone->zone_pgdat == last_pgdat)
			continue;
		last_pgdat = zone->zone_pgdat;

		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);

		if (cgroup_reclaim(sc)) {
			struct lruvec *lruvec;

			lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
						   zone->zone_pgdat);
			clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
		}
	}

	delayacct_freepages_end();

	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/* Aborted reclaim to try compaction? don't OOM, then */
	if (sc->compaction_ready)
		return 1;

	/*
	 * We make inactive:active ratio decisions based on the node's
	 * composition of memory, but a restrictive reclaim_idx or a
	 * memory.low cgroup setting can exempt large amounts of
	 * memory from reclaim. Neither of which are very common, so
	 * instead of doing costly eligibility calculations of the
	 * entire cgroup subtree up front, we assume the estimates are
	 * good, and retry with forcible deactivation if that fails.
	 */
	if (sc->skipped_deactivate) {
		sc->priority = initial_priority;
		sc->force_deactivate = 1;
		sc->skipped_deactivate = 0;
		goto retry;
	}

	/* Untapped cgroup reserves?  Don't OOM, retry. */
	if (sc->memcg_low_skipped) {
		sc->priority = initial_priority;
		sc->force_deactivate = 0;
		sc->memcg_low_reclaim = 1;
		sc->memcg_low_skipped = 0;
		goto retry;
	}

	return 0;
}
3467
/*
 * Decide whether direct reclaimers may proceed on this node, or whether they
 * should be throttled on pfmemalloc_wait: returns true while the free pages
 * of ZONE_NORMAL and below exceed half of the summed min watermarks.
 * As a side effect, wakes kswapd (capped at ZONE_NORMAL) when throttling.
 */
static bool allow_direct_reclaim(pg_data_t *pgdat)
{
	struct zone *zone;
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;
	bool wmark_ok;

	/* kswapd has given up on this node; do not throttle anybody */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	/* Sum min watermarks and free pages over ZONE_NORMAL and below */
	for (i = 0; i <= ZONE_NORMAL; i++) {
		zone = &pgdat->node_zones[i];
		if (!managed_zone(zone))
			continue;

		/* Zones with nothing reclaimable do not factor in */
		if (!zone_reclaimable_pages(zone))
			continue;

		pfmemalloc_reserve += min_wmark_pages(zone);
		free_pages += zone_page_state(zone, NR_FREE_PAGES);
	}

	/* If there are no reserves (unexpected config) then do not throttle */
	if (!pfmemalloc_reserve)
		return true;

	wmark_ok = free_pages > pfmemalloc_reserve / 2;

	/* kswapd must be awake if processes are being throttled */
	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
		if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
			WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);

		wake_up_interruptible(&pgdat->kswapd_wait);
	}

	return wmark_ok;
}
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3518 nodemask_t *nodemask)
3519{
3520 struct zoneref *z;
3521 struct zone *zone;
3522 pg_data_t *pgdat = NULL;
3523
3524
3525
3526
3527
3528
3529
3530
3531 if (current->flags & PF_KTHREAD)
3532 goto out;
3533
3534
3535
3536
3537
3538 if (fatal_signal_pending(current))
3539 goto out;
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3556 gfp_zone(gfp_mask), nodemask) {
3557 if (zone_idx(zone) > ZONE_NORMAL)
3558 continue;
3559
3560
3561 pgdat = zone->zone_pgdat;
3562 if (allow_direct_reclaim(pgdat))
3563 goto out;
3564 break;
3565 }
3566
3567
3568 if (!pgdat)
3569 goto out;
3570
3571
3572 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582 if (!(gfp_mask & __GFP_FS))
3583 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3584 allow_direct_reclaim(pgdat), HZ);
3585 else
3586
3587 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3588 allow_direct_reclaim(pgdat));
3589
3590 if (fatal_signal_pending(current))
3591 return true;
3592
3593out:
3594 return false;
3595}
3596
/*
 * Main entry point for direct reclaim from the page allocator. Returns the
 * number of pages reclaimed, or 1 if the caller was throttled by a fatal
 * signal (so the allocator does not OOM kill at this point).
 */
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.gfp_mask = current_gfp_context(gfp_mask),
		.reclaim_idx = gfp_zone(gfp_mask),
		.order = order,
		.nodemask = nodemask,
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
	};

	/*
	 * scan_control uses s8 fields for order, priority, and reclaim_idx.
	 * Confirm they are large enough for max values.
	 */
	BUILD_BUG_ON(MAX_ORDER > S8_MAX);
	BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
	BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);

	/*
	 * Do not enter reclaim if fatal signal was delivered while throttled.
	 * 1 is returned so that the page allocator does not OOM kill at this
	 * point.
	 */
	if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
		return 1;

	set_task_reclaim_state(current, &sc.reclaim_state);
	trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
	set_task_reclaim_state(current, NULL);

	return nr_reclaimed;
}
3639
3640#ifdef CONFIG_MEMCG
3641
3642
/*
 * Shrink one lruvec on behalf of memcg soft limit reclaim. Returns the
 * number of pages reclaimed; *nr_scanned is filled with the number of
 * pages scanned. Only used by soft limit reclaim.
 */
unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
				     gfp_t gfp_mask, bool noswap,
				     pg_data_t *pgdat,
				     unsigned long *nr_scanned)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	struct scan_control sc = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,
		.target_mem_cgroup = memcg,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.may_swap = !noswap,
	};

	/* Caller must have set up a reclaim_state already */
	WARN_ON_ONCE(!current->reclaim_state);

	/* Keep only the reclaim-relevant bits of the caller's gfp_mask */
	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);

	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
						      sc.gfp_mask);
	/*
	 * NOTE: sc.priority is left at 0 deliberately: a non-zero priority
	 * would limit the pages we can scan, and if we don't reclaim here,
	 * shrink_node from balance_pgdat will pick up pages from other
	 * memcgs as well.
	 */
	shrink_lruvec(lruvec, &sc);

	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);

	*nr_scanned = sc.nr_scanned;

	return sc.nr_reclaimed;
}
3681
/*
 * Reclaim up to nr_pages from the hierarchy rooted at @memcg on behalf of
 * the memory controller. Returns the number of pages reclaimed.
 */
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
					   unsigned long nr_pages,
					   gfp_t gfp_mask,
					   bool may_swap)
{
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;
	struct scan_control sc = {
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
		.reclaim_idx = MAX_NR_ZONES - 1,
		.target_mem_cgroup = memcg,
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = may_swap,
	};
	/*
	 * Traverse the zonelist of the current node to put equal pressure on
	 * all the nodes; memcg reclaim is not bound to a single node.
	 */
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);

	set_task_reclaim_state(current, &sc.reclaim_state);
	trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
	/* Avoid recursing into reclaim from within reclaim */
	noreclaim_flag = memalloc_noreclaim_save();

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	memalloc_noreclaim_restore(noreclaim_flag);
	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
	set_task_reclaim_state(current, NULL);

	return nr_reclaimed;
}
3719#endif
3720
/*
 * Age the active anon list across all memcgs on this node so that anon
 * pages get a chance to be referenced before they become reclaim
 * candidates, but only while the node-wide inactive anon list is low.
 */
static void age_active_anon(struct pglist_data *pgdat,
				struct scan_control *sc)
{
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (!can_age_anon_pages(pgdat, sc))
		return;

	/* Check the node-wide ratio first to avoid the memcg walk */
	lruvec = mem_cgroup_lruvec(NULL, pgdat);
	if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
		return;

	/* Deactivate a batch from each memcg's active anon list */
	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);
		memcg = mem_cgroup_iter(NULL, memcg, NULL);
	} while (memcg);
}
3742
3743static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
3744{
3745 int i;
3746 struct zone *zone;
3747
3748
3749
3750
3751
3752
3753
3754
3755 for (i = highest_zoneidx; i >= 0; i--) {
3756 zone = pgdat->node_zones + i;
3757 if (!managed_zone(zone))
3758 continue;
3759
3760 if (zone->watermark_boost)
3761 return true;
3762 }
3763
3764 return false;
3765}
3766
3767
3768
3769
3770
/*
 * Returns true if there is an eligible zone balanced for the request order
 * and highest_zoneidx
 */
static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
{
	int i;
	unsigned long mark = -1;	/* stays (unsigned)-1 if no managed zone seen */
	struct zone *zone;

	/*
	 * Check watermarks bottom-up as lower zones are more likely to
	 * meet watermarks.
	 */
	for (i = 0; i <= highest_zoneidx; i++) {
		zone = pgdat->node_zones + i;

		if (!managed_zone(zone))
			continue;

		mark = high_wmark_pages(zone);
		if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
			return true;
	}

	/*
	 * If the node has no managed zone within highest_zoneidx, it does not
	 * need balancing by definition (mark was never assigned above).
	 */
	if (mark == -1)
		return true;

	return false;
}
3802
3803
3804static void clear_pgdat_congested(pg_data_t *pgdat)
3805{
3806 struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
3807
3808 clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3809 clear_bit(PGDAT_DIRTY, &pgdat->flags);
3810 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3811}
3812
3813
3814
3815
3816
3817
3818
/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
				 int highest_zoneidx)
{
	/*
	 * The throttled processes are normally woken up in balance_pgdat() as
	 * soon as allow_direct_reclaim() is true. But there is a potential
	 * race between when kswapd checks the watermarks and a process gets
	 * throttled. There is also a potential race if processes get
	 * throttled, kswapd wakes, a large process exits thereby balancing
	 * the zones, which causes kswapd to exit balance_pgdat() before
	 * reaching the wake up checks. If kswapd is going to sleep, no
	 * process should be sleeping on pfmemalloc_wait, so wake them now if
	 * necessary. If the wake up is premature, processes will wake kswapd
	 * and get throttled again.
	 */
	if (waitqueue_active(&pgdat->pfmemalloc_wait))
		wake_up_all(&pgdat->pfmemalloc_wait);

	/* Hopeless node, leave it to direct reclaim */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
		return true;

	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
		clear_pgdat_congested(pgdat);
		return true;
	}

	return false;
}
3849
3850
3851
3852
3853
3854
3855
3856
3857
/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim. This is used to determine if the scanning priority needs to be
 * raised.
 */
static bool kswapd_shrink_node(pg_data_t *pgdat,
			       struct scan_control *sc)
{
	struct zone *zone;
	int z;

	/* Reclaim a number of pages proportional to the number of zones */
	sc->nr_to_reclaim = 0;
	for (z = 0; z <= sc->reclaim_idx; z++) {
		zone = pgdat->node_zones + z;
		if (!managed_zone(zone))
			continue;

		sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
	}

	/*
	 * Historically care was taken to put equal pressure on all zones but
	 * now pressure is applied based on node LRU order.
	 */
	shrink_node(pgdat, sc);

	/*
	 * Fragmentation may mean that the system cannot be rebalanced for
	 * high-order allocations. If twice the allocation size has been
	 * reclaimed then recheck watermarks only at order-0 to prevent
	 * excessive reclaim. Assume that a process requesting a high-order
	 * allocation can direct reclaim/compact.
	 */
	if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
		sc->order = 0;

	return sc->nr_scanned >= sc->nr_to_reclaim;
}
3892
3893
3894static inline void
3895update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
3896{
3897 int i;
3898 struct zone *zone;
3899
3900 for (i = 0; i <= highest_zoneidx; i++) {
3901 zone = pgdat->node_zones + i;
3902
3903 if (!managed_zone(zone))
3904 continue;
3905
3906 if (active)
3907 set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
3908 else
3909 clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
3910 }
3911}
3912
/* Mark all managed zones up to highest_zoneidx as under active reclaim. */
static inline void
set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
{
	update_reclaim_active(pgdat, highest_zoneidx, true);
}
3918
/* Clear the active-reclaim marker on all managed zones up to highest_zoneidx. */
static inline void
clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
{
	update_reclaim_active(pgdat, highest_zoneidx, false);
}
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 */
static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
	int i;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	unsigned long pflags;
	unsigned long nr_boost_reclaim;
	unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
	bool boosted;
	struct zone *zone;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = order,
		.may_unmap = 1,
	};

	set_task_reclaim_state(current, &sc.reclaim_state);
	psi_memstall_enter(&pflags);
	__fs_reclaim_acquire(_THIS_IP_);

	count_vm_event(PAGEOUTRUN);

	/*
	 * Account for the reclaim boost. Note that the zone boost is left in
	 * place so that parallel allocations that are near the watermark will
	 * stall or direct reclaim until kswapd is finished.
	 */
	nr_boost_reclaim = 0;
	for (i = 0; i <= highest_zoneidx; i++) {
		zone = pgdat->node_zones + i;
		if (!managed_zone(zone))
			continue;

		nr_boost_reclaim += zone->watermark_boost;
		zone_boosts[i] = zone->watermark_boost;
	}
	boosted = nr_boost_reclaim;

restart:
	set_reclaim_active(pgdat, highest_zoneidx);
	sc.priority = DEF_PRIORITY;
	do {
		unsigned long nr_reclaimed = sc.nr_reclaimed;
		bool raise_priority = true;
		bool balanced;
		bool ret;

		sc.reclaim_idx = highest_zoneidx;

		/*
		 * If the number of buffer_heads exceeds the maximum allowed
		 * then consider reclaiming from all zones: on 32-bit systems
		 * highmem pages can pin lowmem memory and shrinking buffers
		 * can relieve lowmem pressure. Reclaim may still not go ahead
		 * if all eligible zones for the original allocation request
		 * are balanced, to avoid excessive reclaim from kswapd.
		 */
		if (buffer_heads_over_limit) {
			for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
				zone = pgdat->node_zones + i;
				if (!managed_zone(zone))
					continue;

				sc.reclaim_idx = i;
				break;
			}
		}

		/*
		 * If the pgdat is imbalanced then ignore boosting and preserve
		 * the watermarks for a later time and restart. Note that the
		 * zone watermarks will be still reset at the end of balancing
		 * on the grounds that the normal reclaim should be enough to
		 * re-evaluate if boosting is required when kswapd next wakes.
		 */
		balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
		if (!balanced && nr_boost_reclaim) {
			nr_boost_reclaim = 0;
			goto restart;
		}

		/*
		 * If boosting is not active then only reclaim if there are no
		 * eligible zones. Note that sc.reclaim_idx is not used as
		 * buffer_heads_over_limit may have adjusted it.
		 */
		if (!nr_boost_reclaim && balanced)
			goto out;

		/* Limit the priority of boosting to avoid reclaim writeback */
		if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
			raise_priority = false;

		/*
		 * Do not writeback or swap pages for boosted reclaim. The
		 * intent is to relieve pressure, not issue sub-optimal IO
		 * from reclaim context. If no pages are reclaimed, the
		 * reclaim will be aborted.
		 */
		sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
		sc.may_swap = !nr_boost_reclaim;

		/*
		 * Do some background aging of the anon list, to give
		 * pages a chance to be referenced before reclaiming.
		 */
		age_active_anon(pgdat, &sc);

		/*
		 * If we're getting trouble reclaiming, start doing writepage
		 * even in laptop mode.
		 */
		if (sc.priority < DEF_PRIORITY - 2)
			sc.may_writepage = 1;

		/* Call soft limit reclaim before calling shrink_node. */
		sc.nr_scanned = 0;
		nr_soft_scanned = 0;
		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
						sc.gfp_mask, &nr_soft_scanned);
		sc.nr_reclaimed += nr_soft_reclaimed;

		/*
		 * There should be no need to raise the scanning priority if
		 * enough pages are already being scanned that the high
		 * watermark would be met at 100% efficiency.
		 */
		if (kswapd_shrink_node(pgdat, &sc))
			raise_priority = false;

		/*
		 * If the low watermark is met there is no need for processes
		 * to be throttled on pfmemalloc_wait as they should not be
		 * able to safely make forward progress. Wake them.
		 */
		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
		    allow_direct_reclaim(pgdat))
			wake_up_all(&pgdat->pfmemalloc_wait);

		/* Check if kswapd should be suspending */
		__fs_reclaim_release(_THIS_IP_);
		ret = try_to_freeze();
		__fs_reclaim_acquire(_THIS_IP_);
		if (ret || kthread_should_stop())
			break;

		/*
		 * Raise priority if scanning rate is too low or there was no
		 * progress in reclaiming pages.
		 */
		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
		nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);

		/*
		 * If reclaim made no progress for a boost, stop reclaim as
		 * IO cannot be queued and it could be an infinite loop in
		 * extreme circumstances.
		 */
		if (nr_boost_reclaim && !nr_reclaimed)
			break;

		if (raise_priority || !nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1);

	if (!sc.nr_reclaimed)
		pgdat->kswapd_failures++;

out:
	clear_reclaim_active(pgdat, highest_zoneidx);

	/* If reclaim was boosted, account for the reclaim done in this pass */
	if (boosted) {
		unsigned long flags;

		for (i = 0; i <= highest_zoneidx; i++) {
			if (!zone_boosts[i])
				continue;

			/* Boost reductions are serialized by the zone lock */
			zone = pgdat->node_zones + i;
			spin_lock_irqsave(&zone->lock, flags);
			zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
			spin_unlock_irqrestore(&zone->lock, flags);
		}

		/*
		 * As there is now likely space, wakeup kcompactd to defragment
		 * pageblocks.
		 */
		wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
	}

	snapshot_refaults(NULL, pgdat);
	__fs_reclaim_release(_THIS_IP_);
	psi_memstall_leave(&pflags);
	set_task_reclaim_state(current, NULL);

	/*
	 * Return the order kswapd stopped reclaiming at as
	 * prepare_kswapd_sleep() takes it into account. If another caller
	 * entered the allocator slow path while kswapd was awake, order will
	 * remain at the higher level.
	 */
	return sc.order;
}
4148
4149
4150
4151
4152
4153
4154
4155
4156static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
4157 enum zone_type prev_highest_zoneidx)
4158{
4159 enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
4160
4161 return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
4162}
4163
/*
 * Put kswapd to sleep on kswapd_wait: first a short timed nap (after waking
 * kcompactd), then a full sleep if the node is still balanced afterwards.
 */
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
				unsigned int highest_zoneidx)
{
	long remaining = 0;
	DEFINE_WAIT(wait);

	if (freezing(current) || kthread_should_stop())
		return;

	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);

	/*
	 * Try to sleep for a short interval. Note that kcompactd will only be
	 * woken if it is possible to sleep for a short interval. This is
	 * deliberate on the assumption that if reclaim cannot keep an
	 * eligible zone balanced that it's also unlikely that compaction will
	 * succeed.
	 */
	if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		/*
		 * Compaction records what page blocks it recently failed to
		 * isolate pages from and skips them in the future scanning.
		 * When kswapd is going to sleep, it is reasonable to assume
		 * that pages and compaction may succeed so reset the cache.
		 */
		reset_isolation_suitable(pgdat);

		/*
		 * We have freed the memory, now we should compact it to make
		 * allocation of the requested order possible.
		 */
		wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);

		remaining = schedule_timeout(HZ/10);

		/*
		 * If woken prematurely then reset kswapd_highest_zoneidx and
		 * order. The values will either be from a wakeup request or
		 * the previous request that slept prematurely.
		 */
		if (remaining) {
			WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
					kswapd_highest_zoneidx(pgdat,
							highest_zoneidx));

			if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
				WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
		}

		finish_wait(&pgdat->kswapd_wait, &wait);
		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
	}

	/*
	 * After a short sleep, check if it was a premature sleep. If not, then
	 * go fully to sleep until explicitly woken up.
	 */
	if (!remaining &&
	    prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);

		/*
		 * vmstat counters are not perfectly accurate and the
		 * estimated value for counters such as NR_FREE_PAGES can
		 * deviate from the true value by nr_online_cpus * threshold.
		 * To avoid breaching watermarks while asleep, reduce the
		 * per-cpu vmstat threshold while kswapd sleeps and restore
		 * it when it wakes.
		 */
		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);

		if (!kthread_should_stop())
			schedule();

		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
	} else {
		/* Account which watermark was hit before sleep completed */
		if (remaining)
			count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
		else
			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
	}
	finish_wait(&pgdat->kswapd_wait, &wait);
}
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
/*
 * The background pageout daemon, started as a kernel thread from the init
 * process.
 *
 * This basically trickles out pages so that we have _some_ free memory
 * available even if there is no other activity that frees anything up.
 */
static int kswapd(void *p)
{
	unsigned int alloc_order, reclaim_order;
	unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
	pg_data_t *pgdat = (pg_data_t *)p;
	struct task_struct *tsk = current;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	/* Bind kswapd to the CPUs of its node where possible */
	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
	set_freezable();

	WRITE_ONCE(pgdat->kswapd_order, 0);
	WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
	for ( ; ; ) {
		bool ret;

		alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
							highest_zoneidx);

kswapd_try_sleep:
		kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
					highest_zoneidx);

		/* Read the new order and highest_zoneidx */
		alloc_order = READ_ONCE(pgdat->kswapd_order);
		highest_zoneidx = kswapd_highest_zoneidx(pgdat,
							highest_zoneidx);
		WRITE_ONCE(pgdat->kswapd_order, 0);
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

		ret = try_to_freeze();
		if (kthread_should_stop())
			break;

		/*
		 * We can speed up thawing tasks if we don't call balance_pgdat
		 * after returning from the refrigerator
		 */
		if (ret)
			continue;

		/*
		 * Reclaim begins at the requested order but if a high-order
		 * reclaim fails then kswapd falls back to reclaiming for
		 * order-0. If that happens, kswapd will consider sleeping
		 * for the order it finished reclaiming at (reclaim_order)
		 * but kcompactd is woken to compact for the original
		 * request (alloc_order).
		 */
		trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
						alloc_order);
		reclaim_order = balance_pgdat(pgdat, alloc_order,
						highest_zoneidx);
		if (reclaim_order < alloc_order)
			goto kswapd_try_sleep;
	}

	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);

	return 0;
}
4338
4339
4340
4341
4342
4343
4344
4345
/*
 * A zone is low on free memory or too fragmented for high-order memory. If
 * kswapd should reclaim, wake it up for the zone's pgdat; if kswapd reclaim
 * has failed or is not needed, wake up kcompactd instead when only
 * compaction is needed.
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
		   enum zone_type highest_zoneidx)
{
	pg_data_t *pgdat;
	enum zone_type curr_idx;

	if (!managed_zone(zone))
		return;

	if (!cpuset_zone_allowed(zone, gfp_flags))
		return;

	/* Record the most demanding pending request on the pgdat */
	pgdat = zone->zone_pgdat;
	curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

	if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
		WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

	if (READ_ONCE(pgdat->kswapd_order) < order)
		WRITE_ONCE(pgdat->kswapd_order, order);

	/* Nothing to wake if kswapd is not sleeping on its wait queue */
	if (!waitqueue_active(&pgdat->kswapd_wait))
		return;

	/* Hopeless node, or already balanced without a boost pending */
	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
		/*
		 * There may be plenty of free memory available, but it's too
		 * fragmented for high-order allocations.  Wake up kcompactd
		 * for atomic (non-direct-reclaim) requests and rely on it to
		 * determine if compaction is needed.
		 */
		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
			wakeup_kcompactd(pgdat, order, highest_zoneidx);
		return;
	}

	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
				      gfp_flags);
	wake_up_interruptible(&pgdat->kswapd_wait);
}
4390
4391#ifdef CONFIG_HIBERNATION
4392
4393
4394
4395
4396
4397
4398
4399
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number
 * of freed pages. Used by hibernation; hibernation_mode is set so reclaim
 * behaves accordingly.
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
	struct scan_control sc = {
		.nr_to_reclaim = nr_to_reclaim,
		.gfp_mask = GFP_HIGHUSER_MOVABLE,
		.reclaim_idx = MAX_NR_ZONES - 1,
		.priority = DEF_PRIORITY,
		.may_writepage = 1,
		.may_unmap = 1,
		.may_swap = 1,
		.hibernation_mode = 1,
	};
	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
	unsigned long nr_reclaimed;
	unsigned int noreclaim_flag;

	fs_reclaim_acquire(sc.gfp_mask);
	/* Prevent recursion into reclaim from allocations made below */
	noreclaim_flag = memalloc_noreclaim_save();
	set_task_reclaim_state(current, &sc.reclaim_state);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	set_task_reclaim_state(current, NULL);
	memalloc_noreclaim_restore(noreclaim_flag);
	fs_reclaim_release(sc.gfp_mask);

	return nr_reclaimed;
}
4428#endif
4429
4430
4431
4432
4433
/*
 * This kswapd start function will be called by init and node-hot-add.
 */
void kswapd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	/* Already running? Nothing to do. */
	if (pgdat->kswapd)
		return;

	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* Failure to start kswapd at boot time is fatal */
		BUG_ON(system_state < SYSTEM_RUNNING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		pgdat->kswapd = NULL;
	}
}
4449
4450
4451
4452
4453
4454void kswapd_stop(int nid)
4455{
4456 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4457
4458 if (kswapd) {
4459 kthread_stop(kswapd);
4460 NODE_DATA(nid)->kswapd = NULL;
4461 }
4462}
4463
static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	/* Start a kswapd thread for every node that has memory */
	for_each_node_state(nid, N_MEMORY)
		kswapd_run(nid);
	return 0;
}
4473
4474module_init(kswapd_init)
4475
4476#ifdef CONFIG_NUMA
4477
4478
4479
4480
4481
4482
4483int node_reclaim_mode __read_mostly;
4484
4485
4486
4487
4488
4489
4490#define NODE_RECLAIM_PRIORITY 4
4491
4492
4493
4494
4495
4496int sysctl_min_unmapped_ratio = 1;
4497
4498
4499
4500
4501
4502int sysctl_min_slab_ratio = 5;
4503
4504static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4505{
4506 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4507 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4508 node_page_state(pgdat, NR_ACTIVE_FILE);
4509
4510
4511
4512
4513
4514
4515 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4516}
4517
4518
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
	unsigned long nr_pagecache_reclaimable;
	unsigned long delta = 0;

	/*
	 * If RECLAIM_UNMAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, only the unmapped file
	 * pages counted by node_unmapped_file_pages() are.
	 */
	if (node_reclaim_mode & RECLAIM_UNMAP)
		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
	else
		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);

	/* If we can't clean pages, remove dirty pages from consideration */
	if (!(node_reclaim_mode & RECLAIM_WRITE))
		delta += node_page_state(pgdat, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}
4545
4546
4547
4548
4549static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4550{
4551
4552 const unsigned long nr_pages = 1 << order;
4553 struct task_struct *p = current;
4554 unsigned int noreclaim_flag;
4555 struct scan_control sc = {
4556 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4557 .gfp_mask = current_gfp_context(gfp_mask),
4558 .order = order,
4559 .priority = NODE_RECLAIM_PRIORITY,
4560 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4561 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4562 .may_swap = 1,
4563 .reclaim_idx = gfp_zone(gfp_mask),
4564 };
4565 unsigned long pflags;
4566
4567 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4568 sc.gfp_mask);
4569
4570 cond_resched();
4571 psi_memstall_enter(&pflags);
4572 fs_reclaim_acquire(sc.gfp_mask);
4573
4574
4575
4576
4577
4578 noreclaim_flag = memalloc_noreclaim_save();
4579 p->flags |= PF_SWAPWRITE;
4580 set_task_reclaim_state(p, &sc.reclaim_state);
4581
4582 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4583
4584
4585
4586
4587 do {
4588 shrink_node(pgdat, &sc);
4589 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4590 }
4591
4592 set_task_reclaim_state(p, NULL);
4593 current->flags &= ~PF_SWAPWRITE;
4594 memalloc_noreclaim_restore(noreclaim_flag);
4595 fs_reclaim_release(sc.gfp_mask);
4596 psi_memstall_leave(&pflags);
4597
4598 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4599
4600 return sc.nr_reclaimed >= nr_pages;
4601}
4602
int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
	int ret;

	/*
	 * Node reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the node is overallocated. So we do not reclaim
	 * if less than a specified percentage of the node is used by
	 * unmapped file backed pages.
	 */
	if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
	    node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
	    pgdat->min_slab_pages)
		return NODE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
		return NODE_RECLAIM_NOSCAN;

	/*
	 * Only run node reclaim on the local node or on nodes that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
		return NODE_RECLAIM_NOSCAN;

	/* Only one reclaim pass per node at a time */
	if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
		return NODE_RECLAIM_NOSCAN;

	ret = __node_reclaim(pgdat, gfp_mask, order);
	clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
4648#endif
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * appropriate zone lru list
 * @pvec: pagevec with lru pages to check
 *
 * Checks pages for evictability, if an evictable page is in the unevictable
 * lru list, moves it to the appropriate evictable lru list. This function
 * should be only used for lru pages.
 */
void check_move_unevictable_pages(struct pagevec *pvec)
{
	struct lruvec *lruvec = NULL;
	int pgscanned = 0;
	int pgrescued = 0;
	int i;

	for (i = 0; i < pvec->nr; i++) {
		struct page *page = pvec->pages[i];
		int nr_pages;

		/* Tail pages are accounted via their head page */
		if (PageTransTail(page))
			continue;

		nr_pages = thp_nr_pages(page);
		pgscanned += nr_pages;

		/* Skip pages not on an LRU; clearing the flag pins them */
		if (!TestClearPageLRU(page))
			continue;

		lruvec = relock_page_lruvec_irq(page, lruvec);
		if (page_evictable(page) && PageUnevictable(page)) {
			del_page_from_lru_list(page, lruvec);
			ClearPageUnevictable(page);
			add_page_to_lru_list(page, lruvec);
			pgrescued += nr_pages;
		}
		SetPageLRU(page);
	}

	if (lruvec) {
		/* Still holding the lruvec lock: use the __ accounting */
		__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
		__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
		unlock_page_lruvec_irq(lruvec);
	} else if (pgscanned) {
		count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
	}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4699