#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/* Allocation node mask; NULL means all nodes are allowed */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that is the primary target of this reclaim
	 * invocation; NULL for global reclaim.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low
	 * unless we threaten to OOM.  If a cgroup was skipped because of
	 * memory.low, memcg_low_skipped records that so the reclaim
	 * attempt can be retried with memcg_low_reclaim set.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	/* Reclaiming for hibernation via shrink_all_memory() */
	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* Per-invocation reclaim feedback, used for stalling decisions */
	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
};

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);		\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;

/*
 * The total number of pages which are beyond the high watermark within all
 * zones.
 */
unsigned long vm_total_pages;

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG_KMEM

/*
 * We allow subsystems to populate their shrinker-related LRU lists
 * before register_shrinker_prepared() is called for the shrinker,
 * since we don't want to impose restrictions on their internal
 * registration order.  In this case shrink_slab_memcg() may find the
 * corresponding bit set in the shrinker map anyway.
 *
 * This value is used by the function to detect registering shrinkers
 * and to skip do_shrink_slab() calls for them.
 */
#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)

static DEFINE_IDR(shrinker_idr);
static int shrinker_nr_max;

195static int prealloc_memcg_shrinker(struct shrinker *shrinker)
196{
197 int id, ret = -ENOMEM;
198
199 down_write(&shrinker_rwsem);
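	/*
	 * Reserve an id and publish a SHRINKER_REGISTERING placeholder in
	 * the idr, so shrink_slab_memcg() can tell that this shrinker is
	 * not fully registered yet and skip it.
	 */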
200
201 id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
202 if (id < 0)
203 goto unlock;
204
205 if (id >= shrinker_nr_max) {
206 if (memcg_expand_shrinker_maps(id)) {
207 idr_remove(&shrinker_idr, id);
208 goto unlock;
209 }
210
211 shrinker_nr_max = id + 1;
212 }
213 shrinker->id = id;
214 ret = 0;
215unlock:
216 up_write(&shrinker_rwsem);
217 return ret;
218}
219
220static void unregister_memcg_shrinker(struct shrinker *shrinker)
221{
222 int id = shrinker->id;
223
224 BUG_ON(id < 0);
225
226 down_write(&shrinker_rwsem);
227 idr_remove(&shrinker_idr, id);
228 up_write(&shrinker_rwsem);
229}
230#else
231static int prealloc_memcg_shrinker(struct shrinker *shrinker)
232{
233 return 0;
234}
235
236static void unregister_memcg_shrinker(struct shrinker *shrinker)
237{
238}
239#endif
240
241static void set_task_reclaim_state(struct task_struct *task,
242 struct reclaim_state *rs)
243{
244
245 WARN_ON_ONCE(rs && task->reclaim_state);
246
247
248 WARN_ON_ONCE(!rs && !task->reclaim_state);
249
250 task->reclaim_state = rs;
251}
252
253#ifdef CONFIG_MEMCG
254static bool global_reclaim(struct scan_control *sc)
255{
256 return !sc->target_mem_cgroup;
257}
258
/*
 * sane_reclaim - is the usual dirty throttling mechanism operational?
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
272static bool sane_reclaim(struct scan_control *sc)
273{
274 struct mem_cgroup *memcg = sc->target_mem_cgroup;
275
276 if (!memcg)
277 return true;
278#ifdef CONFIG_CGROUP_WRITEBACK
279 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
280 return true;
281#endif
282 return false;
283}
284
285static void set_memcg_congestion(pg_data_t *pgdat,
286 struct mem_cgroup *memcg,
287 bool congested)
288{
289 struct mem_cgroup_per_node *mn;
290
291 if (!memcg)
292 return;
293
294 mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
295 WRITE_ONCE(mn->congested, congested);
296}
297
298static bool memcg_congested(pg_data_t *pgdat,
299 struct mem_cgroup *memcg)
300{
301 struct mem_cgroup_per_node *mn;
302
303 mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
304 return READ_ONCE(mn->congested);
305
306}
307#else
308static bool global_reclaim(struct scan_control *sc)
309{
310 return true;
311}
312
313static bool sane_reclaim(struct scan_control *sc)
314{
315 return true;
316}
317
318static inline void set_memcg_congestion(struct pglist_data *pgdat,
319 struct mem_cgroup *memcg, bool congested)
320{
321}
322
323static inline bool memcg_congested(struct pglist_data *pgdat,
324 struct mem_cgroup *memcg)
325{
326 return false;
327
328}
329#endif
/*
 * zone_reclaimable_pages - estimate of reclaimable pages in a zone.
 * This misses isolated pages, which are not accounted for to save counters.
 * As the data only determines whether reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
336unsigned long zone_reclaimable_pages(struct zone *zone)
337{
338 unsigned long nr;
339
340 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
341 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
342 if (get_nr_swap_pages() > 0)
343 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
344 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
345
346 return nr;
347}
348
/**
 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 */
355unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
356{
357 unsigned long lru_size;
358 int zid;
359
360 if (!mem_cgroup_disabled())
361 lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
362 else
363 lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
364
365 for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
366 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
367 unsigned long size;
368
369 if (!managed_zone(zone))
370 continue;
371
372 if (!mem_cgroup_disabled())
373 size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
374 else
375 size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
376 NR_ZONE_LRU_BASE + lru);
377 lru_size -= min(size, lru_size);
378 }
379
380 return lru_size;
381
382}
383
/*
 * Add a shrinker callback to be called from the vm.
 */
387int prealloc_shrinker(struct shrinker *shrinker)
388{
389 unsigned int size = sizeof(*shrinker->nr_deferred);
390
391 if (shrinker->flags & SHRINKER_NUMA_AWARE)
392 size *= nr_node_ids;
393
394 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
395 if (!shrinker->nr_deferred)
396 return -ENOMEM;
397
398 if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
399 if (prealloc_memcg_shrinker(shrinker))
400 goto free_deferred;
401 }
402
403 return 0;
404
405free_deferred:
406 kfree(shrinker->nr_deferred);
407 shrinker->nr_deferred = NULL;
408 return -ENOMEM;
409}
410
411void free_prealloced_shrinker(struct shrinker *shrinker)
412{
413 if (!shrinker->nr_deferred)
414 return;
415
416 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
417 unregister_memcg_shrinker(shrinker);
418
419 kfree(shrinker->nr_deferred);
420 shrinker->nr_deferred = NULL;
421}
422
423void register_shrinker_prepared(struct shrinker *shrinker)
424{
425 down_write(&shrinker_rwsem);
426 list_add_tail(&shrinker->list, &shrinker_list);
427#ifdef CONFIG_MEMCG_KMEM
428 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
429 idr_replace(&shrinker_idr, shrinker, shrinker->id);
430#endif
431 up_write(&shrinker_rwsem);
432}
433
434int register_shrinker(struct shrinker *shrinker)
435{
436 int err = prealloc_shrinker(shrinker);
437
438 if (err)
439 return err;
440 register_shrinker_prepared(shrinker);
441 return 0;
442}
443EXPORT_SYMBOL(register_shrinker);
444
/*
 * Remove one
 */
448void unregister_shrinker(struct shrinker *shrinker)
449{
450 if (!shrinker->nr_deferred)
451 return;
452 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
453 unregister_memcg_shrinker(shrinker);
454 down_write(&shrinker_rwsem);
455 list_del(&shrinker->list);
456 up_write(&shrinker_rwsem);
457 kfree(shrinker->nr_deferred);
458 shrinker->nr_deferred = NULL;
459}
460EXPORT_SYMBOL(unregister_shrinker);
461
462#define SHRINK_BATCH 128
463
464static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
465 struct shrinker *shrinker, int priority)
466{
467 unsigned long freed = 0;
468 unsigned long long delta;
469 long total_scan;
470 long freeable;
471 long nr;
472 long new_nr;
473 int nid = shrinkctl->nid;
474 long batch_size = shrinker->batch ? shrinker->batch
475 : SHRINK_BATCH;
476 long scanned = 0, next_deferred;
477
478 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
479 nid = 0;
480
481 freeable = shrinker->count_objects(shrinker, shrinkctl);
482 if (freeable == 0 || freeable == SHRINK_EMPTY)
483 return freeable;
484
	/*
	 * Copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
490 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
491
492 total_scan = nr;
493 if (shrinker->seeks) {
494 delta = freeable >> priority;
495 delta *= 4;
496 do_div(delta, shrinker->seeks);
497 } else {
498
499
500
501
502
503 delta = freeable / 2;
504 }
505
506 total_scan += delta;
507 if (total_scan < 0) {
508 pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
509 shrinker->scan_objects, total_scan);
510 total_scan = freeable;
511 next_deferred = nr;
512 } else
513 next_deferred = total_scan;
514
515
516
517
518
519
520
521
522
523
524
525
526
527 if (delta < freeable / 4)
528 total_scan = min(total_scan, freeable / 2);
529
530
531
532
533
534
535 if (total_scan > freeable * 2)
536 total_scan = freeable * 2;
537
538 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
539 freeable, delta, total_scan, priority);
540
	/*
	 * Normally, we should not scan less than batch_size objects in one
	 * pass to avoid too frequent shrinker calls, but if the slab has less
	 * than batch_size objects in total and we are really tight on memory,
	 * we will try to reclaim all available objects, otherwise we can end
	 * up failing allocations even though there are plenty of reclaimable
	 * objects spread over several slabs with usage less than the
	 * batch_size.
	 *
	 * We detect the "tight on memory" situations by looking at the total
	 * number of objects we want to scan (total_scan).  If it is greater
	 * than the total number of objects on slab (freeable), we must be
	 * scanning at high prio and therefore should try to reclaim as much
	 * as possible.
	 */
556 while (total_scan >= batch_size ||
557 total_scan >= freeable) {
558 unsigned long ret;
559 unsigned long nr_to_scan = min(batch_size, total_scan);
560
561 shrinkctl->nr_to_scan = nr_to_scan;
562 shrinkctl->nr_scanned = nr_to_scan;
563 ret = shrinker->scan_objects(shrinker, shrinkctl);
564 if (ret == SHRINK_STOP)
565 break;
566 freed += ret;
567
568 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
569 total_scan -= shrinkctl->nr_scanned;
570 scanned += shrinkctl->nr_scanned;
571
572 cond_resched();
573 }
574
575 if (next_deferred >= scanned)
576 next_deferred -= scanned;
577 else
578 next_deferred = 0;
579
580
581
582
583
584 if (next_deferred > 0)
585 new_nr = atomic_long_add_return(next_deferred,
586 &shrinker->nr_deferred[nid]);
587 else
588 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
589
590 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
591 return freed;
592}
593
594#ifdef CONFIG_MEMCG_KMEM
595static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
596 struct mem_cgroup *memcg, int priority)
597{
598 struct memcg_shrinker_map *map;
599 unsigned long ret, freed = 0;
600 int i;
601
602 if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
603 return 0;
604
605 if (!down_read_trylock(&shrinker_rwsem))
606 return 0;
607
608 map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
609 true);
610 if (unlikely(!map))
611 goto unlock;
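	/*
	 * Only walk shrinkers whose bit is set in this memcg's shrinker
	 * map: a set bit means the shrinker may have objects charged to
	 * this cgroup on this node, so everything else can be skipped.
	 */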
612
613 for_each_set_bit(i, map->map, shrinker_nr_max) {
614 struct shrink_control sc = {
615 .gfp_mask = gfp_mask,
616 .nid = nid,
617 .memcg = memcg,
618 };
619 struct shrinker *shrinker;
620
621 shrinker = idr_find(&shrinker_idr, i);
622 if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
623 if (!shrinker)
624 clear_bit(i, map->map);
625 continue;
626 }
627
628 ret = do_shrink_slab(&sc, shrinker, priority);
629 if (ret == SHRINK_EMPTY) {
630 clear_bit(i, map->map);
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646 smp_mb__after_atomic();
647 ret = do_shrink_slab(&sc, shrinker, priority);
648 if (ret == SHRINK_EMPTY)
649 ret = 0;
650 else
651 memcg_set_shrinker_bit(memcg, nid, i);
652 }
653 freed += ret;
654
655 if (rwsem_is_contended(&shrinker_rwsem)) {
656 freed = freed ? : 1;
657 break;
658 }
659 }
660unlock:
661 up_read(&shrinker_rwsem);
662 return freed;
663}
664#else
665static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
666 struct mem_cgroup *memcg, int priority)
667{
668 return 0;
669}
670#endif
671
/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target.  Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority: the number of freeable objects is shifted
 * right by @priority to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
692static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
693 struct mem_cgroup *memcg,
694 int priority)
695{
696 unsigned long ret, freed = 0;
697 struct shrinker *shrinker;
698
699
700
701
702
703
704
705
706 if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
707 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
708
709 if (!down_read_trylock(&shrinker_rwsem))
710 goto out;
711
712 list_for_each_entry(shrinker, &shrinker_list, list) {
713 struct shrink_control sc = {
714 .gfp_mask = gfp_mask,
715 .nid = nid,
716 .memcg = memcg,
717 };
718
719 ret = do_shrink_slab(&sc, shrinker, priority);
720 if (ret == SHRINK_EMPTY)
721 ret = 0;
722 freed += ret;
723
724
725
726
727
728 if (rwsem_is_contended(&shrinker_rwsem)) {
729 freed = freed ? : 1;
730 break;
731 }
732 }
733
734 up_read(&shrinker_rwsem);
735out:
736 cond_resched();
737 return freed;
738}
739
740void drop_slab_node(int nid)
741{
742 unsigned long freed;
743
744 do {
745 struct mem_cgroup *memcg = NULL;
746
747 freed = 0;
748 memcg = mem_cgroup_iter(NULL, NULL, NULL);
749 do {
750 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
751 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
752 } while (freed > 10);
753}
754
755void drop_slab(void)
756{
757 int nid;
758
759 for_each_online_node(nid)
760 drop_slab_node(nid);
761}
762
763static inline int is_page_cache_freeable(struct page *page)
764{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache and optional buffer
	 * heads at page->private.
	 */
770 int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
771 HPAGE_PMD_NR : 1;
772 return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
773}
774
775static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
776{
777 if (current->flags & PF_SWAPWRITE)
778 return 1;
779 if (!inode_write_congested(inode))
780 return 1;
781 if (inode_to_bdi(inode) == current->backing_dev_info)
782 return 1;
783 return 0;
784}
785
/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller
 * has __GFP_FS.
 */
798static void handle_write_error(struct address_space *mapping,
799 struct page *page, int error)
800{
801 lock_page(page);
802 if (page_mapping(page) == mapping)
803 mapping_set_error(mapping, error);
804 unlock_page(page);
805}
806
/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
818
/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
823static pageout_t pageout(struct page *page, struct address_space *mapping,
824 struct scan_control *sc)
825{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().
	 *
	 * If this process is currently in __generic_file_write_iter()
	 * against this page's queue, we can perform writeback even if
	 * that will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.
	 */
842 if (!is_page_cache_freeable(page))
843 return PAGE_KEEP;
844 if (!mapping) {
845
846
847
848
849 if (page_has_private(page)) {
850 if (try_to_free_buffers(page)) {
851 ClearPageDirty(page);
852 pr_info("%s: orphaned page\n", __func__);
853 return PAGE_CLEAN;
854 }
855 }
856 return PAGE_KEEP;
857 }
858 if (mapping->a_ops->writepage == NULL)
859 return PAGE_ACTIVATE;
860 if (!may_write_to_inode(mapping->host, sc))
861 return PAGE_KEEP;
862
863 if (clear_page_dirty_for_io(page)) {
864 int res;
865 struct writeback_control wbc = {
866 .sync_mode = WB_SYNC_NONE,
867 .nr_to_write = SWAP_CLUSTER_MAX,
868 .range_start = 0,
869 .range_end = LLONG_MAX,
870 .for_reclaim = 1,
871 };
872
873 SetPageReclaim(page);
874 res = mapping->a_ops->writepage(page, &wbc);
875 if (res < 0)
876 handle_write_error(mapping, page, res);
877 if (res == AOP_WRITEPAGE_ACTIVATE) {
878 ClearPageReclaim(page);
879 return PAGE_ACTIVATE;
880 }
881
882 if (!PageWriteback(page)) {
883
884 ClearPageReclaim(page);
885 }
886 trace_mm_vmscan_writepage(page);
887 inc_node_page_state(page, NR_VMSCAN_WRITE);
888 return PAGE_SUCCESS;
889 }
890
891 return PAGE_CLEAN;
892}
893
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
898static int __remove_mapping(struct address_space *mapping, struct page *page,
899 bool reclaimed)
900{
901 unsigned long flags;
902 int refcount;
903
904 BUG_ON(!PageLocked(page));
905 BUG_ON(mapping != page_mapping(page));
906
907 xa_lock_irqsave(&mapping->i_pages, flags);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests.  When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference.  So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unobserved.  Freezing the refcount with page_ref_freeze()
	 * before testing PageDirty makes the dirtying visible, and the
	 * frozen refcount keeps other users from gaining a reference while
	 * the page is being removed from the mapping.
	 */
933 if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
934 refcount = 1 + HPAGE_PMD_NR;
935 else
936 refcount = 2;
937 if (!page_ref_freeze(page, refcount))
938 goto cannot_free;
939
940 if (unlikely(PageDirty(page))) {
941 page_ref_unfreeze(page, refcount);
942 goto cannot_free;
943 }
944
945 if (PageSwapCache(page)) {
946 swp_entry_t swap = { .val = page_private(page) };
947 mem_cgroup_swapout(page, swap);
948 __delete_from_swap_cache(page, swap);
949 xa_unlock_irqrestore(&mapping->i_pages, flags);
950 put_swap_page(page, swap);
951 } else {
952 void (*freepage)(struct page *);
953 void *shadow = NULL;
954
955 freepage = mapping->a_ops->freepage;
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972 if (reclaimed && page_is_file_cache(page) &&
973 !mapping_exiting(mapping) && !dax_mapping(mapping))
974 shadow = workingset_eviction(page);
975 __delete_from_page_cache(page, shadow);
976 xa_unlock_irqrestore(&mapping->i_pages, flags);
977
978 if (freepage != NULL)
979 freepage(page);
980 }
981
982 return 1;
983
984cannot_free:
985 xa_unlock_irqrestore(&mapping->i_pages, flags);
986 return 0;
987}
988
989
990
991
992
993
994
995int remove_mapping(struct address_space *mapping, struct page *page)
996{
997 if (__remove_mapping(mapping, page, false)) {
998
999
1000
1001
1002
1003 page_ref_unfreeze(page, 1);
1004 return 1;
1005 }
1006 return 0;
1007}
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018void putback_lru_page(struct page *page)
1019{
1020 lru_cache_add(page);
1021 put_page(page);
1022}
1023
1024enum page_references {
1025 PAGEREF_RECLAIM,
1026 PAGEREF_RECLAIM_CLEAN,
1027 PAGEREF_KEEP,
1028 PAGEREF_ACTIVATE,
1029};
1030
1031static enum page_references page_check_references(struct page *page,
1032 struct scan_control *sc)
1033{
1034 int referenced_ptes, referenced_page;
1035 unsigned long vm_flags;
1036
1037 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
1038 &vm_flags);
1039 referenced_page = TestClearPageReferenced(page);
1040
1041
1042
1043
1044
1045 if (vm_flags & VM_LOCKED)
1046 return PAGEREF_RECLAIM;
1047
1048 if (referenced_ptes) {
1049 if (PageSwapBacked(page))
1050 return PAGEREF_ACTIVATE;
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065 SetPageReferenced(page);
1066
1067 if (referenced_page || referenced_ptes > 1)
1068 return PAGEREF_ACTIVATE;
1069
1070
1071
1072
1073 if (vm_flags & VM_EXEC)
1074 return PAGEREF_ACTIVATE;
1075
1076 return PAGEREF_KEEP;
1077 }
1078
1079
1080 if (referenced_page && !PageSwapBacked(page))
1081 return PAGEREF_RECLAIM_CLEAN;
1082
1083 return PAGEREF_RECLAIM;
1084}
1085
1086
1087static void page_check_dirty_writeback(struct page *page,
1088 bool *dirty, bool *writeback)
1089{
1090 struct address_space *mapping;
1091
1092
1093
1094
1095
1096 if (!page_is_file_cache(page) ||
1097 (PageAnon(page) && !PageSwapBacked(page))) {
1098 *dirty = false;
1099 *writeback = false;
1100 return;
1101 }
1102
1103
1104 *dirty = PageDirty(page);
1105 *writeback = PageWriteback(page);
1106
1107
1108 if (!page_has_private(page))
1109 return;
1110
1111 mapping = page_mapping(page);
1112 if (mapping && mapping->a_ops->is_dirty_writeback)
1113 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1114}
1115
1116
1117
1118
1119static unsigned long shrink_page_list(struct list_head *page_list,
1120 struct pglist_data *pgdat,
1121 struct scan_control *sc,
1122 enum ttu_flags ttu_flags,
1123 struct reclaim_stat *stat,
1124 bool force_reclaim)
1125{
1126 LIST_HEAD(ret_pages);
1127 LIST_HEAD(free_pages);
1128 unsigned nr_reclaimed = 0;
1129 unsigned pgactivate = 0;
1130
1131 memset(stat, 0, sizeof(*stat));
1132 cond_resched();
1133
1134 while (!list_empty(page_list)) {
1135 struct address_space *mapping;
1136 struct page *page;
1137 int may_enter_fs;
1138 enum page_references references = PAGEREF_RECLAIM_CLEAN;
1139 bool dirty, writeback;
1140 unsigned int nr_pages;
1141
1142 cond_resched();
1143
1144 page = lru_to_page(page_list);
1145 list_del(&page->lru);
1146
1147 if (!trylock_page(page))
1148 goto keep;
1149
1150 VM_BUG_ON_PAGE(PageActive(page), page);
1151
1152 nr_pages = 1 << compound_order(page);
1153
1154
1155 sc->nr_scanned += nr_pages;
1156
1157 if (unlikely(!page_evictable(page)))
1158 goto activate_locked;
1159
1160 if (!sc->may_unmap && page_mapped(page))
1161 goto keep_locked;
1162
1163 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1164 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1165
1166
1167
1168
1169
1170
1171
1172 page_check_dirty_writeback(page, &dirty, &writeback);
1173 if (dirty || writeback)
1174 stat->nr_dirty++;
1175
1176 if (dirty && !writeback)
1177 stat->nr_unqueued_dirty++;
1178
1179
1180
1181
1182
1183
1184
1185 mapping = page_mapping(page);
1186 if (((dirty || writeback) && mapping &&
1187 inode_write_congested(mapping->host)) ||
1188 (writeback && PageReclaim(page)))
1189 stat->nr_congested++;
1190
		/*
		 * If a page at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) If reclaim is encountering an excessive number of pages
		 *    under writeback and this page is both under writeback
		 *    and PageReclaim, pages are being queued for IO but
		 *    recycled through the LRU before the IO can complete.
		 *    Waiting on the page itself risks an indefinite stall if
		 *    writeback is impossible (IO error, disconnected
		 *    storage), so instead note that the LRU is being scanned
		 *    too quickly and let the caller stall after the page
		 *    list has been processed.
		 *
		 * 2) Global or new memcg reclaim encounters a page that is
		 *    not marked for immediate reclaim, or the caller does
		 *    not have __GFP_FS (or __GFP_IO if it is simply going to
		 *    swap, not to fs).  In this case mark the page for
		 *    immediate reclaim and continue scanning.
		 *
		 * 3) Legacy memcg encounters a page that is already marked
		 *    PageReclaim.  memcg does not have any dirty page
		 *    throttling, so we could easily OOM just because too
		 *    many pages are in writeback and there is nothing else
		 *    to reclaim.  Wait for the writeback to complete.
		 *
		 * In cases 1) and 2) we activate the pages to get them out
		 * of the way while we continue scanning for clean pages on
		 * the inactive list and refilling from the active list.
		 * Waiting for disk writes here is more expensive than
		 * potentially causing reloads down the line.
		 */
1233 if (PageWriteback(page)) {
1234
1235 if (current_is_kswapd() &&
1236 PageReclaim(page) &&
1237 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1238 stat->nr_immediate++;
1239 goto activate_locked;
1240
1241
1242 } else if (sane_reclaim(sc) ||
1243 !PageReclaim(page) || !may_enter_fs) {
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255 SetPageReclaim(page);
1256 stat->nr_writeback++;
1257 goto activate_locked;
1258
1259
1260 } else {
1261 unlock_page(page);
1262 wait_on_page_writeback(page);
1263
1264 list_add_tail(&page->lru, page_list);
1265 continue;
1266 }
1267 }
1268
1269 if (!force_reclaim)
1270 references = page_check_references(page, sc);
1271
1272 switch (references) {
1273 case PAGEREF_ACTIVATE:
1274 goto activate_locked;
1275 case PAGEREF_KEEP:
1276 stat->nr_ref_keep += nr_pages;
1277 goto keep_locked;
1278 case PAGEREF_RECLAIM:
1279 case PAGEREF_RECLAIM_CLEAN:
1280 ;
1281 }
1282
1283
1284
1285
1286
1287
1288 if (PageAnon(page) && PageSwapBacked(page)) {
1289 if (!PageSwapCache(page)) {
1290 if (!(sc->gfp_mask & __GFP_IO))
1291 goto keep_locked;
1292 if (PageTransHuge(page)) {
1293
1294 if (!can_split_huge_page(page, NULL))
1295 goto activate_locked;
1296
1297
1298
1299
1300
1301 if (!compound_mapcount(page) &&
1302 split_huge_page_to_list(page,
1303 page_list))
1304 goto activate_locked;
1305 }
1306 if (!add_to_swap(page)) {
1307 if (!PageTransHuge(page))
1308 goto activate_locked_split;
1309
1310 if (split_huge_page_to_list(page,
1311 page_list))
1312 goto activate_locked;
1313#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1314 count_vm_event(THP_SWPOUT_FALLBACK);
1315#endif
1316 if (!add_to_swap(page))
1317 goto activate_locked_split;
1318 }
1319
1320 may_enter_fs = 1;
1321
1322
1323 mapping = page_mapping(page);
1324 }
1325 } else if (unlikely(PageTransHuge(page))) {
1326
1327 if (split_huge_page_to_list(page, page_list))
1328 goto keep_locked;
1329 }
1330
1331
1332
1333
1334
1335
1336
1337
1338 if ((nr_pages > 1) && !PageTransHuge(page)) {
1339 sc->nr_scanned -= (nr_pages - 1);
1340 nr_pages = 1;
1341 }
1342
1343
1344
1345
1346
1347 if (page_mapped(page)) {
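			/*
			 * The page is mapped into the page tables of one or
			 * more processes; try to unmap it here.
			 * TTU_BATCH_FLUSH lets try_to_unmap() defer TLB
			 * flushes so they can be batched and issued later by
			 * try_to_unmap_flush().
			 */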
1348 enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1349
1350 if (unlikely(PageTransHuge(page)))
1351 flags |= TTU_SPLIT_HUGE_PMD;
1352 if (!try_to_unmap(page, flags)) {
1353 stat->nr_unmap_fail += nr_pages;
1354 goto activate_locked;
1355 }
1356 }
1357
1358 if (PageDirty(page)) {
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369 if (page_is_file_cache(page) &&
1370 (!current_is_kswapd() || !PageReclaim(page) ||
1371 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1372
1373
1374
1375
1376
1377
1378 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1379 SetPageReclaim(page);
1380
1381 goto activate_locked;
1382 }
1383
1384 if (references == PAGEREF_RECLAIM_CLEAN)
1385 goto keep_locked;
1386 if (!may_enter_fs)
1387 goto keep_locked;
1388 if (!sc->may_writepage)
1389 goto keep_locked;
1390
1391
1392
1393
1394
1395
1396 try_to_unmap_flush_dirty();
1397 switch (pageout(page, mapping, sc)) {
1398 case PAGE_KEEP:
1399 goto keep_locked;
1400 case PAGE_ACTIVATE:
1401 goto activate_locked;
1402 case PAGE_SUCCESS:
1403 if (PageWriteback(page))
1404 goto keep;
1405 if (PageDirty(page))
1406 goto keep;
1407
1408
1409
1410
1411
1412 if (!trylock_page(page))
1413 goto keep;
1414 if (PageDirty(page) || PageWriteback(page))
1415 goto keep_locked;
1416 mapping = page_mapping(page);
1417 case PAGE_CLEAN:
1418 ;
1419 }
1420 }
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443 if (page_has_private(page)) {
1444 if (!try_to_release_page(page, sc->gfp_mask))
1445 goto activate_locked;
1446 if (!mapping && page_count(page) == 1) {
1447 unlock_page(page);
1448 if (put_page_testzero(page))
1449 goto free_it;
1450 else {
1451
1452
1453
1454
1455
1456
1457
1458 nr_reclaimed++;
1459 continue;
1460 }
1461 }
1462 }
1463
1464 if (PageAnon(page) && !PageSwapBacked(page)) {
1465
1466 if (!page_ref_freeze(page, 1))
1467 goto keep_locked;
1468 if (PageDirty(page)) {
1469 page_ref_unfreeze(page, 1);
1470 goto keep_locked;
1471 }
1472
1473 count_vm_event(PGLAZYFREED);
1474 count_memcg_page_event(page, PGLAZYFREED);
1475 } else if (!mapping || !__remove_mapping(mapping, page, true))
1476 goto keep_locked;
1477
1478 unlock_page(page);
1479free_it:
1480
1481
1482
1483
1484 nr_reclaimed += nr_pages;
1485
1486
1487
1488
1489
1490 if (unlikely(PageTransHuge(page))) {
1491 mem_cgroup_uncharge(page);
1492 (*get_compound_page_dtor(page))(page);
1493 } else
1494 list_add(&page->lru, &free_pages);
1495 continue;
1496
1497activate_locked_split:
1498
1499
1500
1501
1502 if (nr_pages > 1) {
1503 sc->nr_scanned -= (nr_pages - 1);
1504 nr_pages = 1;
1505 }
1506activate_locked:
1507
1508 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1509 PageMlocked(page)))
1510 try_to_free_swap(page);
1511 VM_BUG_ON_PAGE(PageActive(page), page);
1512 if (!PageMlocked(page)) {
1513 int type = page_is_file_cache(page);
1514 SetPageActive(page);
1515 stat->nr_activate[type] += nr_pages;
1516 count_memcg_page_event(page, PGACTIVATE);
1517 }
1518keep_locked:
1519 unlock_page(page);
1520keep:
1521 list_add(&page->lru, &ret_pages);
1522 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1523 }
1524
1525 pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1526
1527 mem_cgroup_uncharge_list(&free_pages);
1528 try_to_unmap_flush();
1529 free_unref_page_list(&free_pages);
1530
1531 list_splice(&ret_pages, page_list);
1532 count_vm_events(PGACTIVATE, pgactivate);
1533
1534 return nr_reclaimed;
1535}
1536
1537unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1538 struct list_head *page_list)
1539{
1540 struct scan_control sc = {
1541 .gfp_mask = GFP_KERNEL,
1542 .priority = DEF_PRIORITY,
1543 .may_unmap = 1,
1544 };
1545 struct reclaim_stat dummy_stat;
1546 unsigned long ret;
1547 struct page *page, *next;
1548 LIST_HEAD(clean_pages);
1549
1550 list_for_each_entry_safe(page, next, page_list, lru) {
1551 if (page_is_file_cache(page) && !PageDirty(page) &&
1552 !__PageMovable(page) && !PageUnevictable(page)) {
1553 ClearPageActive(page);
1554 list_move(&page->lru, &clean_pages);
1555 }
1556 }
1557
1558 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1559 TTU_IGNORE_ACCESS, &dummy_stat, true);
1560 list_splice(&clean_pages, page_list);
1561 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
1562 return ret;
1563}
1564
/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes
 *
 * returns 0 on success, -ve errno on failure.
 */
1575int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1576{
1577 int ret = -EINVAL;
1578
1579
1580 if (!PageLRU(page))
1581 return ret;
1582
1583
1584 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1585 return ret;
1586
1587 ret = -EBUSY;
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597 if (mode & ISOLATE_ASYNC_MIGRATE) {
1598
1599 if (PageWriteback(page))
1600 return ret;
1601
1602 if (PageDirty(page)) {
1603 struct address_space *mapping;
1604 bool migrate_dirty;
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615 if (!trylock_page(page))
1616 return ret;
1617
1618 mapping = page_mapping(page);
1619 migrate_dirty = !mapping || mapping->a_ops->migratepage;
1620 unlock_page(page);
1621 if (!migrate_dirty)
1622 return ret;
1623 }
1624 }
1625
1626 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1627 return ret;
1628
1629 if (likely(get_page_unless_zero(page))) {
1630
1631
1632
1633
1634
1635 ClearPageLRU(page);
1636 ret = 0;
1637 }
1638
1639 return ret;
1640}
1641
1642
1643
1644
1645
1646
1647static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1648 enum lru_list lru, unsigned long *nr_zone_taken)
1649{
1650 int zid;
1651
1652 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1653 if (!nr_zone_taken[zid])
1654 continue;
1655
1656 __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1657#ifdef CONFIG_MEMCG
1658 mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1659#endif
1660 }
1661
1662}
1663
/*
 * isolate_lru_pages() - isolate pages onto @dst.
 *
 * The LRU lock can be heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session.
 * @lru:	LRU list id for isolating.
 *
 * returns how many pages were moved onto *@dst.
 */
1684static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1685 struct lruvec *lruvec, struct list_head *dst,
1686 unsigned long *nr_scanned, struct scan_control *sc,
1687 enum lru_list lru)
1688{
1689 struct list_head *src = &lruvec->lists[lru];
1690 unsigned long nr_taken = 0;
1691 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1692 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1693 unsigned long skipped = 0;
1694 unsigned long scan, total_scan, nr_pages;
1695 LIST_HEAD(pages_skipped);
1696 isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
1697
1698 total_scan = 0;
1699 scan = 0;
1700 while (scan < nr_to_scan && !list_empty(src)) {
1701 struct page *page;
1702
1703 page = lru_to_page(src);
1704 prefetchw_prev_lru_page(page, src, flags);
1705
1706 VM_BUG_ON_PAGE(!PageLRU(page), page);
1707
1708 nr_pages = 1 << compound_order(page);
1709 total_scan += nr_pages;
1710
1711 if (page_zonenum(page) > sc->reclaim_idx) {
1712 list_move(&page->lru, &pages_skipped);
1713 nr_skipped[page_zonenum(page)] += nr_pages;
1714 continue;
1715 }
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727 scan += nr_pages;
1728 switch (__isolate_lru_page(page, mode)) {
1729 case 0:
1730 nr_taken += nr_pages;
1731 nr_zone_taken[page_zonenum(page)] += nr_pages;
1732 list_move(&page->lru, dst);
1733 break;
1734
1735 case -EBUSY:
1736
1737 list_move(&page->lru, src);
1738 continue;
1739
1740 default:
1741 BUG();
1742 }
1743 }
1744
1745
1746
1747
1748
1749
1750
1751
1752 if (!list_empty(&pages_skipped)) {
1753 int zid;
1754
1755 list_splice(&pages_skipped, src);
1756 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1757 if (!nr_skipped[zid])
1758 continue;
1759
1760 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1761 skipped += nr_skipped[zid];
1762 }
1763 }
1764 *nr_scanned = total_scan;
1765 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1766 total_scan, skipped, nr_taken, mode, lru);
1767 update_lru_sizes(lruvec, lru, nr_zone_taken);
1768 return nr_taken;
1769}
1770
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 *
 * (1) Must be called with an elevated refcount on the page.  This is a
 *     fundamental difference from isolate_lru_pages() (which is called
 *     without a stable reference).
 * (2) The lru_lock must not be held.
 * (3) Interrupts must be enabled.
 */
1797int isolate_lru_page(struct page *page)
1798{
1799 int ret = -EBUSY;
1800
1801 VM_BUG_ON_PAGE(!page_count(page), page);
1802 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1803
1804 if (PageLRU(page)) {
1805 pg_data_t *pgdat = page_pgdat(page);
1806 struct lruvec *lruvec;
1807
1808 spin_lock_irq(&pgdat->lru_lock);
1809 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1810 if (PageLRU(page)) {
1811 int lru = page_lru(page);
1812 get_page(page);
1813 ClearPageLRU(page);
1814 del_page_from_lru_list(page, lruvec, lru);
1815 ret = 0;
1816 }
1817 spin_unlock_irq(&pgdat->lru_lock);
1818 }
1819 return ret;
1820}
1821
1822
1823
1824
1825
1826
1827
1828
1829static int too_many_isolated(struct pglist_data *pgdat, int file,
1830 struct scan_control *sc)
1831{
1832 unsigned long inactive, isolated;
1833
1834 if (current_is_kswapd())
1835 return 0;
1836
1837 if (!sane_reclaim(sc))
1838 return 0;
1839
1840 if (file) {
1841 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1842 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1843 } else {
1844 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1845 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1846 }
1847
1848
1849
1850
1851
1852
1853 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1854 inactive >>= 3;
1855
1856 return isolated > inactive;
1857}
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
1880 struct list_head *list)
1881{
1882 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1883 int nr_pages, nr_moved = 0;
1884 LIST_HEAD(pages_to_free);
1885 struct page *page;
1886 enum lru_list lru;
1887
1888 while (!list_empty(list)) {
1889 page = lru_to_page(list);
1890 VM_BUG_ON_PAGE(PageLRU(page), page);
1891 if (unlikely(!page_evictable(page))) {
1892 list_del(&page->lru);
1893 spin_unlock_irq(&pgdat->lru_lock);
1894 putback_lru_page(page);
1895 spin_lock_irq(&pgdat->lru_lock);
1896 continue;
1897 }
1898 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1899
1900 SetPageLRU(page);
1901 lru = page_lru(page);
1902
1903 nr_pages = hpage_nr_pages(page);
1904 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
1905 list_move(&page->lru, &lruvec->lists[lru]);
1906
1907 if (put_page_testzero(page)) {
1908 __ClearPageLRU(page);
1909 __ClearPageActive(page);
1910 del_page_from_lru_list(page, lruvec, lru);
1911
1912 if (unlikely(PageCompound(page))) {
1913 spin_unlock_irq(&pgdat->lru_lock);
1914 mem_cgroup_uncharge(page);
1915 (*get_compound_page_dtor(page))(page);
1916 spin_lock_irq(&pgdat->lru_lock);
1917 } else
1918 list_add(&page->lru, &pages_to_free);
1919 } else {
1920 nr_moved += nr_pages;
1921 }
1922 }
1923
1924
1925
1926
1927 list_splice(&pages_to_free, list);
1928
1929 return nr_moved;
1930}
1931
1932
1933
1934
1935
1936
1937
1938static int current_may_throttle(void)
1939{
1940 return !(current->flags & PF_LESS_THROTTLE) ||
1941 current->backing_dev_info == NULL ||
1942 bdi_write_congested(current->backing_dev_info);
1943}
1944
1945
1946
1947
1948
1949static noinline_for_stack unsigned long
1950shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1951 struct scan_control *sc, enum lru_list lru)
1952{
1953 LIST_HEAD(page_list);
1954 unsigned long nr_scanned;
1955 unsigned long nr_reclaimed = 0;
1956 unsigned long nr_taken;
1957 struct reclaim_stat stat;
1958 int file = is_file_lru(lru);
1959 enum vm_event_item item;
1960 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1961 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1962 bool stalled = false;
1963
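	/*
	 * Throttle direct reclaimers when too many pages are already
	 * isolated from this LRU: sleep briefly (at most once) to let them
	 * drain.  A fatal signal makes us return SWAP_CLUSTER_MAX so the
	 * caller can bail out as if progress had been made.
	 */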
1964 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1965 if (stalled)
1966 return 0;
1967
1968
1969 msleep(100);
1970 stalled = true;
1971
1972
1973 if (fatal_signal_pending(current))
1974 return SWAP_CLUSTER_MAX;
1975 }
1976
1977 lru_add_drain();
1978
1979 spin_lock_irq(&pgdat->lru_lock);
1980
1981 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1982 &nr_scanned, sc, lru);
1983
1984 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1985 reclaim_stat->recent_scanned[file] += nr_taken;
1986
1987 item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
1988 if (global_reclaim(sc))
1989 __count_vm_events(item, nr_scanned);
1990 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
1991 spin_unlock_irq(&pgdat->lru_lock);
1992
1993 if (nr_taken == 0)
1994 return 0;
1995
1996 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
1997 &stat, false);
1998
1999 spin_lock_irq(&pgdat->lru_lock);
2000
2001 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
2002 if (global_reclaim(sc))
2003 __count_vm_events(item, nr_reclaimed);
2004 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
2005 reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
2006 reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
2007
2008 move_pages_to_lru(lruvec, &page_list);
2009
2010 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2011
2012 spin_unlock_irq(&pgdat->lru_lock);
2013
2014 mem_cgroup_uncharge_list(&page_list);
2015 free_unref_page_list(&page_list);
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
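	/*
	 * If the whole batch of isolated pages was dirty but none of it was
	 * queued for writeback, the flusher threads are not keeping up with
	 * dirty pages reaching the tail of the LRU; kick them so reclaim is
	 * not stuck doing page-by-page writeback.
	 */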
2028 if (stat.nr_unqueued_dirty == nr_taken)
2029 wakeup_flusher_threads(WB_REASON_VMSCAN);
2030
2031 sc->nr.dirty += stat.nr_dirty;
2032 sc->nr.congested += stat.nr_congested;
2033 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2034 sc->nr.writeback += stat.nr_writeback;
2035 sc->nr.immediate += stat.nr_immediate;
2036 sc->nr.taken += nr_taken;
2037 if (file)
2038 sc->nr.file_taken += nr_taken;
2039
2040 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2041 nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2042 return nr_reclaimed;
2043}
2044
2045static void shrink_active_list(unsigned long nr_to_scan,
2046 struct lruvec *lruvec,
2047 struct scan_control *sc,
2048 enum lru_list lru)
2049{
2050 unsigned long nr_taken;
2051 unsigned long nr_scanned;
2052 unsigned long vm_flags;
2053 LIST_HEAD(l_hold);
2054 LIST_HEAD(l_active);
2055 LIST_HEAD(l_inactive);
2056 struct page *page;
2057 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2058 unsigned nr_deactivate, nr_activate;
2059 unsigned nr_rotated = 0;
2060 int file = is_file_lru(lru);
2061 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2062
2063 lru_add_drain();
2064
2065 spin_lock_irq(&pgdat->lru_lock);
2066
2067 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2068 &nr_scanned, sc, lru);
2069
2070 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2071 reclaim_stat->recent_scanned[file] += nr_taken;
2072
2073 __count_vm_events(PGREFILL, nr_scanned);
2074 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2075
2076 spin_unlock_irq(&pgdat->lru_lock);
2077
2078 while (!list_empty(&l_hold)) {
2079 cond_resched();
2080 page = lru_to_page(&l_hold);
2081 list_del(&page->lru);
2082
2083 if (unlikely(!page_evictable(page))) {
2084 putback_lru_page(page);
2085 continue;
2086 }
2087
2088 if (unlikely(buffer_heads_over_limit)) {
2089 if (page_has_private(page) && trylock_page(page)) {
2090 if (page_has_private(page))
2091 try_to_release_page(page, 0);
2092 unlock_page(page);
2093 }
2094 }
2095
2096 if (page_referenced(page, 0, sc->target_mem_cgroup,
2097 &vm_flags)) {
2098 nr_rotated += hpage_nr_pages(page);
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
2109 list_add(&page->lru, &l_active);
2110 continue;
2111 }
2112 }
2113
2114 ClearPageActive(page);
2115 SetPageWorkingset(page);
2116 list_add(&page->lru, &l_inactive);
2117 }
2118
2119
2120
2121
2122 spin_lock_irq(&pgdat->lru_lock);
2123
2124
2125
2126
2127
2128
2129 reclaim_stat->recent_rotated[file] += nr_rotated;
2130
2131 nr_activate = move_pages_to_lru(lruvec, &l_active);
2132 nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2133
2134 list_splice(&l_inactive, &l_active);
2135
2136 __count_vm_events(PGDEACTIVATE, nr_deactivate);
2137 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2138
2139 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2140 spin_unlock_irq(&pgdat->lru_lock);
2141
2142 mem_cgroup_uncharge_list(&l_active);
2143 free_unref_page_list(&l_active);
2144 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2145 nr_deactivate, nr_rotated, sc->priority, file);
2146}
2147
/*
 * The inactive anon list should be small enough that the VM never has
 * to do too much work.
 *
 * The inactive file list should be small enough to leave most memory
 * to the established workingset on the scan-resistant active list,
 * but large enough to avoid thrashing the aggregate readahead window.
 *
 * Both inactive lists should also be large enough that each inactive
 * page has a chance to be referenced again before it is reclaimed.
 *
 * If that fails and refaulting is observed, the inactive list grows.
 *
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
 * on this LRU, maintained by the pageout code: with a workingset of
 * (inactive + active) gigabytes the ratio is int_sqrt(10 * gb), so
 * roughly 1:1 below 1GB, about 3:1 at 1GB, 10:1 at 10GB, and so on.
 */
2176static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2177 struct scan_control *sc, bool trace)
2178{
2179 enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
2180 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2181 enum lru_list inactive_lru = file * LRU_FILE;
2182 unsigned long inactive, active;
2183 unsigned long inactive_ratio;
2184 unsigned long refaults;
2185 unsigned long gb;
2186
2187
2188
2189
2190
2191 if (!file && !total_swap_pages)
2192 return false;
2193
2194 inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
2195 active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
2196
2197
2198
2199
2200
2201
2202 refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
2203 if (file && lruvec->refaults != refaults) {
2204 inactive_ratio = 0;
2205 } else {
2206 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2207 if (gb)
2208 inactive_ratio = int_sqrt(10 * gb);
2209 else
2210 inactive_ratio = 1;
2211 }
2212
2213 if (trace)
2214 trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
2215 lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
2216 lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
2217 inactive_ratio, file);
2218
2219 return inactive * inactive_ratio < active;
2220}
2221
2222static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2223 struct lruvec *lruvec, struct scan_control *sc)
2224{
2225 if (is_active_lru(lru)) {
2226 if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
2227 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2228 return 0;
2229 }
2230
2231 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2232}
2233
2234enum scan_balance {
2235 SCAN_EQUAL,
2236 SCAN_FRACT,
2237 SCAN_ANON,
2238 SCAN_FILE,
2239};
2240
/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned that we did rotate
 * back onto the active list instead of evicting.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
2250static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2251 struct scan_control *sc, unsigned long *nr,
2252 unsigned long *lru_pages)
2253{
2254 int swappiness = mem_cgroup_swappiness(memcg);
2255 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2256 u64 fraction[2];
2257 u64 denominator = 0;
2258 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2259 unsigned long anon_prio, file_prio;
2260 enum scan_balance scan_balance;
2261 unsigned long anon, file;
2262 unsigned long ap, fp;
2263 enum lru_list lru;
2264
2265
2266 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2267 scan_balance = SCAN_FILE;
2268 goto out;
2269 }
2270
2271
2272
2273
2274
2275
2276
2277
2278 if (!global_reclaim(sc) && !swappiness) {
2279 scan_balance = SCAN_FILE;
2280 goto out;
2281 }
2282
2283
2284
2285
2286
2287
2288 if (!sc->priority && swappiness) {
2289 scan_balance = SCAN_EQUAL;
2290 goto out;
2291 }
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302 if (global_reclaim(sc)) {
2303 unsigned long pgdatfile;
2304 unsigned long pgdatfree;
2305 int z;
2306 unsigned long total_high_wmark = 0;
2307
2308 pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2309 pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
2310 node_page_state(pgdat, NR_INACTIVE_FILE);
2311
2312 for (z = 0; z < MAX_NR_ZONES; z++) {
2313 struct zone *zone = &pgdat->node_zones[z];
2314 if (!managed_zone(zone))
2315 continue;
2316
2317 total_high_wmark += high_wmark_pages(zone);
2318 }
2319
2320 if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
2321
2322
2323
2324
2325
2326 if (!inactive_list_is_low(lruvec, false, sc, false) &&
2327 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
2328 >> sc->priority) {
2329 scan_balance = SCAN_ANON;
2330 goto out;
2331 }
2332 }
2333 }
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344 if (!inactive_list_is_low(lruvec, true, sc, false) &&
2345 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
2346 scan_balance = SCAN_FILE;
2347 goto out;
2348 }
2349
2350 scan_balance = SCAN_FRACT;
2351
2352
2353
2354
2355
2356 anon_prio = swappiness;
2357 file_prio = 200 - anon_prio;
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
2372 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
2373 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
2374 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
2375
2376 spin_lock_irq(&pgdat->lru_lock);
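	/*
	 * Age the recent_scanned/recent_rotated counters: once the scanned
	 * count grows past a quarter of the LRU size, halve both so the
	 * rotation ratios below reflect recent behaviour rather than the
	 * whole history of the list.
	 */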
2377 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
2378 reclaim_stat->recent_scanned[0] /= 2;
2379 reclaim_stat->recent_rotated[0] /= 2;
2380 }
2381
2382 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
2383 reclaim_stat->recent_scanned[1] /= 2;
2384 reclaim_stat->recent_rotated[1] /= 2;
2385 }
2386
2387
2388
2389
2390
2391
2392 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
2393 ap /= reclaim_stat->recent_rotated[0] + 1;
2394
2395 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
2396 fp /= reclaim_stat->recent_rotated[1] + 1;
2397 spin_unlock_irq(&pgdat->lru_lock);
2398
2399 fraction[0] = ap;
2400 fraction[1] = fp;
2401 denominator = ap + fp + 1;
2402out:
2403 *lru_pages = 0;
2404 for_each_evictable_lru(lru) {
2405 int file = is_file_lru(lru);
2406 unsigned long size;
2407 unsigned long scan;
2408
2409 size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2410 scan = size >> sc->priority;
2411
2412
2413
2414
2415 if (!scan && !mem_cgroup_online(memcg))
2416 scan = min(size, SWAP_CLUSTER_MAX);
2417
2418 switch (scan_balance) {
2419 case SCAN_EQUAL:
2420
2421 break;
2422 case SCAN_FRACT:
2423
2424
2425
2426
2427
2428
2429 scan = DIV64_U64_ROUND_UP(scan * fraction[file],
2430 denominator);
2431 break;
2432 case SCAN_FILE:
2433 case SCAN_ANON:
2434
2435 if ((scan_balance == SCAN_FILE) != file) {
2436 size = 0;
2437 scan = 0;
2438 }
2439 break;
2440 default:
2441
2442 BUG();
2443 }
2444
2445 *lru_pages += size;
2446 nr[lru] = scan;
2447 }
2448}
2449
2450
2451
2452
2453static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
2454 struct scan_control *sc, unsigned long *lru_pages)
2455{
2456 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2457 unsigned long nr[NR_LRU_LISTS];
2458 unsigned long targets[NR_LRU_LISTS];
2459 unsigned long nr_to_scan;
2460 enum lru_list lru;
2461 unsigned long nr_reclaimed = 0;
2462 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2463 struct blk_plug plug;
2464 bool scan_adjusted;
2465
2466 get_scan_count(lruvec, memcg, sc, nr, lru_pages);
2467
2468
2469 memcpy(targets, nr, sizeof(nr));
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
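	/*
	 * For global direct reclaim (not kswapd) at DEF_PRIORITY, start with
	 * scan_adjusted set so the proportional re-targeting below is skipped
	 * and the full scan targets are used, rather than rebalancing the
	 * moment nr_to_reclaim pages have been reclaimed.
	 */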
2482 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2483 sc->priority == DEF_PRIORITY);
2484
2485 blk_start_plug(&plug);
2486 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2487 nr[LRU_INACTIVE_FILE]) {
2488 unsigned long nr_anon, nr_file, percentage;
2489 unsigned long nr_scanned;
2490
2491 for_each_evictable_lru(lru) {
2492 if (nr[lru]) {
2493 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2494 nr[lru] -= nr_to_scan;
2495
2496 nr_reclaimed += shrink_list(lru, nr_to_scan,
2497 lruvec, sc);
2498 }
2499 }
2500
2501 cond_resched();
2502
2503 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2504 continue;
2505
2506
2507
2508
2509
2510
2511
2512
2513 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2514 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2515
2516
2517
2518
2519
2520
2521
2522 if (!nr_file || !nr_anon)
2523 break;
2524
2525 if (nr_file > nr_anon) {
2526 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2527 targets[LRU_ACTIVE_ANON] + 1;
2528 lru = LRU_BASE;
2529 percentage = nr_anon * 100 / scan_target;
2530 } else {
2531 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2532 targets[LRU_ACTIVE_FILE] + 1;
2533 lru = LRU_FILE;
2534 percentage = nr_file * 100 / scan_target;
2535 }
2536
2537
2538 nr[lru] = 0;
2539 nr[lru + LRU_ACTIVE] = 0;
2540
2541
2542
2543
2544
2545 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2546 nr_scanned = targets[lru] - nr[lru];
2547 nr[lru] = targets[lru] * (100 - percentage) / 100;
2548 nr[lru] -= min(nr[lru], nr_scanned);
2549
2550 lru += LRU_ACTIVE;
2551 nr_scanned = targets[lru] - nr[lru];
2552 nr[lru] = targets[lru] * (100 - percentage) / 100;
2553 nr[lru] -= min(nr[lru], nr_scanned);
2554
2555 scan_adjusted = true;
2556 }
2557 blk_finish_plug(&plug);
2558 sc->nr_reclaimed += nr_reclaimed;
2559
2560
2561
2562
2563
2564 if (inactive_list_is_low(lruvec, false, sc, true))
2565 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2566 sc, LRU_ACTIVE_ANON);
2567}
2568
2569
2570static bool in_reclaim_compaction(struct scan_control *sc)
2571{
2572 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2573 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2574 sc->priority < DEF_PRIORITY - 2))
2575 return true;
2576
2577 return false;
2578}
2579
2580
2581
2582
2583
2584
2585
2586
2587static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2588 unsigned long nr_reclaimed,
2589 unsigned long nr_scanned,
2590 struct scan_control *sc)
2591{
2592 unsigned long pages_for_compaction;
2593 unsigned long inactive_lru_pages;
2594 int z;
2595
2596
2597 if (!in_reclaim_compaction(sc))
2598 return false;
2599
2600
2601 if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
2602
2603
2604
2605
2606
2607
2608 if (!nr_reclaimed && !nr_scanned)
2609 return false;
2610 } else {
2611
2612
2613
2614
2615
2616
2617
2618
2619 if (!nr_reclaimed)
2620 return false;
2621 }
2622
2623
2624
2625
2626
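	/*
	 * If we have not reclaimed enough pages for compaction and the
	 * inactive lists are still large enough, continue reclaiming.
	 */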
2627 pages_for_compaction = compact_gap(sc->order);
2628 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2629 if (get_nr_swap_pages() > 0)
2630 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2631 if (sc->nr_reclaimed < pages_for_compaction &&
2632 inactive_lru_pages > pages_for_compaction)
2633 return true;
2634
2635
2636 for (z = 0; z <= sc->reclaim_idx; z++) {
2637 struct zone *zone = &pgdat->node_zones[z];
2638 if (!managed_zone(zone))
2639 continue;
2640
2641 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2642 case COMPACT_SUCCESS:
2643 case COMPACT_CONTINUE:
2644 return false;
2645 default:
2646
2647 ;
2648 }
2649 }
2650 return true;
2651}
2652
2653static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
2654{
2655 return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
2656 (memcg && memcg_congested(pgdat, memcg));
2657}
2658
2659static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2660{
2661 struct reclaim_state *reclaim_state = current->reclaim_state;
2662 unsigned long nr_reclaimed, nr_scanned;
2663 bool reclaimable = false;
2664
2665 do {
2666 struct mem_cgroup *root = sc->target_mem_cgroup;
2667 struct mem_cgroup_reclaim_cookie reclaim = {
2668 .pgdat = pgdat,
2669 .priority = sc->priority,
2670 };
2671 unsigned long node_lru_pages = 0;
2672 struct mem_cgroup *memcg;
2673
2674 memset(&sc->nr, 0, sizeof(sc->nr));
2675
2676 nr_reclaimed = sc->nr_reclaimed;
2677 nr_scanned = sc->nr_scanned;
2678
2679 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2680 do {
2681 unsigned long lru_pages;
2682 unsigned long reclaimed;
2683 unsigned long scanned;
2684
2685 switch (mem_cgroup_protected(root, memcg)) {
2686 case MEMCG_PROT_MIN:
2687
2688
2689
2690
2691 continue;
2692 case MEMCG_PROT_LOW:
2693
2694
2695
2696
2697
2698
2699 if (!sc->memcg_low_reclaim) {
2700 sc->memcg_low_skipped = 1;
2701 continue;
2702 }
2703 memcg_memory_event(memcg, MEMCG_LOW);
2704 break;
2705 case MEMCG_PROT_NONE:
2706 break;
2707 }
2708
2709 reclaimed = sc->nr_reclaimed;
2710 scanned = sc->nr_scanned;
2711 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2712 node_lru_pages += lru_pages;
2713
2714 shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
2715 sc->priority);
2716
2717
2718 vmpressure(sc->gfp_mask, memcg, false,
2719 sc->nr_scanned - scanned,
2720 sc->nr_reclaimed - reclaimed);
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731 if (!current_is_kswapd() &&
2732 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2733 mem_cgroup_iter_break(root, memcg);
2734 break;
2735 }
2736 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
2737
2738 if (reclaim_state) {
2739 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2740 reclaim_state->reclaimed_slab = 0;
2741 }
2742
2743
2744 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2745 sc->nr_scanned - nr_scanned,
2746 sc->nr_reclaimed - nr_reclaimed);
2747
2748 if (sc->nr_reclaimed - nr_reclaimed)
2749 reclaimable = true;
2750
2751 if (current_is_kswapd()) {
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
2770 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
2771
2772
2773
2774
2775
2776
2777 if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2778 set_bit(PGDAT_CONGESTED, &pgdat->flags);
2779
2780
2781 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
2782 set_bit(PGDAT_DIRTY, &pgdat->flags);
2783
2784
2785
2786
2787
2788
2789
2790 if (sc->nr.immediate)
2791 congestion_wait(BLK_RW_ASYNC, HZ/10);
2792 }
2793
2794
2795
2796
2797
2798 if (!global_reclaim(sc) && sane_reclaim(sc) &&
2799 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2800 set_memcg_congestion(pgdat, root, true);
2801
2802
2803
2804
2805
2806
2807
2808 if (!sc->hibernation_mode && !current_is_kswapd() &&
2809 current_may_throttle() && pgdat_memcg_congested(pgdat, root))
2810 wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2811
2812 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2813 sc->nr_scanned - nr_scanned, sc));
2814
2815
2816
2817
2818
2819
2820
2821 if (reclaimable)
2822 pgdat->kswapd_failures = 0;
2823
2824 return reclaimable;
2825}
2826
2827
2828
2829
2830
2831
2832static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2833{
2834 unsigned long watermark;
2835 enum compact_result suitable;
2836
2837 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2838 if (suitable == COMPACT_SUCCESS)
2839
2840 return true;
2841 if (suitable == COMPACT_SKIPPED)
2842
2843 return false;
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
2855
2856 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2857}
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2868{
2869 struct zoneref *z;
2870 struct zone *zone;
2871 unsigned long nr_soft_reclaimed;
2872 unsigned long nr_soft_scanned;
2873 gfp_t orig_mask;
2874 pg_data_t *last_pgdat = NULL;
2875
2876
2877
2878
2879
2880
2881 orig_mask = sc->gfp_mask;
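	/*
	 * If the number of buffer_heads exceeds the maximum allowed level,
	 * widen the gfp mask so that highmem zones are also scanned: highmem
	 * page cache tends to pin the buffer heads that need to be freed.
	 */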
2882 if (buffer_heads_over_limit) {
2883 sc->gfp_mask |= __GFP_HIGHMEM;
2884 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2885 }
2886
2887 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2888 sc->reclaim_idx, sc->nodemask) {
2889
2890
2891
2892
2893 if (global_reclaim(sc)) {
2894 if (!cpuset_zone_allowed(zone,
2895 GFP_KERNEL | __GFP_HARDWALL))
2896 continue;
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907 if (IS_ENABLED(CONFIG_COMPACTION) &&
2908 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2909 compaction_ready(zone, sc)) {
2910 sc->compaction_ready = true;
2911 continue;
2912 }
2913
2914
2915
2916
2917
2918
2919
2920 if (zone->zone_pgdat == last_pgdat)
2921 continue;
2922
2923
2924
2925
2926
2927
2928
2929 nr_soft_scanned = 0;
2930 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2931 sc->order, sc->gfp_mask,
2932 &nr_soft_scanned);
2933 sc->nr_reclaimed += nr_soft_reclaimed;
2934 sc->nr_scanned += nr_soft_scanned;
2935
2936 }
2937
2938
2939 if (zone->zone_pgdat == last_pgdat)
2940 continue;
2941 last_pgdat = zone->zone_pgdat;
2942 shrink_node(zone->zone_pgdat, sc);
2943 }
2944
2945
2946
2947
2948
2949 sc->gfp_mask = orig_mask;
2950}
2951
2952static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
2953{
2954 struct mem_cgroup *memcg;
2955
2956 memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
2957 do {
2958 unsigned long refaults;
2959 struct lruvec *lruvec;
2960
2961 lruvec = mem_cgroup_lruvec(pgdat, memcg);
2962 refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
2963 lruvec->refaults = refaults;
2964 } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
2965}
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
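/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */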
2983static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2984 struct scan_control *sc)
2985{
2986 int initial_priority = sc->priority;
2987 pg_data_t *last_pgdat;
2988 struct zoneref *z;
2989 struct zone *zone;
2990retry:
2991 delayacct_freepages_start();
2992
2993 if (global_reclaim(sc))
2994 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
2995
2996 do {
2997 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2998 sc->priority);
2999 sc->nr_scanned = 0;
3000 shrink_zones(zonelist, sc);
3001
3002 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3003 break;
3004
3005 if (sc->compaction_ready)
3006 break;
3007
3008
3009
3010
3011
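		/*
		 * If we're getting trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */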
3012 if (sc->priority < DEF_PRIORITY - 2)
3013 sc->may_writepage = 1;
3014 } while (--sc->priority >= 0);
3015
3016 last_pgdat = NULL;
3017 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3018 sc->nodemask) {
3019 if (zone->zone_pgdat == last_pgdat)
3020 continue;
3021 last_pgdat = zone->zone_pgdat;
3022 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3023 set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
3024 }
3025
3026 delayacct_freepages_end();
3027
3028 if (sc->nr_reclaimed)
3029 return sc->nr_reclaimed;
3030
3031
3032 if (sc->compaction_ready)
3033 return 1;
3034
3035
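	/* Untapped cgroup reserves?  Don't OOM, retry. */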
3036 if (sc->memcg_low_skipped) {
3037 sc->priority = initial_priority;
3038 sc->memcg_low_reclaim = 1;
3039 sc->memcg_low_skipped = 0;
3040 goto retry;
3041 }
3042
3043 return 0;
3044}
3045
3046static bool allow_direct_reclaim(pg_data_t *pgdat)
3047{
3048 struct zone *zone;
3049 unsigned long pfmemalloc_reserve = 0;
3050 unsigned long free_pages = 0;
3051 int i;
3052 bool wmark_ok;
3053
3054 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3055 return true;
3056
3057 for (i = 0; i <= ZONE_NORMAL; i++) {
3058 zone = &pgdat->node_zones[i];
3059 if (!managed_zone(zone))
3060 continue;
3061
3062 if (!zone_reclaimable_pages(zone))
3063 continue;
3064
3065 pfmemalloc_reserve += min_wmark_pages(zone);
3066 free_pages += zone_page_state(zone, NR_FREE_PAGES);
3067 }
3068
3069
3070 if (!pfmemalloc_reserve)
3071 return true;
3072
3073 wmark_ok = free_pages > pfmemalloc_reserve / 2;
3074
3075
3076 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3077 pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
3078 (enum zone_type)ZONE_NORMAL);
3079 wake_up_interruptible(&pgdat->kswapd_wait);
3080 }
3081
3082 return wmark_ok;
3083}
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
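/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */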
3094static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3095 nodemask_t *nodemask)
3096{
3097 struct zoneref *z;
3098 struct zone *zone;
3099 pg_data_t *pgdat = NULL;
3100
3101
3102
3103
3104
3105
3106
3107
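	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction where throttling it would force other
	 * processes to block on log_wait_commit().
	 */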
3108 if (current->flags & PF_KTHREAD)
3109 goto out;
3110
3111
3112
3113
3114
3115 if (fatal_signal_pending(current))
3116 goto out;
3117
	/*
	 * Check if the pfmemalloc reserves are ok by finding the first node
	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
	 * GFP_KERNEL will be required for allocating network buffers when
	 * swapping over the network so ZONE_HIGHMEM is unusable.
	 *
	 * Throttling is based on the first usable node and throttled processes
	 * wait on a queue until kswapd makes progress and wakes them. There
	 * is an affinity then between processes waking up and where reclaim
	 * progressed.
	 *
	 * If there are applications that are active memory-allocators
	 * (most normal use), this basically shouldn't matter.
	 */
3132 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3133 gfp_zone(gfp_mask), nodemask) {
3134 if (zone_idx(zone) > ZONE_NORMAL)
3135 continue;
3136
3137
3138 pgdat = zone->zone_pgdat;
3139 if (allow_direct_reclaim(pgdat))
3140 goto out;
3141 break;
3142 }
3143
3144
3145 if (!pgdat)
3146 goto out;
3147
3148
3149 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159 if (!(gfp_mask & __GFP_FS)) {
3160 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3161 allow_direct_reclaim(pgdat), HZ);
3162
3163 goto check_pending;
3164 }
3165
3166
3167 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3168 allow_direct_reclaim(pgdat));
3169
3170check_pending:
3171 if (fatal_signal_pending(current))
3172 return true;
3173
3174out:
3175 return false;
3176}
3177
3178unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3179 gfp_t gfp_mask, nodemask_t *nodemask)
3180{
3181 unsigned long nr_reclaimed;
3182 struct scan_control sc = {
3183 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3184 .gfp_mask = current_gfp_context(gfp_mask),
3185 .reclaim_idx = gfp_zone(gfp_mask),
3186 .order = order,
3187 .nodemask = nodemask,
3188 .priority = DEF_PRIORITY,
3189 .may_writepage = !laptop_mode,
3190 .may_unmap = 1,
3191 .may_swap = 1,
3192 };
3193
3194
3195
3196
3197
3198 BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3199 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3200 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3201
3202
3203
3204
3205
3206
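	/*
	 * Do not enter reclaim if a fatal signal was delivered while
	 * throttled. 1 is returned so that the page allocator does not OOM
	 * kill at this point.
	 */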
3207 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3208 return 1;
3209
3210 set_task_reclaim_state(current, &sc.reclaim_state);
3211 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
3212
3213 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3214
3215 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3216 set_task_reclaim_state(current, NULL);
3217
3218 return nr_reclaimed;
3219}
3220
3221#ifdef CONFIG_MEMCG
3222
3223
3224unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3225 gfp_t gfp_mask, bool noswap,
3226 pg_data_t *pgdat,
3227 unsigned long *nr_scanned)
3228{
3229 struct scan_control sc = {
3230 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3231 .target_mem_cgroup = memcg,
3232 .may_writepage = !laptop_mode,
3233 .may_unmap = 1,
3234 .reclaim_idx = MAX_NR_ZONES - 1,
3235 .may_swap = !noswap,
3236 };
3237 unsigned long lru_pages;
3238
3239 WARN_ON_ONCE(!current->reclaim_state);
3240
3241 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3242 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3243
3244 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3245 sc.gfp_mask);
3246
3247
3248
3249
3250
3251
3252
3253
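	/*
	 * NOTE: Although we can get the priority field, using it
	 * here is not a good idea, since it limits the pages we can scan.
	 * If we don't reclaim here, the shrink_node from balance_pgdat
	 * will pick up pages from other mem cgroups as well. We hack
	 * the priority and make it zero.
	 */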
3254 shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
3255
3256 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3257
3258 *nr_scanned = sc.nr_scanned;
3259
3260 return sc.nr_reclaimed;
3261}
3262
3263unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3264 unsigned long nr_pages,
3265 gfp_t gfp_mask,
3266 bool may_swap)
3267{
3268 struct zonelist *zonelist;
3269 unsigned long nr_reclaimed;
3270 unsigned long pflags;
3271 int nid;
3272 unsigned int noreclaim_flag;
3273 struct scan_control sc = {
3274 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3275 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3276 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3277 .reclaim_idx = MAX_NR_ZONES - 1,
3278 .target_mem_cgroup = memcg,
3279 .priority = DEF_PRIORITY,
3280 .may_writepage = !laptop_mode,
3281 .may_unmap = 1,
3282 .may_swap = may_swap,
3283 };
3284
3285 set_task_reclaim_state(current, &sc.reclaim_state);
3286
3287
3288
3289
3290
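	/*
	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
	 * take care of from where we get pages. So the node where we start
	 * the scan does not need to be the current node.
	 */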
3291 nid = mem_cgroup_select_victim_node(memcg);
3292
3293 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
3294
3295 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
3296
3297 psi_memstall_enter(&pflags);
3298 noreclaim_flag = memalloc_noreclaim_save();
3299
3300 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3301
3302 memalloc_noreclaim_restore(noreclaim_flag);
3303 psi_memstall_leave(&pflags);
3304
3305 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3306 set_task_reclaim_state(current, NULL);
3307
3308 return nr_reclaimed;
3309}
3310#endif
3311
3312static void age_active_anon(struct pglist_data *pgdat,
3313 struct scan_control *sc)
3314{
3315 struct mem_cgroup *memcg;
3316
3317 if (!total_swap_pages)
3318 return;
3319
3320 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3321 do {
3322 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
3323
3324 if (inactive_list_is_low(lruvec, false, sc, true))
3325 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3326 sc, LRU_ACTIVE_ANON);
3327
3328 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3329 } while (memcg);
3330}
3331
3332static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
3333{
3334 int i;
3335 struct zone *zone;
3336
3337
3338
3339
3340
3341
3342
3343
3344 for (i = classzone_idx; i >= 0; i--) {
3345 zone = pgdat->node_zones + i;
3346 if (!managed_zone(zone))
3347 continue;
3348
3349 if (zone->watermark_boost)
3350 return true;
3351 }
3352
3353 return false;
3354}
3355
3356
3357
3358
3359
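/*
 * Returns true if there is an eligible zone balanced for the request order
 * and classzone_idx
 */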
3360static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3361{
3362 int i;
3363 unsigned long mark = -1;
3364 struct zone *zone;
3365
3366
3367
3368
3369
3370 for (i = 0; i <= classzone_idx; i++) {
3371 zone = pgdat->node_zones + i;
3372
3373 if (!managed_zone(zone))
3374 continue;
3375
3376 mark = high_wmark_pages(zone);
3377 if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
3378 return true;
3379 }
3380
3381
3382
3383
3384
3385
3386 if (mark == -1)
3387 return true;
3388
3389 return false;
3390}
3391
3392
3393static void clear_pgdat_congested(pg_data_t *pgdat)
3394{
3395 clear_bit(PGDAT_CONGESTED, &pgdat->flags);
3396 clear_bit(PGDAT_DIRTY, &pgdat->flags);
3397 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3398}
3399
3400
3401
3402
3403
3404
3405
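/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */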
3406static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3407{
	/*
	 * The throttled processes are normally woken up in balance_pgdat() as
	 * soon as allow_direct_reclaim() is true. But there is a potential
	 * race between when kswapd checks the watermarks and a process gets
	 * throttled. There is also a potential race if processes get
	 * throttled, kswapd wakes, a large process exits thereby balancing the
	 * zones, which causes kswapd to exit balance_pgdat() before reaching
	 * the wake up checks. If kswapd is going to sleep, no process should
	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
	 * the wake up is premature, processes will wake kswapd and get
	 * throttled again. The difference from wake ups in balance_pgdat() is
	 * that here we are under prepare_to_wait().
	 */
3421 if (waitqueue_active(&pgdat->pfmemalloc_wait))
3422 wake_up_all(&pgdat->pfmemalloc_wait);
3423
3424
3425 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3426 return true;
3427
3428 if (pgdat_balanced(pgdat, order, classzone_idx)) {
3429 clear_pgdat_congested(pgdat);
3430 return true;
3431 }
3432
3433 return false;
3434}
3435
3436
3437
3438
3439
3440
3441
3442
3443
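/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim. This is used to determine if the scanning priority needs to be
 * raised.
 */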
3444static bool kswapd_shrink_node(pg_data_t *pgdat,
3445 struct scan_control *sc)
3446{
3447 struct zone *zone;
3448 int z;
3449
3450
3451 sc->nr_to_reclaim = 0;
3452 for (z = 0; z <= sc->reclaim_idx; z++) {
3453 zone = pgdat->node_zones + z;
3454 if (!managed_zone(zone))
3455 continue;
3456
3457 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3458 }
3459
3460
3461
3462
3463
3464 shrink_node(pgdat, sc);
3465
3466
3467
3468
3469
3470
3471
3472
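	/*
	 * Fragmentation may mean that the system cannot be rebalanced for
	 * high-order allocations. If twice the allocation size has been
	 * reclaimed then recheck watermarks only at order-0 to prevent
	 * excessive reclaim. Assume that a process that requested a
	 * high-order allocation can direct reclaim/compact.
	 */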
3473 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3474 sc->order = 0;
3475
3476 return sc->nr_scanned >= sc->nr_to_reclaim;
3477}
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
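/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * or lower is eligible for reclaim until at least one usable zone is
 * balanced.
 */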
3492static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3493{
3494 int i;
3495 unsigned long nr_soft_reclaimed;
3496 unsigned long nr_soft_scanned;
3497 unsigned long pflags;
3498 unsigned long nr_boost_reclaim;
3499 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
3500 bool boosted;
3501 struct zone *zone;
3502 struct scan_control sc = {
3503 .gfp_mask = GFP_KERNEL,
3504 .order = order,
3505 .may_unmap = 1,
3506 };
3507
3508 set_task_reclaim_state(current, &sc.reclaim_state);
3509 psi_memstall_enter(&pflags);
3510 __fs_reclaim_acquire();
3511
3512 count_vm_event(PAGEOUTRUN);
3513
3514
3515
3516
3517
3518
3519 nr_boost_reclaim = 0;
3520 for (i = 0; i <= classzone_idx; i++) {
3521 zone = pgdat->node_zones + i;
3522 if (!managed_zone(zone))
3523 continue;
3524
3525 nr_boost_reclaim += zone->watermark_boost;
3526 zone_boosts[i] = zone->watermark_boost;
3527 }
3528 boosted = nr_boost_reclaim;
3529
3530restart:
3531 sc.priority = DEF_PRIORITY;
3532 do {
3533 unsigned long nr_reclaimed = sc.nr_reclaimed;
3534 bool raise_priority = true;
3535 bool balanced;
3536 bool ret;
3537
3538 sc.reclaim_idx = classzone_idx;
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550 if (buffer_heads_over_limit) {
3551 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3552 zone = pgdat->node_zones + i;
3553 if (!managed_zone(zone))
3554 continue;
3555
3556 sc.reclaim_idx = i;
3557 break;
3558 }
3559 }
3560
3561
3562
3563
3564
3565
3566
3567
3568 balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
3569 if (!balanced && nr_boost_reclaim) {
3570 nr_boost_reclaim = 0;
3571 goto restart;
3572 }
3573
3574
3575
3576
3577
3578
3579 if (!nr_boost_reclaim && balanced)
3580 goto out;
3581
3582
3583 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
3584 raise_priority = false;
3585
3586
3587
3588
3589
3590
3591
3592 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
3593 sc.may_swap = !nr_boost_reclaim;
3594
3595
3596
3597
3598
3599
3600
3601 age_active_anon(pgdat, &sc);
3602
3603
3604
3605
3606
3607 if (sc.priority < DEF_PRIORITY - 2)
3608 sc.may_writepage = 1;
3609
3610
3611 sc.nr_scanned = 0;
3612 nr_soft_scanned = 0;
3613 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3614 sc.gfp_mask, &nr_soft_scanned);
3615 sc.nr_reclaimed += nr_soft_reclaimed;
3616
3617
3618
3619
3620
3621
3622 if (kswapd_shrink_node(pgdat, &sc))
3623 raise_priority = false;
3624
3625
3626
3627
3628
3629
3630 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3631 allow_direct_reclaim(pgdat))
3632 wake_up_all(&pgdat->pfmemalloc_wait);
3633
3634
3635 __fs_reclaim_release();
3636 ret = try_to_freeze();
3637 __fs_reclaim_acquire();
3638 if (ret || kthread_should_stop())
3639 break;
3640
3641
3642
3643
3644
3645 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3646 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
3647
3648
3649
3650
3651
3652
3653 if (nr_boost_reclaim && !nr_reclaimed)
3654 break;
3655
3656 if (raise_priority || !nr_reclaimed)
3657 sc.priority--;
3658 } while (sc.priority >= 1);
3659
3660 if (!sc.nr_reclaimed)
3661 pgdat->kswapd_failures++;
3662
3663out:
3664
3665 if (boosted) {
3666 unsigned long flags;
3667
3668 for (i = 0; i <= classzone_idx; i++) {
3669 if (!zone_boosts[i])
3670 continue;
3671
3672
3673 zone = pgdat->node_zones + i;
3674 spin_lock_irqsave(&zone->lock, flags);
3675 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
3676 spin_unlock_irqrestore(&zone->lock, flags);
3677 }
3678
3679
3680
3681
3682
3683 wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
3684 }
3685
3686 snapshot_refaults(NULL, pgdat);
3687 __fs_reclaim_release();
3688 psi_memstall_leave(&pflags);
3689 set_task_reclaim_state(current, NULL);
3690
3691
3692
3693
3694
3695
3696
3697 return sc.order;
3698}
3699
3700
3701
3702
3703
3704
3705
3706
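/*
 * pgdat->kswapd_classzone_idx is the highest zone index that a recent
 * wakeup asked kswapd to reclaim for. MAX_NR_ZONES means "no pending
 * request", in which case fall back to the classzone_idx of the previous
 * kswapd reclaim cycle.
 */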
3707static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
3708 enum zone_type prev_classzone_idx)
3709{
3710 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3711 return prev_classzone_idx;
3712 return pgdat->kswapd_classzone_idx;
3713}
3714
3715static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3716 unsigned int classzone_idx)
3717{
3718 long remaining = 0;
3719 DEFINE_WAIT(wait);
3720
3721 if (freezing(current) || kthread_should_stop())
3722 return;
3723
3724 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3725
3726
3727
3728
3729
3730
3731
3732
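	/*
	 * Try to sleep for a short interval. Note that kcompactd will only be
	 * woken if it is possible to sleep for a short interval. This is
	 * deliberate on the assumption that if reclaim cannot keep an
	 * eligible zone balanced then it is also unlikely that compaction
	 * will succeed.
	 */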
3733 if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3734
3735
3736
3737
3738
3739
3740 reset_isolation_suitable(pgdat);
3741
3742
3743
3744
3745
3746 wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
3747
3748 remaining = schedule_timeout(HZ/10);
3749
3750
3751
3752
3753
3754
3755 if (remaining) {
3756 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3757 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
3758 }
3759
3760 finish_wait(&pgdat->kswapd_wait, &wait);
3761 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3762 }
3763
3764
3765
3766
3767
3768 if (!remaining &&
3769 prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3770 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3781
3782 if (!kthread_should_stop())
3783 schedule();
3784
3785 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3786 } else {
3787 if (remaining)
3788 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3789 else
3790 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3791 }
3792 finish_wait(&pgdat->kswapd_wait, &wait);
3793}
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
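/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */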
3808static int kswapd(void *p)
3809{
3810 unsigned int alloc_order, reclaim_order;
3811 unsigned int classzone_idx = MAX_NR_ZONES - 1;
3812 pg_data_t *pgdat = (pg_data_t*)p;
3813 struct task_struct *tsk = current;
3814 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3815
3816 if (!cpumask_empty(cpumask))
3817 set_cpus_allowed_ptr(tsk, cpumask);
3818
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
3831 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3832 set_freezable();
3833
3834 pgdat->kswapd_order = 0;
3835 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3836 for ( ; ; ) {
3837 bool ret;
3838
3839 alloc_order = reclaim_order = pgdat->kswapd_order;
3840 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3841
3842kswapd_try_sleep:
3843 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3844 classzone_idx);
3845
3846
3847 alloc_order = reclaim_order = pgdat->kswapd_order;
3848 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3849 pgdat->kswapd_order = 0;
3850 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3851
3852 ret = try_to_freeze();
3853 if (kthread_should_stop())
3854 break;
3855
3856
3857
3858
3859
3860 if (ret)
3861 continue;
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3872 alloc_order);
3873 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3874 if (reclaim_order < alloc_order)
3875 goto kswapd_try_sleep;
3876 }
3877
3878 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3879
3880 return 0;
3881}
3882
3883
3884
3885
3886
3887
3888
3889
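/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the
 * zone's pgdat.  It will wake up kcompactd after reclaiming memory.  If
 * kswapd reclaim has failed or is not needed, still wake up kcompactd if
 * only compaction is needed.
 */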
3890void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3891 enum zone_type classzone_idx)
3892{
3893 pg_data_t *pgdat;
3894
3895 if (!managed_zone(zone))
3896 return;
3897
3898 if (!cpuset_zone_allowed(zone, gfp_flags))
3899 return;
3900 pgdat = zone->zone_pgdat;
3901
3902 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3903 pgdat->kswapd_classzone_idx = classzone_idx;
3904 else
3905 pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
3906 classzone_idx);
3907 pgdat->kswapd_order = max(pgdat->kswapd_order, order);
3908 if (!waitqueue_active(&pgdat->kswapd_wait))
3909 return;
3910
3911
3912 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3913 (pgdat_balanced(pgdat, order, classzone_idx) &&
3914 !pgdat_watermark_boosted(pgdat, classzone_idx))) {
3915
3916
3917
3918
3919
3920
3921
3922 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3923 wakeup_kcompactd(pgdat, order, classzone_idx);
3924 return;
3925 }
3926
3927 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
3928 gfp_flags);
3929 wake_up_interruptible(&pgdat->kswapd_wait);
3930}
3931
3932#ifdef CONFIG_HIBERNATION
3933
3934
3935
3936
3937
3938
3939
3940
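/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number
 * of freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */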
3941unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3942{
3943 struct scan_control sc = {
3944 .nr_to_reclaim = nr_to_reclaim,
3945 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3946 .reclaim_idx = MAX_NR_ZONES - 1,
3947 .priority = DEF_PRIORITY,
3948 .may_writepage = 1,
3949 .may_unmap = 1,
3950 .may_swap = 1,
3951 .hibernation_mode = 1,
3952 };
3953 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3954 unsigned long nr_reclaimed;
3955 unsigned int noreclaim_flag;
3956
3957 fs_reclaim_acquire(sc.gfp_mask);
3958 noreclaim_flag = memalloc_noreclaim_save();
3959 set_task_reclaim_state(current, &sc.reclaim_state);
3960
3961 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3962
3963 set_task_reclaim_state(current, NULL);
3964 memalloc_noreclaim_restore(noreclaim_flag);
3965 fs_reclaim_release(sc.gfp_mask);
3966
3967 return nr_reclaimed;
3968}
3969#endif
3970
3971
3972
3973
3974
3975static int kswapd_cpu_online(unsigned int cpu)
3976{
3977 int nid;
3978
3979 for_each_node_state(nid, N_MEMORY) {
3980 pg_data_t *pgdat = NODE_DATA(nid);
3981 const struct cpumask *mask;
3982
3983 mask = cpumask_of_node(pgdat->node_id);
3984
3985 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3986
3987 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3988 }
3989 return 0;
3990}
3991
3992
3993
3994
3995
3996int kswapd_run(int nid)
3997{
3998 pg_data_t *pgdat = NODE_DATA(nid);
3999 int ret = 0;
4000
4001 if (pgdat->kswapd)
4002 return 0;
4003
4004 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
4005 if (IS_ERR(pgdat->kswapd)) {
4006
4007 BUG_ON(system_state < SYSTEM_RUNNING);
4008 pr_err("Failed to start kswapd on node %d\n", nid);
4009 ret = PTR_ERR(pgdat->kswapd);
4010 pgdat->kswapd = NULL;
4011 }
4012 return ret;
4013}
4014
4015
4016
4017
4018
4019void kswapd_stop(int nid)
4020{
4021 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4022
4023 if (kswapd) {
4024 kthread_stop(kswapd);
4025 NODE_DATA(nid)->kswapd = NULL;
4026 }
4027}
4028
4029static int __init kswapd_init(void)
4030{
4031 int nid, ret;
4032
4033 swap_setup();
4034 for_each_node_state(nid, N_MEMORY)
4035 kswapd_run(nid);
4036 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
4037 "mm/vmscan:online", kswapd_cpu_online,
4038 NULL);
4039 WARN_ON(ret < 0);
4040 return 0;
4041}
4042
4043module_init(kswapd_init)
4044
4045#ifdef CONFIG_NUMA
4046
4047
4048
4049
4050
4051
4052int node_reclaim_mode __read_mostly;
4053
4054#define RECLAIM_OFF 0
4055#define RECLAIM_ZONE (1<<0)
4056#define RECLAIM_WRITE (1<<1)
4057#define RECLAIM_UNMAP (1<<2)
4058
4059
4060
4061
4062
4063
4064#define NODE_RECLAIM_PRIORITY 4
4065
4066
4067
4068
4069
4070int sysctl_min_unmapped_ratio = 1;
4071
4072
4073
4074
4075
4076int sysctl_min_slab_ratio = 5;
4077
4078static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4079{
4080 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4081 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4082 node_page_state(pgdat, NR_ACTIVE_FILE);
4083
4084
4085
4086
4087
4088
4089 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4090}
4091
4092
4093static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4094{
4095 unsigned long nr_pagecache_reclaimable;
4096 unsigned long delta = 0;
4097
4098
4099
4100
4101
4102
4103
4104 if (node_reclaim_mode & RECLAIM_UNMAP)
4105 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4106 else
4107 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4108
4109
4110 if (!(node_reclaim_mode & RECLAIM_WRITE))
4111 delta += node_page_state(pgdat, NR_FILE_DIRTY);
4112
4113
4114 if (unlikely(delta > nr_pagecache_reclaimable))
4115 delta = nr_pagecache_reclaimable;
4116
4117 return nr_pagecache_reclaimable - delta;
4118}
4119
4120
4121
4122
4123static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4124{
4125
4126 const unsigned long nr_pages = 1 << order;
4127 struct task_struct *p = current;
4128 unsigned int noreclaim_flag;
4129 struct scan_control sc = {
4130 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4131 .gfp_mask = current_gfp_context(gfp_mask),
4132 .order = order,
4133 .priority = NODE_RECLAIM_PRIORITY,
4134 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4135 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4136 .may_swap = 1,
4137 .reclaim_idx = gfp_zone(gfp_mask),
4138 };
4139
4140 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4141 sc.gfp_mask);
4142
4143 cond_resched();
4144 fs_reclaim_acquire(sc.gfp_mask);
4145
4146
4147
4148
4149
4150 noreclaim_flag = memalloc_noreclaim_save();
4151 p->flags |= PF_SWAPWRITE;
4152 set_task_reclaim_state(p, &sc.reclaim_state);
4153
4154 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4155
4156
4157
4158
4159 do {
4160 shrink_node(pgdat, &sc);
4161 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4162 }
4163
4164 set_task_reclaim_state(p, NULL);
4165 current->flags &= ~PF_SWAPWRITE;
4166 memalloc_noreclaim_restore(noreclaim_flag);
4167 fs_reclaim_release(sc.gfp_mask);
4168
4169 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4170
4171 return sc.nr_reclaimed >= nr_pages;
4172}
4173
4174int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4175{
4176 int ret;
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4189 node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
4190 return NODE_RECLAIM_FULL;
4191
4192
4193
4194
4195 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4196 return NODE_RECLAIM_NOSCAN;
4197
4198
4199
4200
4201
4202
4203
4204 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4205 return NODE_RECLAIM_NOSCAN;
4206
4207 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4208 return NODE_RECLAIM_NOSCAN;
4209
4210 ret = __node_reclaim(pgdat, gfp_mask, order);
4211 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4212
4213 if (!ret)
4214 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4215
4216 return ret;
4217}
4218#endif
4219
/**
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
4232int page_evictable(struct page *page)
4233{
4234 int ret;
4235
4236
4237 rcu_read_lock();
4238 ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
4239 rcu_read_unlock();
4240 return ret;
4241}
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
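/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * appropriate zone lru list
 * @pvec: pagevec with lru pages to check
 *
 * Checks pages for evictability, and if an evictable page is on the
 * unevictable lru list, moves it to the appropriate evictable lru list.
 * This function should be only used for lru pages.
 */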
4252void check_move_unevictable_pages(struct pagevec *pvec)
4253{
4254 struct lruvec *lruvec;
4255 struct pglist_data *pgdat = NULL;
4256 int pgscanned = 0;
4257 int pgrescued = 0;
4258 int i;
4259
4260 for (i = 0; i < pvec->nr; i++) {
4261 struct page *page = pvec->pages[i];
4262 struct pglist_data *pagepgdat = page_pgdat(page);
4263
4264 pgscanned++;
4265 if (pagepgdat != pgdat) {
4266 if (pgdat)
4267 spin_unlock_irq(&pgdat->lru_lock);
4268 pgdat = pagepgdat;
4269 spin_lock_irq(&pgdat->lru_lock);
4270 }
4271 lruvec = mem_cgroup_page_lruvec(page, pgdat);
4272
4273 if (!PageLRU(page) || !PageUnevictable(page))
4274 continue;
4275
4276 if (page_evictable(page)) {
4277 enum lru_list lru = page_lru_base_type(page);
4278
4279 VM_BUG_ON_PAGE(PageActive(page), page);
4280 ClearPageUnevictable(page);
4281 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
4282 add_page_to_lru_list(page, lruvec, lru);
4283 pgrescued++;
4284 }
4285 }
4286
4287 if (pgdat) {
4288 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4289 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4290 spin_unlock_irq(&pgdat->lru_lock);
4291 }
4292}
4293EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4294