#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and is the primary
	 * target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low,
	 * unless we threaten to OOM. If any cgroup is skipped due to
	 * memory.low and nothing is reclaimed, go back for memory.low.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;
};

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
/*
 * The total number of pages which are beyond the high watermark within all
 * zones.
 */
unsigned long vm_total_pages;

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG_KMEM
/*
 * Memcg-aware shrinkers are assigned an id which indexes the per-memcg
 * bitmaps of potentially non-empty shrinkers (the memcg shrinker maps).
 * The id is allocated before the shrinker is fully registered, so the idr
 * slot holds the SHRINKER_REGISTERING placeholder until
 * register_shrinker_prepared() installs the real shrinker pointer;
 * shrink_slab_memcg() skips such entries.
 */
#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)

static DEFINE_IDR(shrinker_idr);
static int shrinker_nr_max;

static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	down_write(&shrinker_rwsem);
	/* This may call shrinker, so it must use down_read_trylock() */
	id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (memcg_expand_shrinker_maps(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}

		shrinker_nr_max = id + 1;
	}
	shrinker->id = id;
	ret = 0;
unlock:
	up_write(&shrinker_rwsem);
	return ret;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	down_write(&shrinker_rwsem);
	idr_remove(&shrinker_idr, id);
	up_write(&shrinker_rwsem);
}
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	return 0;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}
#endif

#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}

/**
 * sane_reclaim - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is available.
 */
static bool sane_reclaim(struct scan_control *sc)
{
	struct mem_cgroup *memcg = sc->target_mem_cgroup;

	if (!memcg)
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}

static void set_memcg_congestion(pg_data_t *pgdat,
				struct mem_cgroup *memcg,
				bool congested)
{
	struct mem_cgroup_per_node *mn;

	if (!memcg)
		return;

	mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
	WRITE_ONCE(mn->congested, congested);
}

static bool memcg_congested(pg_data_t *pgdat,
			struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *mn;

	mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
	return READ_ONCE(mn->congested);
}
#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}

static bool sane_reclaim(struct scan_control *sc)
{
	return true;
}

static inline void set_memcg_congestion(struct pglist_data *pgdat,
				struct mem_cgroup *memcg, bool congested)
{
}

static inline bool memcg_congested(struct pglist_data *pgdat,
			struct mem_cgroup *memcg)
{
	return false;
}
#endif

/*
 * This misses isolated pages which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

	return nr;
}

/**
 * lruvec_lru_size - Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 */
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
	unsigned long lru_size;
	int zid;

	if (!mem_cgroup_disabled())
		lru_size = mem_cgroup_get_lru_size(lruvec, lru);
	else
		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);

	/* Subtract the pages sitting in zones above @zone_idx */
	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
		unsigned long size;

		if (!managed_zone(zone))
			continue;

		if (!mem_cgroup_disabled())
			size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
		else
			size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
				       NR_ZONE_LRU_BASE + lru);
		lru_size -= min(size, lru_size);
	}

	return lru_size;
}

/*
 * Add a shrinker callback to be called from the vm.
 */
int prealloc_shrinker(struct shrinker *shrinker)
{
	size_t size = sizeof(*shrinker->nr_deferred);

	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		if (prealloc_memcg_shrinker(shrinker))
			goto free_deferred;
	}

	return 0;

free_deferred:
	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
	return -ENOMEM;
}

void free_prealloced_shrinker(struct shrinker *shrinker)
{
	if (!shrinker->nr_deferred)
		return;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}

void register_shrinker_prepared(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
#ifdef CONFIG_MEMCG_KMEM
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		idr_replace(&shrinker_idr, shrinker, shrinker->id);
#endif
	up_write(&shrinker_rwsem);
}

int register_shrinker(struct shrinker *shrinker)
{
	int err = prealloc_shrinker(shrinker);

	if (err)
		return err;
	register_shrinker_prepared(shrinker);
	return 0;
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove a registered shrinker.
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	if (!shrinker->nr_deferred)
		return;
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);

446#define SHRINK_BATCH 128
447
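/*
 * do_shrink_slab - run one shrinker on one node.
 *
 * Deferred work left over from previous passes (nr_deferred) is picked up,
 * a new delta proportional to the number of freeable objects and the
 * reclaim priority is added, and the total is fed to ->scan_objects() in
 * batch_size chunks.  Whatever could not be scanned is stashed back into
 * nr_deferred for the next pass.
 */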
448static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
449 struct shrinker *shrinker, int priority)
450{
451 unsigned long freed = 0;
452 unsigned long long delta;
453 long total_scan;
454 long freeable;
455 long nr;
456 long new_nr;
457 int nid = shrinkctl->nid;
458 long batch_size = shrinker->batch ? shrinker->batch
459 : SHRINK_BATCH;
460 long scanned = 0, next_deferred;
461
462 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
463 nid = 0;
464
465 freeable = shrinker->count_objects(shrinker, shrinkctl);
466 if (freeable == 0 || freeable == SHRINK_EMPTY)
467 return freeable;
468
469
470
471
472
473
474 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
475
476 total_scan = nr;
477 if (shrinker->seeks) {
478 delta = freeable >> priority;
479 delta *= 4;
480 do_div(delta, shrinker->seeks);
481 } else {
482
483
484
485
486
487 delta = freeable / 2;
488 }
489
490
491
492
493
494
495
496
497
498 delta = max_t(unsigned long long, delta, min(freeable, batch_size));
499
500 total_scan += delta;
501 if (total_scan < 0) {
502 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
503 shrinker->scan_objects, total_scan);
504 total_scan = freeable;
505 next_deferred = nr;
506 } else
507 next_deferred = total_scan;
508
509
510
511
512
513
514
515
516
517
518
519
520
521 if (delta < freeable / 4)
522 total_scan = min(total_scan, freeable / 2);
523
524
525
526
527
528
529 if (total_scan > freeable * 2)
530 total_scan = freeable * 2;
531
532 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
533 freeable, delta, total_scan, priority);
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550 while (total_scan >= batch_size ||
551 total_scan >= freeable) {
552 unsigned long ret;
553 unsigned long nr_to_scan = min(batch_size, total_scan);
554
555 shrinkctl->nr_to_scan = nr_to_scan;
556 shrinkctl->nr_scanned = nr_to_scan;
557 ret = shrinker->scan_objects(shrinker, shrinkctl);
558 if (ret == SHRINK_STOP)
559 break;
560 freed += ret;
561
562 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
563 total_scan -= shrinkctl->nr_scanned;
564 scanned += shrinkctl->nr_scanned;
565
566 cond_resched();
567 }
568
569 if (next_deferred >= scanned)
570 next_deferred -= scanned;
571 else
572 next_deferred = 0;
573
574
575
576
577
578 if (next_deferred > 0)
579 new_nr = atomic_long_add_return(next_deferred,
580 &shrinker->nr_deferred[nid]);
581 else
582 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
583
584 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
585 return freed;
586}
587
588#ifdef CONFIG_MEMCG_KMEM
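/*
 * Walk the per-memcg shrinker bitmap for @nid and invoke every shrinker
 * whose bit is set.  Bits are cleared lazily when the shrinker turns out
 * to be unregistered or reports SHRINK_EMPTY.
 */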
589static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
590 struct mem_cgroup *memcg, int priority)
591{
592 struct memcg_shrinker_map *map;
593 unsigned long ret, freed = 0;
594 int i;
595
596 if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
597 return 0;
598
599 if (!down_read_trylock(&shrinker_rwsem))
600 return 0;
601
602 map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
603 true);
604 if (unlikely(!map))
605 goto unlock;
606
607 for_each_set_bit(i, map->map, shrinker_nr_max) {
608 struct shrink_control sc = {
609 .gfp_mask = gfp_mask,
610 .nid = nid,
611 .memcg = memcg,
612 };
613 struct shrinker *shrinker;
614
615 shrinker = idr_find(&shrinker_idr, i);
616 if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
617 if (!shrinker)
618 clear_bit(i, map->map);
619 continue;
620 }
621
622 ret = do_shrink_slab(&sc, shrinker, priority);
623 if (ret == SHRINK_EMPTY) {
624 clear_bit(i, map->map);
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640 smp_mb__after_atomic();
641 ret = do_shrink_slab(&sc, shrinker, priority);
642 if (ret == SHRINK_EMPTY)
643 ret = 0;
644 else
645 memcg_set_shrinker_bit(memcg, nid, i);
646 }
647 freed += ret;
648
649 if (rwsem_is_contended(&shrinker_rwsem)) {
650 freed = freed ? : 1;
651 break;
652 }
653 }
654unlock:
655 up_read(&shrinker_rwsem);
656 return freed;
657}
658#else
659static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
660 struct mem_cgroup *memcg, int priority)
661{
662 return 0;
663}
664#endif
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
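/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.  @priority is used
 * to scale the amount of work: the lower the priority, the larger the
 * proportion of freeable objects that gets scanned.
 *
 * Returns the number of reclaimed slab objects.
 */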
686static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
687 struct mem_cgroup *memcg,
688 int priority)
689{
690 unsigned long ret, freed = 0;
691 struct shrinker *shrinker;
692
693 if (!mem_cgroup_is_root(memcg))
694 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
695
696 if (!down_read_trylock(&shrinker_rwsem))
697 goto out;
698
699 list_for_each_entry(shrinker, &shrinker_list, list) {
700 struct shrink_control sc = {
701 .gfp_mask = gfp_mask,
702 .nid = nid,
703 .memcg = memcg,
704 };
705
706 ret = do_shrink_slab(&sc, shrinker, priority);
707 if (ret == SHRINK_EMPTY)
708 ret = 0;
709 freed += ret;
710
711
712
713
714
715 if (rwsem_is_contended(&shrinker_rwsem)) {
716 freed = freed ? : 1;
717 break;
718 }
719 }
720
721 up_read(&shrinker_rwsem);
722out:
723 cond_resched();
724 return freed;
725}
726
void drop_slab_node(int nid)
{
	unsigned long freed;

	do {
		struct mem_cgroup *memcg = NULL;

		freed = 0;
		memcg = mem_cgroup_iter(NULL, NULL, NULL);
		do {
			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
	} while (freed > 10);
}

void drop_slab(void)
{
	int nid;

	for_each_online_node(nid)
		drop_slab_node(nid);
}
749
750static inline int is_page_cache_freeable(struct page *page)
751{
752
753
754
755
756
757 int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
758 HPAGE_PMD_NR : 1;
759 return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
760}
761
762static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
763{
764 if (current->flags & PF_SWAPWRITE)
765 return 1;
766 if (!inode_write_congested(inode))
767 return 1;
768 if (inode_to_bdi(inode) == current->backing_dev_info)
769 return 1;
770 return 0;
771}
772
773
774
775
776
777
778
779
780
781
782
783
784
785static void handle_write_error(struct address_space *mapping,
786 struct page *page, int error)
787{
788 lock_page(page);
789 if (page_mapping(page) == mapping)
790 mapping_set_error(mapping, error);
791 unlock_page(page);
792}
793
/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
805
806
807
808
809
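/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */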
810static pageout_t pageout(struct page *page, struct address_space *mapping,
811 struct scan_control *sc)
812{
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829 if (!is_page_cache_freeable(page))
830 return PAGE_KEEP;
831 if (!mapping) {
832
833
834
835
836 if (page_has_private(page)) {
837 if (try_to_free_buffers(page)) {
838 ClearPageDirty(page);
839 pr_info("%s: orphaned page\n", __func__);
840 return PAGE_CLEAN;
841 }
842 }
843 return PAGE_KEEP;
844 }
845 if (mapping->a_ops->writepage == NULL)
846 return PAGE_ACTIVATE;
847 if (!may_write_to_inode(mapping->host, sc))
848 return PAGE_KEEP;
849
850 if (clear_page_dirty_for_io(page)) {
851 int res;
852 struct writeback_control wbc = {
853 .sync_mode = WB_SYNC_NONE,
854 .nr_to_write = SWAP_CLUSTER_MAX,
855 .range_start = 0,
856 .range_end = LLONG_MAX,
857 .for_reclaim = 1,
858 };
859
860 SetPageReclaim(page);
861 res = mapping->a_ops->writepage(page, &wbc);
862 if (res < 0)
863 handle_write_error(mapping, page, res);
864 if (res == AOP_WRITEPAGE_ACTIVATE) {
865 ClearPageReclaim(page);
866 return PAGE_ACTIVATE;
867 }
868
869 if (!PageWriteback(page)) {
870
871 ClearPageReclaim(page);
872 }
873 trace_mm_vmscan_writepage(page);
874 inc_node_page_state(page, NR_VMSCAN_WRITE);
875 return PAGE_SUCCESS;
876 }
877
878 return PAGE_CLEAN;
879}
880
881
882
883
884
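/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */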
885static int __remove_mapping(struct address_space *mapping, struct page *page,
886 bool reclaimed)
887{
888 unsigned long flags;
889 int refcount;
890
891 BUG_ON(!PageLocked(page));
892 BUG_ON(mapping != page_mapping(page));
893
894 xa_lock_irqsave(&mapping->i_pages, flags);
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920 if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
921 refcount = 1 + HPAGE_PMD_NR;
922 else
923 refcount = 2;
924 if (!page_ref_freeze(page, refcount))
925 goto cannot_free;
926
927 if (unlikely(PageDirty(page))) {
928 page_ref_unfreeze(page, refcount);
929 goto cannot_free;
930 }
931
932 if (PageSwapCache(page)) {
933 swp_entry_t swap = { .val = page_private(page) };
934 mem_cgroup_swapout(page, swap);
935 __delete_from_swap_cache(page, swap);
936 xa_unlock_irqrestore(&mapping->i_pages, flags);
937 put_swap_page(page, swap);
938 } else {
939 void (*freepage)(struct page *);
940 void *shadow = NULL;
941
942 freepage = mapping->a_ops->freepage;
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959 if (reclaimed && page_is_file_cache(page) &&
960 !mapping_exiting(mapping) && !dax_mapping(mapping))
961 shadow = workingset_eviction(mapping, page);
962 __delete_from_page_cache(page, shadow);
963 xa_unlock_irqrestore(&mapping->i_pages, flags);
964
965 if (freepage != NULL)
966 freepage(page);
967 }
968
969 return 1;
970
971cannot_free:
972 xa_unlock_irqrestore(&mapping->i_pages, flags);
973 return 0;
974}
975
976
977
978
979
980
981
982int remove_mapping(struct address_space *mapping, struct page *page)
983{
984 if (__remove_mapping(mapping, page, false)) {
985
986
987
988
989
990 page_ref_unfreeze(page, 1);
991 return 1;
992 }
993 return 0;
994}
995
996
997
998
999
1000
1001
1002
1003
1004
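/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * The page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */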
1005void putback_lru_page(struct page *page)
1006{
1007 lru_cache_add(page);
1008 put_page(page);
1009}
1010
1011enum page_references {
1012 PAGEREF_RECLAIM,
1013 PAGEREF_RECLAIM_CLEAN,
1014 PAGEREF_KEEP,
1015 PAGEREF_ACTIVATE,
1016};
1017
1018static enum page_references page_check_references(struct page *page,
1019 struct scan_control *sc)
1020{
1021 int referenced_ptes, referenced_page;
1022 unsigned long vm_flags;
1023
1024 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
1025 &vm_flags);
1026 referenced_page = TestClearPageReferenced(page);
1027
1028
1029
1030
1031
1032 if (vm_flags & VM_LOCKED)
1033 return PAGEREF_RECLAIM;
1034
1035 if (referenced_ptes) {
1036 if (PageSwapBacked(page))
1037 return PAGEREF_ACTIVATE;
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052 SetPageReferenced(page);
1053
1054 if (referenced_page || referenced_ptes > 1)
1055 return PAGEREF_ACTIVATE;
1056
1057
1058
1059
1060 if (vm_flags & VM_EXEC)
1061 return PAGEREF_ACTIVATE;
1062
1063 return PAGEREF_KEEP;
1064 }
1065
1066
1067 if (referenced_page && !PageSwapBacked(page))
1068 return PAGEREF_RECLAIM_CLEAN;
1069
1070 return PAGEREF_RECLAIM;
1071}
1072
1073
1074static void page_check_dirty_writeback(struct page *page,
1075 bool *dirty, bool *writeback)
1076{
1077 struct address_space *mapping;
1078
1079
1080
1081
1082
1083 if (!page_is_file_cache(page) ||
1084 (PageAnon(page) && !PageSwapBacked(page))) {
1085 *dirty = false;
1086 *writeback = false;
1087 return;
1088 }
1089
1090
1091 *dirty = PageDirty(page);
1092 *writeback = PageWriteback(page);
1093
1094
1095 if (!page_has_private(page))
1096 return;
1097
1098 mapping = page_mapping(page);
1099 if (mapping && mapping->a_ops->is_dirty_writeback)
1100 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1101}
1102
1103
1104
1105
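/*
 * shrink_page_list() returns the number of reclaimed pages
 */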
1106static unsigned long shrink_page_list(struct list_head *page_list,
1107 struct pglist_data *pgdat,
1108 struct scan_control *sc,
1109 enum ttu_flags ttu_flags,
1110 struct reclaim_stat *stat,
1111 bool force_reclaim)
1112{
1113 LIST_HEAD(ret_pages);
1114 LIST_HEAD(free_pages);
1115 int pgactivate = 0;
1116 unsigned nr_unqueued_dirty = 0;
1117 unsigned nr_dirty = 0;
1118 unsigned nr_congested = 0;
1119 unsigned nr_reclaimed = 0;
1120 unsigned nr_writeback = 0;
1121 unsigned nr_immediate = 0;
1122 unsigned nr_ref_keep = 0;
1123 unsigned nr_unmap_fail = 0;
1124
1125 cond_resched();
1126
1127 while (!list_empty(page_list)) {
1128 struct address_space *mapping;
1129 struct page *page;
1130 int may_enter_fs;
1131 enum page_references references = PAGEREF_RECLAIM_CLEAN;
1132 bool dirty, writeback;
1133
1134 cond_resched();
1135
1136 page = lru_to_page(page_list);
1137 list_del(&page->lru);
1138
1139 if (!trylock_page(page))
1140 goto keep;
1141
1142 VM_BUG_ON_PAGE(PageActive(page), page);
1143
1144 sc->nr_scanned++;
1145
1146 if (unlikely(!page_evictable(page)))
1147 goto activate_locked;
1148
1149 if (!sc->may_unmap && page_mapped(page))
1150 goto keep_locked;
1151
1152
1153 if ((page_mapped(page) || PageSwapCache(page)) &&
1154 !(PageAnon(page) && !PageSwapBacked(page)))
1155 sc->nr_scanned++;
1156
1157 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1158 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1159
1160
1161
1162
1163
1164
1165
1166 page_check_dirty_writeback(page, &dirty, &writeback);
1167 if (dirty || writeback)
1168 nr_dirty++;
1169
1170 if (dirty && !writeback)
1171 nr_unqueued_dirty++;
1172
1173
1174
1175
1176
1177
1178
1179 mapping = page_mapping(page);
1180 if (((dirty || writeback) && mapping &&
1181 inode_write_congested(mapping->host)) ||
1182 (writeback && PageReclaim(page)))
1183 nr_congested++;
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227 if (PageWriteback(page)) {
1228
1229 if (current_is_kswapd() &&
1230 PageReclaim(page) &&
1231 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1232 nr_immediate++;
1233 goto activate_locked;
1234
1235
1236 } else if (sane_reclaim(sc) ||
1237 !PageReclaim(page) || !may_enter_fs) {
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249 SetPageReclaim(page);
1250 nr_writeback++;
1251 goto activate_locked;
1252
1253
1254 } else {
1255 unlock_page(page);
1256 wait_on_page_writeback(page);
1257
1258 list_add_tail(&page->lru, page_list);
1259 continue;
1260 }
1261 }
1262
1263 if (!force_reclaim)
1264 references = page_check_references(page, sc);
1265
1266 switch (references) {
1267 case PAGEREF_ACTIVATE:
1268 goto activate_locked;
1269 case PAGEREF_KEEP:
1270 nr_ref_keep++;
1271 goto keep_locked;
1272 case PAGEREF_RECLAIM:
1273 case PAGEREF_RECLAIM_CLEAN:
1274 ;
1275 }
1276
1277
1278
1279
1280
1281
1282 if (PageAnon(page) && PageSwapBacked(page)) {
1283 if (!PageSwapCache(page)) {
1284 if (!(sc->gfp_mask & __GFP_IO))
1285 goto keep_locked;
1286 if (PageTransHuge(page)) {
1287
1288 if (!can_split_huge_page(page, NULL))
1289 goto activate_locked;
1290
1291
1292
1293
1294
1295 if (!compound_mapcount(page) &&
1296 split_huge_page_to_list(page,
1297 page_list))
1298 goto activate_locked;
1299 }
1300 if (!add_to_swap(page)) {
1301 if (!PageTransHuge(page))
1302 goto activate_locked;
1303
1304 if (split_huge_page_to_list(page,
1305 page_list))
1306 goto activate_locked;
1307#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1308 count_vm_event(THP_SWPOUT_FALLBACK);
1309#endif
1310 if (!add_to_swap(page))
1311 goto activate_locked;
1312 }
1313
1314 may_enter_fs = 1;
1315
1316
1317 mapping = page_mapping(page);
1318 }
1319 } else if (unlikely(PageTransHuge(page))) {
1320
1321 if (split_huge_page_to_list(page, page_list))
1322 goto keep_locked;
1323 }
1324
1325
1326
1327
1328
1329 if (page_mapped(page)) {
1330 enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1331
1332 if (unlikely(PageTransHuge(page)))
1333 flags |= TTU_SPLIT_HUGE_PMD;
1334 if (!try_to_unmap(page, flags)) {
1335 nr_unmap_fail++;
1336 goto activate_locked;
1337 }
1338 }
1339
1340 if (PageDirty(page)) {
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351 if (page_is_file_cache(page) &&
1352 (!current_is_kswapd() || !PageReclaim(page) ||
1353 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1354
1355
1356
1357
1358
1359
1360 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1361 SetPageReclaim(page);
1362
1363 goto activate_locked;
1364 }
1365
1366 if (references == PAGEREF_RECLAIM_CLEAN)
1367 goto keep_locked;
1368 if (!may_enter_fs)
1369 goto keep_locked;
1370 if (!sc->may_writepage)
1371 goto keep_locked;
1372
1373
1374
1375
1376
1377
1378 try_to_unmap_flush_dirty();
1379 switch (pageout(page, mapping, sc)) {
1380 case PAGE_KEEP:
1381 goto keep_locked;
1382 case PAGE_ACTIVATE:
1383 goto activate_locked;
1384 case PAGE_SUCCESS:
1385 if (PageWriteback(page))
1386 goto keep;
1387 if (PageDirty(page))
1388 goto keep;
1389
1390
1391
1392
1393
1394 if (!trylock_page(page))
1395 goto keep;
1396 if (PageDirty(page) || PageWriteback(page))
1397 goto keep_locked;
1398 mapping = page_mapping(page);
1399 case PAGE_CLEAN:
1400 ;
1401 }
1402 }
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425 if (page_has_private(page)) {
1426 if (!try_to_release_page(page, sc->gfp_mask))
1427 goto activate_locked;
1428 if (!mapping && page_count(page) == 1) {
1429 unlock_page(page);
1430 if (put_page_testzero(page))
1431 goto free_it;
1432 else {
1433
1434
1435
1436
1437
1438
1439
1440 nr_reclaimed++;
1441 continue;
1442 }
1443 }
1444 }
1445
1446 if (PageAnon(page) && !PageSwapBacked(page)) {
1447
1448 if (!page_ref_freeze(page, 1))
1449 goto keep_locked;
1450 if (PageDirty(page)) {
1451 page_ref_unfreeze(page, 1);
1452 goto keep_locked;
1453 }
1454
1455 count_vm_event(PGLAZYFREED);
1456 count_memcg_page_event(page, PGLAZYFREED);
1457 } else if (!mapping || !__remove_mapping(mapping, page, true))
1458 goto keep_locked;
1459
1460
1461
1462
1463
1464
1465
1466 __ClearPageLocked(page);
1467free_it:
1468 nr_reclaimed++;
1469
1470
1471
1472
1473
1474 if (unlikely(PageTransHuge(page))) {
1475 mem_cgroup_uncharge(page);
1476 (*get_compound_page_dtor(page))(page);
1477 } else
1478 list_add(&page->lru, &free_pages);
1479 continue;
1480
1481activate_locked:
1482
1483 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1484 PageMlocked(page)))
1485 try_to_free_swap(page);
1486 VM_BUG_ON_PAGE(PageActive(page), page);
1487 if (!PageMlocked(page)) {
1488 SetPageActive(page);
1489 pgactivate++;
1490 count_memcg_page_event(page, PGACTIVATE);
1491 }
1492keep_locked:
1493 unlock_page(page);
1494keep:
1495 list_add(&page->lru, &ret_pages);
1496 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1497 }
1498
1499 mem_cgroup_uncharge_list(&free_pages);
1500 try_to_unmap_flush();
1501 free_unref_page_list(&free_pages);
1502
1503 list_splice(&ret_pages, page_list);
1504 count_vm_events(PGACTIVATE, pgactivate);
1505
1506 if (stat) {
1507 stat->nr_dirty = nr_dirty;
1508 stat->nr_congested = nr_congested;
1509 stat->nr_unqueued_dirty = nr_unqueued_dirty;
1510 stat->nr_writeback = nr_writeback;
1511 stat->nr_immediate = nr_immediate;
1512 stat->nr_activate = pgactivate;
1513 stat->nr_ref_keep = nr_ref_keep;
1514 stat->nr_unmap_fail = nr_unmap_fail;
1515 }
1516 return nr_reclaimed;
1517}
1518
1519unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1520 struct list_head *page_list)
1521{
1522 struct scan_control sc = {
1523 .gfp_mask = GFP_KERNEL,
1524 .priority = DEF_PRIORITY,
1525 .may_unmap = 1,
1526 };
1527 unsigned long ret;
1528 struct page *page, *next;
1529 LIST_HEAD(clean_pages);
1530
1531 list_for_each_entry_safe(page, next, page_list, lru) {
1532 if (page_is_file_cache(page) && !PageDirty(page) &&
1533 !__PageMovable(page)) {
1534 ClearPageActive(page);
1535 list_move(&page->lru, &clean_pages);
1536 }
1537 }
1538
1539 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1540 TTU_IGNORE_ACCESS, NULL, true);
1541 list_splice(&clean_pages, page_list);
1542 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
1543 return ret;
1544}
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
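/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes
 *
 * returns 0 on success, -ve errno on failure.
 */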
1556int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1557{
1558 int ret = -EINVAL;
1559
1560
1561 if (!PageLRU(page))
1562 return ret;
1563
1564
1565 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1566 return ret;
1567
1568 ret = -EBUSY;
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578 if (mode & ISOLATE_ASYNC_MIGRATE) {
1579
1580 if (PageWriteback(page))
1581 return ret;
1582
1583 if (PageDirty(page)) {
1584 struct address_space *mapping;
1585 bool migrate_dirty;
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596 if (!trylock_page(page))
1597 return ret;
1598
1599 mapping = page_mapping(page);
1600 migrate_dirty = !mapping || mapping->a_ops->migratepage;
1601 unlock_page(page);
1602 if (!migrate_dirty)
1603 return ret;
1604 }
1605 }
1606
1607 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1608 return ret;
1609
1610 if (likely(get_page_unless_zero(page))) {
1611
1612
1613
1614
1615
1616 ClearPageLRU(page);
1617 ret = 0;
1618 }
1619
1620 return ret;
1621}
1622
1623
1624
1625
1626
1627
1628static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1629 enum lru_list lru, unsigned long *nr_zone_taken)
1630{
1631 int zid;
1632
1633 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1634 if (!nr_zone_taken[zid])
1635 continue;
1636
1637 __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1638#ifdef CONFIG_MEMCG
1639 mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1640#endif
1641 }
1642
1643}
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
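/**
 * isolate_lru_pages - isolate pages from an (in)active LRU list
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session.
 * @mode:	One of the LRU isolation modes.
 * @lru:	LRU list id for isolating.
 *
 * Pages are taken off the tail of the list, so they come out in LRU order.
 * The caller must hold the LRU lock.
 *
 * returns how many pages were moved onto *@dst.
 */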
1665static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1666 struct lruvec *lruvec, struct list_head *dst,
1667 unsigned long *nr_scanned, struct scan_control *sc,
1668 isolate_mode_t mode, enum lru_list lru)
1669{
1670 struct list_head *src = &lruvec->lists[lru];
1671 unsigned long nr_taken = 0;
1672 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1673 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1674 unsigned long skipped = 0;
1675 unsigned long scan, total_scan, nr_pages;
1676 LIST_HEAD(pages_skipped);
1677
1678 scan = 0;
1679 for (total_scan = 0;
1680 scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
1681 total_scan++) {
1682 struct page *page;
1683
1684 page = lru_to_page(src);
1685 prefetchw_prev_lru_page(page, src, flags);
1686
1687 VM_BUG_ON_PAGE(!PageLRU(page), page);
1688
1689 if (page_zonenum(page) > sc->reclaim_idx) {
1690 list_move(&page->lru, &pages_skipped);
1691 nr_skipped[page_zonenum(page)]++;
1692 continue;
1693 }
1694
1695
1696
1697
1698
1699
1700
1701 scan++;
1702 switch (__isolate_lru_page(page, mode)) {
1703 case 0:
1704 nr_pages = hpage_nr_pages(page);
1705 nr_taken += nr_pages;
1706 nr_zone_taken[page_zonenum(page)] += nr_pages;
1707 list_move(&page->lru, dst);
1708 break;
1709
1710 case -EBUSY:
1711
1712 list_move(&page->lru, src);
1713 continue;
1714
1715 default:
1716 BUG();
1717 }
1718 }
1719
1720
1721
1722
1723
1724
1725
1726
1727 if (!list_empty(&pages_skipped)) {
1728 int zid;
1729
1730 list_splice(&pages_skipped, src);
1731 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1732 if (!nr_skipped[zid])
1733 continue;
1734
1735 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1736 skipped += nr_skipped[zid];
1737 }
1738 }
1739 *nr_scanned = total_scan;
1740 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1741 total_scan, skipped, nr_taken, mode, lru);
1742 update_lru_sizes(lruvec, lru, nr_zone_taken);
1743 return nr_taken;
1744}
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
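/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * The caller must already hold a reference on the page.  Returns 0 if the
 * page was removed from an LRU list, -EBUSY if it was not on an LRU list.
 */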
1772int isolate_lru_page(struct page *page)
1773{
1774 int ret = -EBUSY;
1775
1776 VM_BUG_ON_PAGE(!page_count(page), page);
1777 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1778
1779 if (PageLRU(page)) {
1780 struct zone *zone = page_zone(page);
1781 struct lruvec *lruvec;
1782
1783 spin_lock_irq(zone_lru_lock(zone));
1784 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
1785 if (PageLRU(page)) {
1786 int lru = page_lru(page);
1787 get_page(page);
1788 ClearPageLRU(page);
1789 del_page_from_lru_list(page, lruvec, lru);
1790 ret = 0;
1791 }
1792 spin_unlock_irq(zone_lru_lock(zone));
1793 }
1794 return ret;
1795}
1796
1797
1798
1799
1800
1801
1802
1803
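/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list
 * and then get rescheduled.  When there are massive numbers of tasks doing
 * page allocation, such sleeping direct reclaimers may keep piling up on
 * each CPU, the LRU list will go small and be scanned faster than
 * necessary, leading to unnecessary swapping, thrashing and OOM.
 */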
1804static int too_many_isolated(struct pglist_data *pgdat, int file,
1805 struct scan_control *sc)
1806{
1807 unsigned long inactive, isolated;
1808
1809 if (current_is_kswapd())
1810 return 0;
1811
1812 if (!sane_reclaim(sc))
1813 return 0;
1814
1815 if (file) {
1816 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1817 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1818 } else {
1819 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1820 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1821 }
1822
1823
1824
1825
1826
1827
1828 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1829 inactive >>= 3;
1830
1831 return isolated > inactive;
1832}
1833
1834static noinline_for_stack void
1835putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1836{
1837 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1838 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1839 LIST_HEAD(pages_to_free);
1840
1841
1842
1843
1844 while (!list_empty(page_list)) {
1845 struct page *page = lru_to_page(page_list);
1846 int lru;
1847
1848 VM_BUG_ON_PAGE(PageLRU(page), page);
1849 list_del(&page->lru);
1850 if (unlikely(!page_evictable(page))) {
1851 spin_unlock_irq(&pgdat->lru_lock);
1852 putback_lru_page(page);
1853 spin_lock_irq(&pgdat->lru_lock);
1854 continue;
1855 }
1856
1857 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1858
1859 SetPageLRU(page);
1860 lru = page_lru(page);
1861 add_page_to_lru_list(page, lruvec, lru);
1862
1863 if (is_active_lru(lru)) {
1864 int file = is_file_lru(lru);
1865 int numpages = hpage_nr_pages(page);
1866 reclaim_stat->recent_rotated[file] += numpages;
1867 }
1868 if (put_page_testzero(page)) {
1869 __ClearPageLRU(page);
1870 __ClearPageActive(page);
1871 del_page_from_lru_list(page, lruvec, lru);
1872
1873 if (unlikely(PageCompound(page))) {
1874 spin_unlock_irq(&pgdat->lru_lock);
1875 mem_cgroup_uncharge(page);
1876 (*get_compound_page_dtor(page))(page);
1877 spin_lock_irq(&pgdat->lru_lock);
1878 } else
1879 list_add(&page->lru, &pages_to_free);
1880 }
1881 }
1882
1883
1884
1885
1886 list_splice(&pages_to_free, page_list);
1887}
1888
1889
1890
1891
1892
1893
1894
1895static int current_may_throttle(void)
1896{
1897 return !(current->flags & PF_LESS_THROTTLE) ||
1898 current->backing_dev_info == NULL ||
1899 bdi_write_congested(current->backing_dev_info);
1900}
1901
1902
1903
1904
1905
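/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the
 * number of reclaimed pages.
 */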
1906static noinline_for_stack unsigned long
1907shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1908 struct scan_control *sc, enum lru_list lru)
1909{
1910 LIST_HEAD(page_list);
1911 unsigned long nr_scanned;
1912 unsigned long nr_reclaimed = 0;
1913 unsigned long nr_taken;
1914 struct reclaim_stat stat = {};
1915 isolate_mode_t isolate_mode = 0;
1916 int file = is_file_lru(lru);
1917 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1918 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1919 bool stalled = false;
1920
1921 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1922 if (stalled)
1923 return 0;
1924
1925
1926 msleep(100);
1927 stalled = true;
1928
1929
1930 if (fatal_signal_pending(current))
1931 return SWAP_CLUSTER_MAX;
1932 }
1933
1934 lru_add_drain();
1935
1936 if (!sc->may_unmap)
1937 isolate_mode |= ISOLATE_UNMAPPED;
1938
1939 spin_lock_irq(&pgdat->lru_lock);
1940
1941 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1942 &nr_scanned, sc, isolate_mode, lru);
1943
1944 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1945 reclaim_stat->recent_scanned[file] += nr_taken;
1946
1947 if (current_is_kswapd()) {
1948 if (global_reclaim(sc))
1949 __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
1950 count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
1951 nr_scanned);
1952 } else {
1953 if (global_reclaim(sc))
1954 __count_vm_events(PGSCAN_DIRECT, nr_scanned);
1955 count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
1956 nr_scanned);
1957 }
1958 spin_unlock_irq(&pgdat->lru_lock);
1959
1960 if (nr_taken == 0)
1961 return 0;
1962
1963 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
1964 &stat, false);
1965
1966 spin_lock_irq(&pgdat->lru_lock);
1967
1968 if (current_is_kswapd()) {
1969 if (global_reclaim(sc))
1970 __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
1971 count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
1972 nr_reclaimed);
1973 } else {
1974 if (global_reclaim(sc))
1975 __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
1976 count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
1977 nr_reclaimed);
1978 }
1979
1980 putback_inactive_pages(lruvec, &page_list);
1981
1982 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1983
1984 spin_unlock_irq(&pgdat->lru_lock);
1985
1986 mem_cgroup_uncharge_list(&page_list);
1987 free_unref_page_list(&page_list);
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000 if (stat.nr_unqueued_dirty == nr_taken)
2001 wakeup_flusher_threads(WB_REASON_VMSCAN);
2002
2003 sc->nr.dirty += stat.nr_dirty;
2004 sc->nr.congested += stat.nr_congested;
2005 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
2006 sc->nr.writeback += stat.nr_writeback;
2007 sc->nr.immediate += stat.nr_immediate;
2008 sc->nr.taken += nr_taken;
2009 if (file)
2010 sc->nr.file_taken += nr_taken;
2011
2012 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2013 nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2014 return nr_reclaimed;
2015}
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
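/*
 * Move pages from @list back onto the @lru list of @lruvec while the LRU
 * lock is held, dropping and re-taking the lock only to free compound
 * pages.  Pages whose refcount falls to zero here are collected on
 * @pages_to_free for the caller to release in bulk.
 *
 * Returns the number of pages moved to the LRU list.
 */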
2037static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
2038 struct list_head *list,
2039 struct list_head *pages_to_free,
2040 enum lru_list lru)
2041{
2042 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2043 struct page *page;
2044 int nr_pages;
2045 int nr_moved = 0;
2046
2047 while (!list_empty(list)) {
2048 page = lru_to_page(list);
2049 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2050
2051 VM_BUG_ON_PAGE(PageLRU(page), page);
2052 SetPageLRU(page);
2053
2054 nr_pages = hpage_nr_pages(page);
2055 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
2056 list_move(&page->lru, &lruvec->lists[lru]);
2057
2058 if (put_page_testzero(page)) {
2059 __ClearPageLRU(page);
2060 __ClearPageActive(page);
2061 del_page_from_lru_list(page, lruvec, lru);
2062
2063 if (unlikely(PageCompound(page))) {
2064 spin_unlock_irq(&pgdat->lru_lock);
2065 mem_cgroup_uncharge(page);
2066 (*get_compound_page_dtor(page))(page);
2067 spin_lock_irq(&pgdat->lru_lock);
2068 } else
2069 list_add(&page->lru, pages_to_free);
2070 } else {
2071 nr_moved += nr_pages;
2072 }
2073 }
2074
2075 if (!is_active_lru(lru)) {
2076 __count_vm_events(PGDEACTIVATE, nr_moved);
2077 count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
2078 nr_moved);
2079 }
2080
2081 return nr_moved;
2082}
2083
2084static void shrink_active_list(unsigned long nr_to_scan,
2085 struct lruvec *lruvec,
2086 struct scan_control *sc,
2087 enum lru_list lru)
2088{
2089 unsigned long nr_taken;
2090 unsigned long nr_scanned;
2091 unsigned long vm_flags;
2092 LIST_HEAD(l_hold);
2093 LIST_HEAD(l_active);
2094 LIST_HEAD(l_inactive);
2095 struct page *page;
2096 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2097 unsigned nr_deactivate, nr_activate;
2098 unsigned nr_rotated = 0;
2099 isolate_mode_t isolate_mode = 0;
2100 int file = is_file_lru(lru);
2101 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2102
2103 lru_add_drain();
2104
2105 if (!sc->may_unmap)
2106 isolate_mode |= ISOLATE_UNMAPPED;
2107
2108 spin_lock_irq(&pgdat->lru_lock);
2109
2110 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2111 &nr_scanned, sc, isolate_mode, lru);
2112
2113 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2114 reclaim_stat->recent_scanned[file] += nr_taken;
2115
2116 __count_vm_events(PGREFILL, nr_scanned);
2117 count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2118
2119 spin_unlock_irq(&pgdat->lru_lock);
2120
2121 while (!list_empty(&l_hold)) {
2122 cond_resched();
2123 page = lru_to_page(&l_hold);
2124 list_del(&page->lru);
2125
2126 if (unlikely(!page_evictable(page))) {
2127 putback_lru_page(page);
2128 continue;
2129 }
2130
2131 if (unlikely(buffer_heads_over_limit)) {
2132 if (page_has_private(page) && trylock_page(page)) {
2133 if (page_has_private(page))
2134 try_to_release_page(page, 0);
2135 unlock_page(page);
2136 }
2137 }
2138
2139 if (page_referenced(page, 0, sc->target_mem_cgroup,
2140 &vm_flags)) {
2141 nr_rotated += hpage_nr_pages(page);
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
2152 list_add(&page->lru, &l_active);
2153 continue;
2154 }
2155 }
2156
2157 ClearPageActive(page);
2158 SetPageWorkingset(page);
2159 list_add(&page->lru, &l_inactive);
2160 }
2161
2162
2163
2164
2165 spin_lock_irq(&pgdat->lru_lock);
2166
2167
2168
2169
2170
2171
2172 reclaim_stat->recent_rotated[file] += nr_rotated;
2173
2174 nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
2175 nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
2176 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2177 spin_unlock_irq(&pgdat->lru_lock);
2178
2179 mem_cgroup_uncharge_list(&l_hold);
2180 free_unref_page_list(&l_hold);
2181 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2182 nr_deactivate, nr_rotated, sc->priority, file);
2183}
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
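/*
 * The inactive list is considered "low" when its size, scaled by an
 * inactive_ratio derived from the combined list size (roughly
 * int_sqrt(10 * gigabytes) above 1GB, otherwise 1), falls below the size
 * of the active list.  For file LRUs under actual reclaim, a recent
 * workingset refault forces the ratio to 0 so active pages keep being
 * deactivated while the workingset is thrashing.
 */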
2213static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2214 struct mem_cgroup *memcg,
2215 struct scan_control *sc, bool actual_reclaim)
2216{
2217 enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
2218 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2219 enum lru_list inactive_lru = file * LRU_FILE;
2220 unsigned long inactive, active;
2221 unsigned long inactive_ratio;
2222 unsigned long refaults;
2223 unsigned long gb;
2224
2225
2226
2227
2228
2229 if (!file && !total_swap_pages)
2230 return false;
2231
2232 inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
2233 active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
2234
2235 if (memcg)
2236 refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
2237 else
2238 refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
2239
2240
2241
2242
2243
2244
2245 if (file && actual_reclaim && lruvec->refaults != refaults) {
2246 inactive_ratio = 0;
2247 } else {
2248 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2249 if (gb)
2250 inactive_ratio = int_sqrt(10 * gb);
2251 else
2252 inactive_ratio = 1;
2253 }
2254
2255 if (actual_reclaim)
2256 trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
2257 lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
2258 lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
2259 inactive_ratio, file);
2260
2261 return inactive * inactive_ratio < active;
2262}
2263
2264static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2265 struct lruvec *lruvec, struct mem_cgroup *memcg,
2266 struct scan_control *sc)
2267{
2268 if (is_active_lru(lru)) {
2269 if (inactive_list_is_low(lruvec, is_file_lru(lru),
2270 memcg, sc, true))
2271 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2272 return 0;
2273 }
2274
2275 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2276}
2277
2278enum scan_balance {
2279 SCAN_EQUAL,
2280 SCAN_FRACT,
2281 SCAN_ANON,
2282 SCAN_FILE,
2283};
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
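/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined by
 * looking at the fraction of the pages scanned that we rotated back onto
 * the active list instead of reclaiming.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */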
2294static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2295 struct scan_control *sc, unsigned long *nr,
2296 unsigned long *lru_pages)
2297{
2298 int swappiness = mem_cgroup_swappiness(memcg);
2299 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2300 u64 fraction[2];
2301 u64 denominator = 0;
2302 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2303 unsigned long anon_prio, file_prio;
2304 enum scan_balance scan_balance;
2305 unsigned long anon, file;
2306 unsigned long ap, fp;
2307 enum lru_list lru;
2308
2309
2310 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2311 scan_balance = SCAN_FILE;
2312 goto out;
2313 }
2314
2315
2316
2317
2318
2319
2320
2321
2322 if (!global_reclaim(sc) && !swappiness) {
2323 scan_balance = SCAN_FILE;
2324 goto out;
2325 }
2326
2327
2328
2329
2330
2331
2332 if (!sc->priority && swappiness) {
2333 scan_balance = SCAN_EQUAL;
2334 goto out;
2335 }
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346 if (global_reclaim(sc)) {
2347 unsigned long pgdatfile;
2348 unsigned long pgdatfree;
2349 int z;
2350 unsigned long total_high_wmark = 0;
2351
2352 pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2353 pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
2354 node_page_state(pgdat, NR_INACTIVE_FILE);
2355
2356 for (z = 0; z < MAX_NR_ZONES; z++) {
2357 struct zone *zone = &pgdat->node_zones[z];
2358 if (!managed_zone(zone))
2359 continue;
2360
2361 total_high_wmark += high_wmark_pages(zone);
2362 }
2363
2364 if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
2365
2366
2367
2368
2369
2370 if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
2371 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
2372 >> sc->priority) {
2373 scan_balance = SCAN_ANON;
2374 goto out;
2375 }
2376 }
2377 }
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388 if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
2389 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
2390 scan_balance = SCAN_FILE;
2391 goto out;
2392 }
2393
2394 scan_balance = SCAN_FRACT;
2395
2396
2397
2398
2399
2400 anon_prio = swappiness;
2401 file_prio = 200 - anon_prio;
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
2416 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
2417 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
2418 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
2419
2420 spin_lock_irq(&pgdat->lru_lock);
2421 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
2422 reclaim_stat->recent_scanned[0] /= 2;
2423 reclaim_stat->recent_rotated[0] /= 2;
2424 }
2425
2426 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
2427 reclaim_stat->recent_scanned[1] /= 2;
2428 reclaim_stat->recent_rotated[1] /= 2;
2429 }
2430
2431
2432
2433
2434
2435
2436 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
2437 ap /= reclaim_stat->recent_rotated[0] + 1;
2438
2439 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
2440 fp /= reclaim_stat->recent_rotated[1] + 1;
2441 spin_unlock_irq(&pgdat->lru_lock);
2442
2443 fraction[0] = ap;
2444 fraction[1] = fp;
2445 denominator = ap + fp + 1;
2446out:
2447 *lru_pages = 0;
2448 for_each_evictable_lru(lru) {
2449 int file = is_file_lru(lru);
2450 unsigned long size;
2451 unsigned long scan;
2452
2453 size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2454 scan = size >> sc->priority;
2455
2456
2457
2458
2459 if (!scan && !mem_cgroup_online(memcg))
2460 scan = min(size, SWAP_CLUSTER_MAX);
2461
2462 switch (scan_balance) {
2463 case SCAN_EQUAL:
2464
2465 break;
2466 case SCAN_FRACT:
2467
2468
2469
2470
2471
2472
2473 scan = DIV64_U64_ROUND_UP(scan * fraction[file],
2474 denominator);
2475 break;
2476 case SCAN_FILE:
2477 case SCAN_ANON:
2478
2479 if ((scan_balance == SCAN_FILE) != file) {
2480 size = 0;
2481 scan = 0;
2482 }
2483 break;
2484 default:
2485
2486 BUG();
2487 }
2488
2489 *lru_pages += size;
2490 nr[lru] = scan;
2491 }
2492}
2493
2494
2495
2496
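/*
 * This is a basic per-node page freer.  Used by both kswapd and direct
 * reclaim.
 */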
2497static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
2498 struct scan_control *sc, unsigned long *lru_pages)
2499{
2500 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2501 unsigned long nr[NR_LRU_LISTS];
2502 unsigned long targets[NR_LRU_LISTS];
2503 unsigned long nr_to_scan;
2504 enum lru_list lru;
2505 unsigned long nr_reclaimed = 0;
2506 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2507 struct blk_plug plug;
2508 bool scan_adjusted;
2509
2510 get_scan_count(lruvec, memcg, sc, nr, lru_pages);
2511
2512
2513 memcpy(targets, nr, sizeof(nr));
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2527 sc->priority == DEF_PRIORITY);
2528
2529 blk_start_plug(&plug);
2530 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2531 nr[LRU_INACTIVE_FILE]) {
2532 unsigned long nr_anon, nr_file, percentage;
2533 unsigned long nr_scanned;
2534
2535 for_each_evictable_lru(lru) {
2536 if (nr[lru]) {
2537 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2538 nr[lru] -= nr_to_scan;
2539
2540 nr_reclaimed += shrink_list(lru, nr_to_scan,
2541 lruvec, memcg, sc);
2542 }
2543 }
2544
2545 cond_resched();
2546
2547 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2548 continue;
2549
2550
2551
2552
2553
2554
2555
2556
2557 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2558 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2559
2560
2561
2562
2563
2564
2565
2566 if (!nr_file || !nr_anon)
2567 break;
2568
2569 if (nr_file > nr_anon) {
2570 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2571 targets[LRU_ACTIVE_ANON] + 1;
2572 lru = LRU_BASE;
2573 percentage = nr_anon * 100 / scan_target;
2574 } else {
2575 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2576 targets[LRU_ACTIVE_FILE] + 1;
2577 lru = LRU_FILE;
2578 percentage = nr_file * 100 / scan_target;
2579 }
2580
2581
2582 nr[lru] = 0;
2583 nr[lru + LRU_ACTIVE] = 0;
2584
2585
2586
2587
2588
2589 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2590 nr_scanned = targets[lru] - nr[lru];
2591 nr[lru] = targets[lru] * (100 - percentage) / 100;
2592 nr[lru] -= min(nr[lru], nr_scanned);
2593
2594 lru += LRU_ACTIVE;
2595 nr_scanned = targets[lru] - nr[lru];
2596 nr[lru] = targets[lru] * (100 - percentage) / 100;
2597 nr[lru] -= min(nr[lru], nr_scanned);
2598
2599 scan_adjusted = true;
2600 }
2601 blk_finish_plug(&plug);
2602 sc->nr_reclaimed += nr_reclaimed;
2603
2604
2605
2606
2607
2608 if (inactive_list_is_low(lruvec, false, memcg, sc, true))
2609 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2610 sc, LRU_ACTIVE_ANON);
2611}
2612
2613
2614static bool in_reclaim_compaction(struct scan_control *sc)
2615{
2616 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2617 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2618 sc->priority < DEF_PRIORITY - 2))
2619 return true;
2620
2621 return false;
2622}
2623
2624
2625
2626
2627
2628
2629
2630
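/*
 * Reclaim/compaction is used for high-order allocation requests.  It
 * reclaims order-0 pages so that compaction can then assemble them into
 * the requested order.  Returns true if more reclaim should be done to
 * give compaction a better chance of succeeding.
 */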
2631static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2632 unsigned long nr_reclaimed,
2633 unsigned long nr_scanned,
2634 struct scan_control *sc)
2635{
2636 unsigned long pages_for_compaction;
2637 unsigned long inactive_lru_pages;
2638 int z;
2639
2640
2641 if (!in_reclaim_compaction(sc))
2642 return false;
2643
2644
2645 if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
2646
2647
2648
2649
2650
2651
2652 if (!nr_reclaimed && !nr_scanned)
2653 return false;
2654 } else {
2655
2656
2657
2658
2659
2660
2661
2662
2663 if (!nr_reclaimed)
2664 return false;
2665 }
2666
2667
2668
2669
2670
2671 pages_for_compaction = compact_gap(sc->order);
2672 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2673 if (get_nr_swap_pages() > 0)
2674 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2675 if (sc->nr_reclaimed < pages_for_compaction &&
2676 inactive_lru_pages > pages_for_compaction)
2677 return true;
2678
2679
2680 for (z = 0; z <= sc->reclaim_idx; z++) {
2681 struct zone *zone = &pgdat->node_zones[z];
2682 if (!managed_zone(zone))
2683 continue;
2684
2685 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2686 case COMPACT_SUCCESS:
2687 case COMPACT_CONTINUE:
2688 return false;
2689 default:
2690
2691 ;
2692 }
2693 }
2694 return true;
2695}
2696
2697static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
2698{
2699 return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
2700 (memcg && memcg_congested(pgdat, memcg));
2701}
2702
2703static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2704{
2705 struct reclaim_state *reclaim_state = current->reclaim_state;
2706 unsigned long nr_reclaimed, nr_scanned;
2707 bool reclaimable = false;
2708
2709 do {
2710 struct mem_cgroup *root = sc->target_mem_cgroup;
2711 struct mem_cgroup_reclaim_cookie reclaim = {
2712 .pgdat = pgdat,
2713 .priority = sc->priority,
2714 };
2715 unsigned long node_lru_pages = 0;
2716 struct mem_cgroup *memcg;
2717
2718 memset(&sc->nr, 0, sizeof(sc->nr));
2719
2720 nr_reclaimed = sc->nr_reclaimed;
2721 nr_scanned = sc->nr_scanned;
2722
2723 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2724 do {
2725 unsigned long lru_pages;
2726 unsigned long reclaimed;
2727 unsigned long scanned;
2728
2729 switch (mem_cgroup_protected(root, memcg)) {
2730 case MEMCG_PROT_MIN:
2731
2732
2733
2734
2735 continue;
2736 case MEMCG_PROT_LOW:
2737
2738
2739
2740
2741
2742
2743 if (!sc->memcg_low_reclaim) {
2744 sc->memcg_low_skipped = 1;
2745 continue;
2746 }
2747 memcg_memory_event(memcg, MEMCG_LOW);
2748 break;
2749 case MEMCG_PROT_NONE:
2750 break;
2751 }
2752
2753 reclaimed = sc->nr_reclaimed;
2754 scanned = sc->nr_scanned;
2755 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2756 node_lru_pages += lru_pages;
2757
2758 shrink_slab(sc->gfp_mask, pgdat->node_id,
2759 memcg, sc->priority);
2760
2761
2762 vmpressure(sc->gfp_mask, memcg, false,
2763 sc->nr_scanned - scanned,
2764 sc->nr_reclaimed - reclaimed);
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776 if (!global_reclaim(sc) &&
2777 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2778 mem_cgroup_iter_break(root, memcg);
2779 break;
2780 }
2781 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
2782
2783 if (reclaim_state) {
2784 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2785 reclaim_state->reclaimed_slab = 0;
2786 }
2787
2788
2789 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2790 sc->nr_scanned - nr_scanned,
2791 sc->nr_reclaimed - nr_reclaimed);
2792
2793 if (sc->nr_reclaimed - nr_reclaimed)
2794 reclaimable = true;
2795
2796 if (current_is_kswapd()) {
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
2815 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
2816
2817
2818
2819
2820
2821
2822 if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2823 set_bit(PGDAT_CONGESTED, &pgdat->flags);
2824
2825
2826 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
2827 set_bit(PGDAT_DIRTY, &pgdat->flags);
2828
2829
2830
2831
2832
2833
2834
2835 if (sc->nr.immediate)
2836 congestion_wait(BLK_RW_ASYNC, HZ/10);
2837 }
2838
2839
2840
2841
2842
2843 if (!global_reclaim(sc) && sane_reclaim(sc) &&
2844 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2845 set_memcg_congestion(pgdat, root, true);
2846
2847
2848
2849
2850
2851
2852
2853 if (!sc->hibernation_mode && !current_is_kswapd() &&
2854 current_may_throttle() && pgdat_memcg_congested(pgdat, root))
2855 wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2856
2857 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2858 sc->nr_scanned - nr_scanned, sc));
2859
2860
2861
2862
2863
2864
2865
2866 if (reclaimable)
2867 pgdat->kswapd_failures = 0;
2868
2869 return reclaimable;
2870}
2871
2872
2873
2874
2875
2876
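/*
 * Returns true if compaction should go ahead for a costly-order request,
 * or if the allocation would already succeed without compaction.  Returns
 * false if reclaim should continue first.
 */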
2877static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2878{
2879 unsigned long watermark;
2880 enum compact_result suitable;
2881
2882 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2883 if (suitable == COMPACT_SUCCESS)
2884
2885 return true;
2886 if (suitable == COMPACT_SKIPPED)
2887
2888 return false;
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
2900
2901 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2902}
2903
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
2912static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2913{
2914 struct zoneref *z;
2915 struct zone *zone;
2916 unsigned long nr_soft_reclaimed;
2917 unsigned long nr_soft_scanned;
2918 gfp_t orig_mask;
2919 pg_data_t *last_pgdat = NULL;
2920
	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads.
	 */
2926 orig_mask = sc->gfp_mask;
2927 if (buffer_heads_over_limit) {
2928 sc->gfp_mask |= __GFP_HIGHMEM;
2929 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2930 }
2931
2932 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2933 sc->reclaim_idx, sc->nodemask) {
2934
2935
2936
2937
2938 if (global_reclaim(sc)) {
2939 if (!cpuset_zone_allowed(zone,
2940 GFP_KERNEL | __GFP_HARDWALL))
2941 continue;
2942
			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
2952 if (IS_ENABLED(CONFIG_COMPACTION) &&
2953 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2954 compaction_ready(zone, sc)) {
2955 sc->compaction_ready = true;
2956 continue;
2957 }
2958
2959
2960
2961
2962
2963
2964
2965 if (zone->zone_pgdat == last_pgdat)
2966 continue;
2967
			/*
			 * This steals pages from memory cgroups over softlimit
			 * and returns the number of reclaimed pages and
			 * scanned pages. This works for global memory pressure
			 * and balancing, not for a memcg's limit.
			 */
2974 nr_soft_scanned = 0;
2975 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2976 sc->order, sc->gfp_mask,
2977 &nr_soft_scanned);
2978 sc->nr_reclaimed += nr_soft_reclaimed;
2979 sc->nr_scanned += nr_soft_scanned;
2980
2981 }
2982
2983
2984 if (zone->zone_pgdat == last_pgdat)
2985 continue;
2986 last_pgdat = zone->zone_pgdat;
2987 shrink_node(zone->zone_pgdat, sc);
2988 }
2989
	/*
	 * Restore to original mask to avoid the impact on the caller if we
	 * promoted it to __GFP_HIGHMEM.
	 */
2994 sc->gfp_mask = orig_mask;
2995}
2996
2997static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
2998{
2999 struct mem_cgroup *memcg;
3000
3001 memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
3002 do {
3003 unsigned long refaults;
3004 struct lruvec *lruvec;
3005
3006 if (memcg)
3007 refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
3008 else
3009 refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
3010
3011 lruvec = mem_cgroup_lruvec(pgdat, memcg);
3012 lruvec->refaults = refaults;
3013 } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
3014}
3015
/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take explicit
 * naps in the hope that some of these pages can be written.  But if the
 * allocating task holds filesystem locks which prevent writeout this might not
 * work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
3032static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3033 struct scan_control *sc)
3034{
3035 int initial_priority = sc->priority;
3036 pg_data_t *last_pgdat;
3037 struct zoneref *z;
3038 struct zone *zone;
3039retry:
3040 delayacct_freepages_start();
3041
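	/* Only global (non-cgroup) reclaim counts as an allocation stall. */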
3042 if (global_reclaim(sc))
3043 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3044
3045 do {
3046 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3047 sc->priority);
3048 sc->nr_scanned = 0;
3049 shrink_zones(zonelist, sc);
3050
3051 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3052 break;
3053
3054 if (sc->compaction_ready)
3055 break;
3056
		/*
		 * If we're getting trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
3061 if (sc->priority < DEF_PRIORITY - 2)
3062 sc->may_writepage = 1;
3063 } while (--sc->priority >= 0);
3064
3065 last_pgdat = NULL;
3066 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3067 sc->nodemask) {
3068 if (zone->zone_pgdat == last_pgdat)
3069 continue;
3070 last_pgdat = zone->zone_pgdat;
3071 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3072 set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
3073 }
3074
3075 delayacct_freepages_end();
3076
3077 if (sc->nr_reclaimed)
3078 return sc->nr_reclaimed;
3079
	/* Aborted reclaim to try compaction? Don't OOM, then. */
3081 if (sc->compaction_ready)
3082 return 1;
3083
	/* Untapped cgroup reserves?  Don't OOM, retry. */
3085 if (sc->memcg_low_skipped) {
3086 sc->priority = initial_priority;
3087 sc->memcg_low_reclaim = 1;
3088 sc->memcg_low_skipped = 0;
3089 goto retry;
3090 }
3091
3092 return 0;
3093}
3094
3095static bool allow_direct_reclaim(pg_data_t *pgdat)
3096{
3097 struct zone *zone;
3098 unsigned long pfmemalloc_reserve = 0;
3099 unsigned long free_pages = 0;
3100 int i;
3101 bool wmark_ok;
3102
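	/*
	 * If kswapd has already given up on this node, do not throttle
	 * direct reclaimers waiting for it to make progress.
	 */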
3103 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3104 return true;
3105
3106 for (i = 0; i <= ZONE_NORMAL; i++) {
3107 zone = &pgdat->node_zones[i];
3108 if (!managed_zone(zone))
3109 continue;
3110
3111 if (!zone_reclaimable_pages(zone))
3112 continue;
3113
3114 pfmemalloc_reserve += min_wmark_pages(zone);
3115 free_pages += zone_page_state(zone, NR_FREE_PAGES);
3116 }
3117
3118
3119 if (!pfmemalloc_reserve)
3120 return true;
3121
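	/*
	 * Allow direct reclaim while free pages stay above half of the
	 * summed min watermarks of the usable lower zones.
	 */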
3122 wmark_ok = free_pages > pfmemalloc_reserve / 2;
3123
3124
3125 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3126 pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
3127 (enum zone_type)ZONE_NORMAL);
3128 wake_up_interruptible(&pgdat->kswapd_wait);
3129 }
3130
3131 return wmark_ok;
3132}
3133
/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
3143static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3144 nodemask_t *nodemask)
3145{
3146 struct zoneref *z;
3147 struct zone *zone;
3148 pg_data_t *pgdat = NULL;
3149
	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction where throttling it could force other
	 * processes to block on log_wait_commit().
	 */
3157 if (current->flags & PF_KTHREAD)
3158 goto out;
3159
3160
3161
3162
3163
3164 if (fatal_signal_pending(current))
3165 goto out;
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3182 gfp_zone(gfp_mask), nodemask) {
3183 if (zone_idx(zone) > ZONE_NORMAL)
3184 continue;
3185
3186
3187 pgdat = zone->zone_pgdat;
3188 if (allow_direct_reclaim(pgdat))
3189 goto out;
3190 break;
3191 }
3192
3193
3194 if (!pgdat)
3195 goto out;
3196
3197
3198 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3199
	/*
	 * If the caller cannot enter the filesystem, it's possible that it
	 * is due to the caller holding an FS lock or performing a journal
	 * transaction in the case of a filesystem like ext[3|4]. In this case,
	 * it is not safe to block on pfmemalloc_wait as kswapd could be
	 * blocked waiting on the same lock. Instead, throttle for up to a
	 * second before continuing.
	 */
3208 if (!(gfp_mask & __GFP_FS)) {
3209 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3210 allow_direct_reclaim(pgdat), HZ);
3211
3212 goto check_pending;
3213 }
3214
3215
3216 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3217 allow_direct_reclaim(pgdat));
3218
3219check_pending:
3220 if (fatal_signal_pending(current))
3221 return true;
3222
3223out:
3224 return false;
3225}
3226
3227unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3228 gfp_t gfp_mask, nodemask_t *nodemask)
3229{
3230 unsigned long nr_reclaimed;
3231 struct scan_control sc = {
3232 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3233 .gfp_mask = current_gfp_context(gfp_mask),
3234 .reclaim_idx = gfp_zone(gfp_mask),
3235 .order = order,
3236 .nodemask = nodemask,
3237 .priority = DEF_PRIORITY,
3238 .may_writepage = !laptop_mode,
3239 .may_unmap = 1,
3240 .may_swap = 1,
3241 };
3242
3243
3244
3245
3246
3247 BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3248 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3249 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3250
	/*
	 * Do not enter reclaim if fatal signal was delivered while throttled.
	 * 1 is returned so that the page allocator does not OOM kill at this
	 * point.
	 */
3256 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3257 return 1;
3258
3259 trace_mm_vmscan_direct_reclaim_begin(order,
3260 sc.may_writepage,
3261 sc.gfp_mask,
3262 sc.reclaim_idx);
3263
3264 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3265
3266 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3267
3268 return nr_reclaimed;
3269}
3270
3271#ifdef CONFIG_MEMCG
3272
3273unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3274 gfp_t gfp_mask, bool noswap,
3275 pg_data_t *pgdat,
3276 unsigned long *nr_scanned)
3277{
3278 struct scan_control sc = {
3279 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3280 .target_mem_cgroup = memcg,
3281 .may_writepage = !laptop_mode,
3282 .may_unmap = 1,
3283 .reclaim_idx = MAX_NR_ZONES - 1,
3284 .may_swap = !noswap,
3285 };
3286 unsigned long lru_pages;
3287
3288 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3289 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3290
3291 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3292 sc.may_writepage,
3293 sc.gfp_mask,
3294 sc.reclaim_idx);
3295
3296
3297
3298
3299
3300
3301
3302
3303 shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
3304
3305 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3306
3307 *nr_scanned = sc.nr_scanned;
3308 return sc.nr_reclaimed;
3309}
3310
3311unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3312 unsigned long nr_pages,
3313 gfp_t gfp_mask,
3314 bool may_swap)
3315{
3316 struct zonelist *zonelist;
3317 unsigned long nr_reclaimed;
3318 unsigned long pflags;
3319 int nid;
3320 unsigned int noreclaim_flag;
3321 struct scan_control sc = {
3322 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3323 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3324 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3325 .reclaim_idx = MAX_NR_ZONES - 1,
3326 .target_mem_cgroup = memcg,
3327 .priority = DEF_PRIORITY,
3328 .may_writepage = !laptop_mode,
3329 .may_unmap = 1,
3330 .may_swap = may_swap,
3331 };
3332
3333
3334
3335
3336
3337
3338 nid = mem_cgroup_select_victim_node(memcg);
3339
3340 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
3341
3342 trace_mm_vmscan_memcg_reclaim_begin(0,
3343 sc.may_writepage,
3344 sc.gfp_mask,
3345 sc.reclaim_idx);
3346
3347 psi_memstall_enter(&pflags);
3348 noreclaim_flag = memalloc_noreclaim_save();
3349
3350 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3351
3352 memalloc_noreclaim_restore(noreclaim_flag);
3353 psi_memstall_leave(&pflags);
3354
3355 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3356
3357 return nr_reclaimed;
3358}
3359#endif
3360
3361static void age_active_anon(struct pglist_data *pgdat,
3362 struct scan_control *sc)
3363{
3364 struct mem_cgroup *memcg;
3365
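	/* Without swap there is no point in aging the anon lists. */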
3366 if (!total_swap_pages)
3367 return;
3368
3369 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3370 do {
3371 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
3372
3373 if (inactive_list_is_low(lruvec, false, memcg, sc, true))
3374 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3375 sc, LRU_ACTIVE_ANON);
3376
3377 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3378 } while (memcg);
3379}
3380
/*
 * Returns true if there is an eligible zone balanced for the request order
 * and classzone_idx.
 */
3385static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3386{
3387 int i;
3388 unsigned long mark = -1;
3389 struct zone *zone;
3390
3391 for (i = 0; i <= classzone_idx; i++) {
3392 zone = pgdat->node_zones + i;
3393
3394 if (!managed_zone(zone))
3395 continue;
3396
3397 mark = high_wmark_pages(zone);
3398 if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
3399 return true;
3400 }
3401
	/*
	 * If a node has no populated zone within classzone_idx, it does not
	 * need balancing by definition. This can happen if a zone-restricted
	 * allocation tries to wake a remote kswapd.
	 */
3407 if (mark == -1)
3408 return true;
3409
3410 return false;
3411}
3412
3413
3414static void clear_pgdat_congested(pg_data_t *pgdat)
3415{
3416 clear_bit(PGDAT_CONGESTED, &pgdat->flags);
3417 clear_bit(PGDAT_DIRTY, &pgdat->flags);
3418 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3419}
3420
/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep.
 */
3427static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3428{
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442 if (waitqueue_active(&pgdat->pfmemalloc_wait))
3443 wake_up_all(&pgdat->pfmemalloc_wait);
3444
	/* Hopeless node, leave it to direct reclaim */
3446 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3447 return true;
3448
3449 if (pgdat_balanced(pgdat, order, classzone_idx)) {
3450 clear_pgdat_congested(pgdat);
3451 return true;
3452 }
3453
3454 return false;
3455}
3456
/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
3465static bool kswapd_shrink_node(pg_data_t *pgdat,
3466 struct scan_control *sc)
3467{
3468 struct zone *zone;
3469 int z;
3470
3471
3472 sc->nr_to_reclaim = 0;
3473 for (z = 0; z <= sc->reclaim_idx; z++) {
3474 zone = pgdat->node_zones + z;
3475 if (!managed_zone(zone))
3476 continue;
3477
3478 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3479 }
3480
3481
3482
3483
3484
3485 shrink_node(pgdat, sc);
3486
	/*
	 * Fragmentation may mean that the system cannot be rebalanced for
	 * high-order allocations. If twice the allocation size has been
	 * reclaimed then recheck watermarks only at order-0 to prevent
	 * excessive reclaim. Assume that the order-0 watermarks are already
	 * met and that fragmentation is better dealt with by compaction than
	 * by further reclaim.
	 */
3494 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3495 sc->order = 0;
3496
3497 return sc->nr_scanned >= sc->nr_to_reclaim;
3498}
3499
/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * is not balanced again until all zones have reached the required point of
 * balance.
 */
3513static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3514{
3515 int i;
3516 unsigned long nr_soft_reclaimed;
3517 unsigned long nr_soft_scanned;
3518 unsigned long pflags;
3519 struct zone *zone;
3520 struct scan_control sc = {
3521 .gfp_mask = GFP_KERNEL,
3522 .order = order,
3523 .priority = DEF_PRIORITY,
3524 .may_writepage = !laptop_mode,
3525 .may_unmap = 1,
3526 .may_swap = 1,
3527 };
3528
3529 psi_memstall_enter(&pflags);
3530 __fs_reclaim_acquire();
3531
3532 count_vm_event(PAGEOUTRUN);
3533
3534 do {
3535 unsigned long nr_reclaimed = sc.nr_reclaimed;
3536 bool raise_priority = true;
3537 bool ret;
3538
3539 sc.reclaim_idx = classzone_idx;
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551 if (buffer_heads_over_limit) {
3552 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3553 zone = pgdat->node_zones + i;
3554 if (!managed_zone(zone))
3555 continue;
3556
3557 sc.reclaim_idx = i;
3558 break;
3559 }
3560 }
3561
		/*
		 * Only reclaim if there are no eligible zones. Note that
		 * sc.reclaim_idx is not used as buffer_heads_over_limit may
		 * have adjusted it.
		 */
3567 if (pgdat_balanced(pgdat, sc.order, classzone_idx))
3568 goto out;
3569
		/*
		 * Do some background aging of the anon list, to give
		 * pages a chance to be referenced before reclaiming. All
		 * pages are rotated regardless of classzone as this is
		 * about consistent aging.
		 */
3576 age_active_anon(pgdat, &sc);
3577
3578
3579
3580
3581
3582 if (sc.priority < DEF_PRIORITY - 2)
3583 sc.may_writepage = 1;
3584
3585
3586 sc.nr_scanned = 0;
3587 nr_soft_scanned = 0;
3588 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3589 sc.gfp_mask, &nr_soft_scanned);
3590 sc.nr_reclaimed += nr_soft_reclaimed;
3591
3592
3593
3594
3595
3596
3597 if (kswapd_shrink_node(pgdat, &sc))
3598 raise_priority = false;
3599
		/*
		 * Once enough memory is free again, processes throttled on
		 * pfmemalloc_wait can safely make forward progress, so wake
		 * them.
		 */
3605 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3606 allow_direct_reclaim(pgdat))
3607 wake_up_all(&pgdat->pfmemalloc_wait);
3608
3609
3610 __fs_reclaim_release();
3611 ret = try_to_freeze();
3612 __fs_reclaim_acquire();
3613 if (ret || kthread_should_stop())
3614 break;
3615
3616
3617
3618
3619
3620 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3621 if (raise_priority || !nr_reclaimed)
3622 sc.priority--;
3623 } while (sc.priority >= 1);
3624
3625 if (!sc.nr_reclaimed)
3626 pgdat->kswapd_failures++;
3627
3628out:
3629 snapshot_refaults(NULL, pgdat);
3630 __fs_reclaim_release();
3631 psi_memstall_leave(&pflags);
3632
	/*
	 * Return the order kswapd stopped reclaiming at as
	 * prepare_kswapd_sleep() takes it into account. If another caller
	 * entered the allocator slow path while kswapd was awake, order will
	 * remain at the higher level.
	 */
3638 return sc.order;
3639}
3640
/*
 * pgdat->kswapd_classzone_idx is the highest zone index that a recent
 * allocation request asked kswapd to reclaim for.  It is MAX_NR_ZONES
 * when no request is outstanding.  Return the higher of the remembered
 * index and the one passed in, so kswapd reclaims for the most
 * demanding of the pending requests.
 */
3648static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
3649 enum zone_type classzone_idx)
3650{
3651 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3652 return classzone_idx;
3653
3654 return max(pgdat->kswapd_classzone_idx, classzone_idx);
3655}
3656
3657static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3658 unsigned int classzone_idx)
3659{
3660 long remaining = 0;
3661 DEFINE_WAIT(wait);
3662
3663 if (freezing(current) || kthread_should_stop())
3664 return;
3665
3666 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3667
	/*
	 * Try to sleep for a short interval. Note that kcompactd will only be
	 * woken if it is possible to sleep for a short interval. This is
	 * deliberate on the assumption that if reclaim cannot keep an
	 * eligible zone balanced that it's also unlikely that compaction will
	 * succeed.
	 */
3675 if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3676
3677
3678
3679
3680
3681
3682 reset_isolation_suitable(pgdat);
3683
3684
3685
3686
3687
3688 wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
3689
3690 remaining = schedule_timeout(HZ/10);
3691
3692
3693
3694
3695
3696
3697 if (remaining) {
3698 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3699 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
3700 }
3701
3702 finish_wait(&pgdat->kswapd_wait, &wait);
3703 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3704 }
3705
3706
3707
3708
3709
3710 if (!remaining &&
3711 prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3712 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3713
		/*
		 * vmstat counters are not perfectly accurate and the
		 * estimated value for counters such as NR_FREE_PAGES can
		 * deviate from the true value by nr_online_cpus * threshold.
		 * To avoid the zone watermarks being breached while under
		 * pressure, we reduce the per-cpu vmstat threshold while
		 * kswapd is awake and restore it before going back to sleep.
		 */
3722 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3723
3724 if (!kthread_should_stop())
3725 schedule();
3726
3727 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3728 } else {
3729 if (remaining)
3730 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3731 else
3732 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3733 }
3734 finish_wait(&pgdat->kswapd_wait, &wait);
3735}
3736
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
3750static int kswapd(void *p)
3751{
3752 unsigned int alloc_order, reclaim_order;
3753 unsigned int classzone_idx = MAX_NR_ZONES - 1;
3754 pg_data_t *pgdat = (pg_data_t*)p;
3755 struct task_struct *tsk = current;
3756
3757 struct reclaim_state reclaim_state = {
3758 .reclaimed_slab = 0,
3759 };
3760 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3761
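	/* Run kswapd on the CPUs local to its node whenever possible. */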
3762 if (!cpumask_empty(cpumask))
3763 set_cpus_allowed_ptr(tsk, cpumask);
3764 current->reclaim_state = &reclaim_state;
3765
	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
3778 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3779 set_freezable();
3780
3781 pgdat->kswapd_order = 0;
3782 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3783 for ( ; ; ) {
3784 bool ret;
3785
3786 alloc_order = reclaim_order = pgdat->kswapd_order;
3787 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3788
3789kswapd_try_sleep:
3790 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3791 classzone_idx);
3792
3793
3794 alloc_order = reclaim_order = pgdat->kswapd_order;
3795 classzone_idx = kswapd_classzone_idx(pgdat, 0);
3796 pgdat->kswapd_order = 0;
3797 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3798
3799 ret = try_to_freeze();
3800 if (kthread_should_stop())
3801 break;
3802
3803
3804
3805
3806
3807 if (ret)
3808 continue;
3809
		/*
		 * Reclaim begins at the requested order but if a high-order
		 * reclaim fails then kswapd falls back to reclaiming for
		 * order-0. If that happens, kswapd will consider sleeping
		 * for the order it finished reclaiming at (reclaim_order)
		 * but kcompactd is woken to compact for the original
		 * request (alloc_order).
		 */
3818 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3819 alloc_order);
3820 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3821 if (reclaim_order < alloc_order)
3822 goto kswapd_try_sleep;
3823 }
3824
3825 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3826 current->reclaim_state = NULL;
3827
3828 return 0;
3829}
3830
/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
 * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
 * has failed or is not needed, still wake up kcompactd if only compaction is
 * needed.
 */
3838void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3839 enum zone_type classzone_idx)
3840{
3841 pg_data_t *pgdat;
3842
3843 if (!managed_zone(zone))
3844 return;
3845
3846 if (!cpuset_zone_allowed(zone, gfp_flags))
3847 return;
3848 pgdat = zone->zone_pgdat;
3849 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
3850 classzone_idx);
3851 pgdat->kswapd_order = max(pgdat->kswapd_order, order);
3852 if (!waitqueue_active(&pgdat->kswapd_wait))
3853 return;
3854
3855
3856 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3857 pgdat_balanced(pgdat, order, classzone_idx)) {
		/*
		 * There may be plenty of free memory available, but it's too
		 * fragmented for high-order allocations.  Wake up kcompactd
		 * and rely on compaction_suitable() to determine if it's
		 * needed.  If it fails, it will defer subsequent attempts to
		 * ratelimit its work.
		 */
3865 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3866 wakeup_kcompactd(pgdat, order, classzone_idx);
3867 return;
3868 }
3869
3870 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
3871 gfp_flags);
3872 wake_up_interruptible(&pgdat->kswapd_wait);
3873}
3874
3875#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * inactive:active ratio of the entire memory, and try to minimise the
 * total number of pages reclaimed.
 */
3884unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3885{
3886 struct reclaim_state reclaim_state;
3887 struct scan_control sc = {
3888 .nr_to_reclaim = nr_to_reclaim,
3889 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3890 .reclaim_idx = MAX_NR_ZONES - 1,
3891 .priority = DEF_PRIORITY,
3892 .may_writepage = 1,
3893 .may_unmap = 1,
3894 .may_swap = 1,
3895 .hibernation_mode = 1,
3896 };
3897 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3898 struct task_struct *p = current;
3899 unsigned long nr_reclaimed;
3900 unsigned int noreclaim_flag;
3901
3902 fs_reclaim_acquire(sc.gfp_mask);
3903 noreclaim_flag = memalloc_noreclaim_save();
3904 reclaim_state.reclaimed_slab = 0;
3905 p->reclaim_state = &reclaim_state;
3906
3907 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3908
3909 p->reclaim_state = NULL;
3910 memalloc_noreclaim_restore(noreclaim_flag);
3911 fs_reclaim_release(sc.gfp_mask);
3912
3913 return nr_reclaimed;
3914}
3915#endif
3916
/* It's optimal to keep kswapds on the same CPUs as their memory, but
   not required for correctness.  So if the last cpu in a node goes
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
3921static int kswapd_cpu_online(unsigned int cpu)
3922{
3923 int nid;
3924
3925 for_each_node_state(nid, N_MEMORY) {
3926 pg_data_t *pgdat = NODE_DATA(nid);
3927 const struct cpumask *mask;
3928
3929 mask = cpumask_of_node(pgdat->node_id);
3930
3931 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3932
3933 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3934 }
3935 return 0;
3936}
3937
/*
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
 */
3942int kswapd_run(int nid)
3943{
3944 pg_data_t *pgdat = NODE_DATA(nid);
3945 int ret = 0;
3946
3947 if (pgdat->kswapd)
3948 return 0;
3949
3950 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3951 if (IS_ERR(pgdat->kswapd)) {
3952
3953 BUG_ON(system_state < SYSTEM_RUNNING);
3954 pr_err("Failed to start kswapd on node %d\n", nid);
3955 ret = PTR_ERR(pgdat->kswapd);
3956 pgdat->kswapd = NULL;
3957 }
3958 return ret;
3959}
3960
3961
3962
3963
3964
3965void kswapd_stop(int nid)
3966{
3967 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3968
3969 if (kswapd) {
3970 kthread_stop(kswapd);
3971 NODE_DATA(nid)->kswapd = NULL;
3972 }
3973}
3974
3975static int __init kswapd_init(void)
3976{
3977 int nid, ret;
3978
3979 swap_setup();
3980 for_each_node_state(nid, N_MEMORY)
3981 kswapd_run(nid);
3982 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
3983 "mm/vmscan:online", kswapd_cpu_online,
3984 NULL);
3985 WARN_ON(ret < 0);
3986 return 0;
3987}
3988
3989module_init(kswapd_init)
3990
3991#ifdef CONFIG_NUMA
/*
 * Node reclaim mode
 *
 * If non-zero call node_reclaim when the number of free pages falls below
 * the watermarks.
 */
3998int node_reclaim_mode __read_mostly;
3999
4000#define RECLAIM_OFF 0
4001#define RECLAIM_ZONE (1<<0)
4002#define RECLAIM_WRITE (1<<1)
4003#define RECLAIM_UNMAP (1<<2)
4004
/*
 * Priority for NODE_RECLAIM. This determines the fraction of pages
 * of a node that are scanned in one reclaim pass; priority 4 scans
 * roughly 1/16th of the node's LRU pages.
 */
4010#define NODE_RECLAIM_PRIORITY 4
4011
/*
 * Percentage of pages in a zone that must be unmapped for node_reclaim to
 * occur.
 */
4016int sysctl_min_unmapped_ratio = 1;
4017
/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
4022int sysctl_min_slab_ratio = 5;
4023
4024static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4025{
4026 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4027 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4028 node_page_state(pgdat, NR_ACTIVE_FILE);
4029
4030
4031
4032
4033
4034
4035 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4036}
4037
4038
4039static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4040{
4041 unsigned long nr_pagecache_reclaimable;
4042 unsigned long delta = 0;
4043
4044
4045
4046
4047
4048
4049
4050 if (node_reclaim_mode & RECLAIM_UNMAP)
4051 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4052 else
4053 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4054
4055
4056 if (!(node_reclaim_mode & RECLAIM_WRITE))
4057 delta += node_page_state(pgdat, NR_FILE_DIRTY);
4058
4059
4060 if (unlikely(delta > nr_pagecache_reclaimable))
4061 delta = nr_pagecache_reclaimable;
4062
4063 return nr_pagecache_reclaimable - delta;
4064}
4065
/*
 * Try to free up some pages from this node through reclaim.
 */
4069static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4070{
4071
4072 const unsigned long nr_pages = 1 << order;
4073 struct task_struct *p = current;
4074 struct reclaim_state reclaim_state;
4075 unsigned int noreclaim_flag;
4076 struct scan_control sc = {
4077 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4078 .gfp_mask = current_gfp_context(gfp_mask),
4079 .order = order,
4080 .priority = NODE_RECLAIM_PRIORITY,
4081 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4082 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4083 .may_swap = 1,
4084 .reclaim_idx = gfp_zone(gfp_mask),
4085 };
4086
4087 cond_resched();
4088 fs_reclaim_acquire(sc.gfp_mask);
4089
4090
4091
4092
4093
4094 noreclaim_flag = memalloc_noreclaim_save();
4095 p->flags |= PF_SWAPWRITE;
4096 reclaim_state.reclaimed_slab = 0;
4097 p->reclaim_state = &reclaim_state;
4098
4099 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4100
4101
4102
4103
4104 do {
4105 shrink_node(pgdat, &sc);
4106 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4107 }
4108
4109 p->reclaim_state = NULL;
4110 current->flags &= ~PF_SWAPWRITE;
4111 memalloc_noreclaim_restore(noreclaim_flag);
4112 fs_reclaim_release(sc.gfp_mask);
4113 return sc.nr_reclaimed >= nr_pages;
4114}
4115
4116int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4117{
4118 int ret;
4119
	/*
	 * Node reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the node is overallocated. So we do not reclaim
	 * if less than a specified percentage of the node is used by
	 * unmapped file backed pages.
	 */
4130 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4131 node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
4132 return NODE_RECLAIM_FULL;
4133
4134
4135
4136
4137 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4138 return NODE_RECLAIM_NOSCAN;
4139
	/*
	 * Only run node reclaim on the local node or on nodes that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
4146 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4147 return NODE_RECLAIM_NOSCAN;
4148
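	/*
	 * Only one reclaim pass runs per node at a time; the
	 * PGDAT_RECLAIM_LOCKED bit serializes concurrent callers.
	 */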
4149 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4150 return NODE_RECLAIM_NOSCAN;
4151
4152 ret = __node_reclaim(pgdat, gfp_mask, order);
4153 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4154
4155 if (!ret)
4156 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4157
4158 return ret;
4159}
4160#endif
4161
/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 *
 */
4174int page_evictable(struct page *page)
4175{
4176 int ret;
4177
	/* Prevent address_space of inode and swap cache from being freed */
4179 rcu_read_lock();
4180 ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
4181 rcu_read_unlock();
4182 return ret;
4183}
4184
4185#ifdef CONFIG_SHMEM
/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * appropriate zone lru list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks pages for evictability and moves them to the appropriate lru list as
 * needed.  Must be called with pages that are not on any LRU list.
 */
4195void check_move_unevictable_pages(struct page **pages, int nr_pages)
4196{
4197 struct lruvec *lruvec;
4198 struct pglist_data *pgdat = NULL;
4199 int pgscanned = 0;
4200 int pgrescued = 0;
4201 int i;
4202
4203 for (i = 0; i < nr_pages; i++) {
4204 struct page *page = pages[i];
4205 struct pglist_data *pagepgdat = page_pgdat(page);
4206
4207 pgscanned++;
4208 if (pagepgdat != pgdat) {
4209 if (pgdat)
4210 spin_unlock_irq(&pgdat->lru_lock);
4211 pgdat = pagepgdat;
4212 spin_lock_irq(&pgdat->lru_lock);
4213 }
4214 lruvec = mem_cgroup_page_lruvec(page, pgdat);
4215
4216 if (!PageLRU(page) || !PageUnevictable(page))
4217 continue;
4218
4219 if (page_evictable(page)) {
4220 enum lru_list lru = page_lru_base_type(page);
4221
4222 VM_BUG_ON_PAGE(PageActive(page), page);
4223 ClearPageUnevictable(page);
4224 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
4225 add_page_to_lru_list(page, lruvec, lru);
4226 pgrescued++;
4227 }
4228 }
4229
4230 if (pgdat) {
4231 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4232 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4233 spin_unlock_irq(&pgdat->lru_lock);
4234 }
4235}
4236#endif
4237