/*
 *  linux/mm/vmscan.c
 *
 *  Page reclaim: LRU list scanning, shrinker invocation and the
 *  direct-reclaim and kswapd paths.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>

#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and is the primary
	 * target of this reclaim invocation (NULL for global reclaim).
	 */
	struct mem_cgroup *target_mem_cgroup;

	/* Can pages be written back to disk as part of this reclaim? */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low
	 * unless we threaten to OOM. If any cgroup was skipped because
	 * of memory.low and nothing was reclaimed, go back for it.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* Per-round writeback/dirty accounting used for throttling decisions */
	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;
};
130
131#ifdef ARCH_HAS_PREFETCH
132#define prefetch_prev_lru_page(_page, _base, _field) \
133 do { \
134 if ((_page)->lru.prev != _base) { \
135 struct page *prev; \
136 \
137 prev = lru_to_page(&(_page->lru)); \
138 prefetch(&prev->_field); \
139 } \
140 } while (0)
141#else
142#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
143#endif
144
145#ifdef ARCH_HAS_PREFETCHW
146#define prefetchw_prev_lru_page(_page, _base, _field) \
147 do { \
148 if ((_page)->lru.prev != _base) { \
149 struct page *prev; \
150 \
151 prev = lru_to_page(&(_page->lru)); \
152 prefetchw(&prev->_field); \
153 } \
154 } while (0)
155#else
156#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
157#endif
158
/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
/*
 * The total number of pages which are beyond the high watermark within all
 * zones.
 */
unsigned long vm_total_pages;
168
169static LIST_HEAD(shrinker_list);
170static DECLARE_RWSEM(shrinker_rwsem);
171
#ifdef CONFIG_MEMCG_KMEM

/*
 * Memcg-aware shrinkers are assigned an id in shrinker_idr so that the
 * per-memcg shrinker bitmaps can refer to them.  A slot holds the magic
 * value SHRINKER_REGISTERING while a shrinker is still being set up, so
 * that concurrent walkers neither call into nor clear a half-initialized
 * entry.
 */
#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)

static DEFINE_IDR(shrinker_idr);
static int shrinker_nr_max;
189
190static int prealloc_memcg_shrinker(struct shrinker *shrinker)
191{
192 int id, ret = -ENOMEM;
193
194 down_write(&shrinker_rwsem);
195
196 id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
197 if (id < 0)
198 goto unlock;
199
200 if (id >= shrinker_nr_max) {
201 if (memcg_expand_shrinker_maps(id)) {
202 idr_remove(&shrinker_idr, id);
203 goto unlock;
204 }
205
206 shrinker_nr_max = id + 1;
207 }
208 shrinker->id = id;
209 ret = 0;
210unlock:
211 up_write(&shrinker_rwsem);
212 return ret;
213}
214
215static void unregister_memcg_shrinker(struct shrinker *shrinker)
216{
217 int id = shrinker->id;
218
219 BUG_ON(id < 0);
220
221 down_write(&shrinker_rwsem);
222 idr_remove(&shrinker_idr, id);
223 up_write(&shrinker_rwsem);
224}
225#else
226static int prealloc_memcg_shrinker(struct shrinker *shrinker)
227{
228 return 0;
229}
230
231static void unregister_memcg_shrinker(struct shrinker *shrinker)
232{
233}
234#endif
235
236#ifdef CONFIG_MEMCG
237static bool global_reclaim(struct scan_control *sc)
238{
239 return !sc->target_mem_cgroup;
240}
241
/**
 * sane_reclaim - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
static bool sane_reclaim(struct scan_control *sc)
{
	struct mem_cgroup *memcg = sc->target_mem_cgroup;

	if (!memcg)
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
267
268static void set_memcg_congestion(pg_data_t *pgdat,
269 struct mem_cgroup *memcg,
270 bool congested)
271{
272 struct mem_cgroup_per_node *mn;
273
274 if (!memcg)
275 return;
276
277 mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
278 WRITE_ONCE(mn->congested, congested);
279}
280
281static bool memcg_congested(pg_data_t *pgdat,
282 struct mem_cgroup *memcg)
283{
284 struct mem_cgroup_per_node *mn;
285
286 mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
287 return READ_ONCE(mn->congested);
288
289}
290#else
291static bool global_reclaim(struct scan_control *sc)
292{
293 return true;
294}
295
296static bool sane_reclaim(struct scan_control *sc)
297{
298 return true;
299}
300
301static inline void set_memcg_congestion(struct pglist_data *pgdat,
302 struct mem_cgroup *memcg, bool congested)
303{
304}
305
306static inline bool memcg_congested(struct pglist_data *pgdat,
307 struct mem_cgroup *memcg)
308{
309 return false;
310
311}
312#endif
313
/*
 * This misses isolated pages which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
320{
321 unsigned long nr;
322
323 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
324 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
325 if (get_nr_swap_pages() > 0)
326 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
327 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
328
329 return nr;
330}
331
/**
 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 */
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
339{
340 unsigned long lru_size;
341 int zid;
342
343 if (!mem_cgroup_disabled())
344 lru_size = mem_cgroup_get_lru_size(lruvec, lru);
345 else
346 lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
347
348 for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
349 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
350 unsigned long size;
351
352 if (!managed_zone(zone))
353 continue;
354
355 if (!mem_cgroup_disabled())
356 size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
357 else
358 size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
359 NR_ZONE_LRU_BASE + lru);
360 lru_size -= min(size, lru_size);
361 }
362
363 return lru_size;
364
365}
366
367
368
369
370int prealloc_shrinker(struct shrinker *shrinker)
371{
372 size_t size = sizeof(*shrinker->nr_deferred);
373
374 if (shrinker->flags & SHRINKER_NUMA_AWARE)
375 size *= nr_node_ids;
376
377 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
378 if (!shrinker->nr_deferred)
379 return -ENOMEM;
380
381 if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
382 if (prealloc_memcg_shrinker(shrinker))
383 goto free_deferred;
384 }
385
386 return 0;
387
388free_deferred:
389 kfree(shrinker->nr_deferred);
390 shrinker->nr_deferred = NULL;
391 return -ENOMEM;
392}
393
394void free_prealloced_shrinker(struct shrinker *shrinker)
395{
396 if (!shrinker->nr_deferred)
397 return;
398
399 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
400 unregister_memcg_shrinker(shrinker);
401
402 kfree(shrinker->nr_deferred);
403 shrinker->nr_deferred = NULL;
404}
405
406void register_shrinker_prepared(struct shrinker *shrinker)
407{
408 down_write(&shrinker_rwsem);
409 list_add_tail(&shrinker->list, &shrinker_list);
410#ifdef CONFIG_MEMCG_KMEM
411 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
412 idr_replace(&shrinker_idr, shrinker, shrinker->id);
413#endif
414 up_write(&shrinker_rwsem);
415}
416
417int register_shrinker(struct shrinker *shrinker)
418{
419 int err = prealloc_shrinker(shrinker);
420
421 if (err)
422 return err;
423 register_shrinker_prepared(shrinker);
424 return 0;
425}
426EXPORT_SYMBOL(register_shrinker);
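
/*
 * Illustrative sketch (not part of this file): how a cache owner is
 * expected to use the registration API above.  The my_count()/my_scan()
 * callbacks and the my_cache_* helpers are hypothetical; only
 * register_shrinker(), unregister_shrinker() and the struct shrinker
 * fields shown are real.
 *
 *	static unsigned long my_count(struct shrinker *s,
 *				      struct shrink_control *sc)
 *	{
 *		return my_cache_nr_freeable();	// 0 when nothing to do
 *	}
 *
 *	static unsigned long my_scan(struct shrinker *s,
 *				     struct shrink_control *sc)
 *	{
 *		return my_cache_free(sc->nr_to_scan); // objects actually freed
 *	}
 *
 *	static struct shrinker my_shrinker = {
 *		.count_objects	= my_count,
 *		.scan_objects	= my_scan,
 *		.seeks		= DEFAULT_SEEKS,
 *	};
 *
 *	err = register_shrinker(&my_shrinker);	// at init
 *	...
 *	unregister_shrinker(&my_shrinker);	// at teardown
 */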
427
428
429
430
431void unregister_shrinker(struct shrinker *shrinker)
432{
433 if (!shrinker->nr_deferred)
434 return;
435 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
436 unregister_memcg_shrinker(shrinker);
437 down_write(&shrinker_rwsem);
438 list_del(&shrinker->list);
439 up_write(&shrinker_rwsem);
440 kfree(shrinker->nr_deferred);
441 shrinker->nr_deferred = NULL;
442}
443EXPORT_SYMBOL(unregister_shrinker);
444
445#define SHRINK_BATCH 128
446
447static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
448 struct shrinker *shrinker, int priority)
449{
450 unsigned long freed = 0;
451 unsigned long long delta;
452 long total_scan;
453 long freeable;
454 long nr;
455 long new_nr;
456 int nid = shrinkctl->nid;
457 long batch_size = shrinker->batch ? shrinker->batch
458 : SHRINK_BATCH;
459 long scanned = 0, next_deferred;
460
461 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
462 nid = 0;
463
464 freeable = shrinker->count_objects(shrinker, shrinkctl);
465 if (freeable == 0 || freeable == SHRINK_EMPTY)
466 return freeable;
467
	/*
	 * Copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);

	total_scan = nr;
	delta = freeable >> priority;
	delta *= 4;
	do_div(delta, shrinker->seeks);

	/*
	 * Make sure we apply some minimal pressure even on small caches
	 * at default priority: stale objects can pin a dying cgroup and
	 * all of its per-cpu and kmem structures, so never let the delta
	 * round down to nothing.
	 */
	delta = max_t(unsigned long long, delta, min(freeable, batch_size));
489
490 total_scan += delta;
491 if (total_scan < 0) {
492 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
493 shrinker->scan_objects, total_scan);
494 total_scan = freeable;
495 next_deferred = nr;
496 } else
497 next_deferred = total_scan;
498
499
500
501
502
503
504
505
506
507
508
509
510
511 if (delta < freeable / 4)
512 total_scan = min(total_scan, freeable / 2);
513
514
515
516
517
518
519 if (total_scan > freeable * 2)
520 total_scan = freeable * 2;
521
522 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
523 freeable, delta, total_scan, priority);
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540 while (total_scan >= batch_size ||
541 total_scan >= freeable) {
542 unsigned long ret;
543 unsigned long nr_to_scan = min(batch_size, total_scan);
544
545 shrinkctl->nr_to_scan = nr_to_scan;
546 shrinkctl->nr_scanned = nr_to_scan;
547 ret = shrinker->scan_objects(shrinker, shrinkctl);
548 if (ret == SHRINK_STOP)
549 break;
550 freed += ret;
551
552 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
553 total_scan -= shrinkctl->nr_scanned;
554 scanned += shrinkctl->nr_scanned;
555
556 cond_resched();
557 }
558
559 if (next_deferred >= scanned)
560 next_deferred -= scanned;
561 else
562 next_deferred = 0;
563
564
565
566
567
568 if (next_deferred > 0)
569 new_nr = atomic_long_add_return(next_deferred,
570 &shrinker->nr_deferred[nid]);
571 else
572 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
573
574 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
575 return freed;
576}
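
/*
 * Worked example of the pressure arithmetic above (illustrative numbers):
 * with freeable = 10000 objects, priority = DEF_PRIORITY (12) and
 * seeks = DEFAULT_SEEKS (2),
 *
 *	delta = (10000 >> 12) * 4 / 2 = 2 * 4 / 2 = 4
 *
 * but the max_t()/min() clamp raises that to min(freeable, batch_size)
 * = 128, so even small caches see some pressure.  At priority 0 the same
 * cache would get delta = 10000 * 4 / 2 = 20000, later capped to
 * freeable * 2.
 */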
577
578#ifdef CONFIG_MEMCG_KMEM
579static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
580 struct mem_cgroup *memcg, int priority)
581{
582 struct memcg_shrinker_map *map;
583 unsigned long ret, freed = 0;
584 int i;
585
586 if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
587 return 0;
588
589 if (!down_read_trylock(&shrinker_rwsem))
590 return 0;
591
592 map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
593 true);
594 if (unlikely(!map))
595 goto unlock;
596
597 for_each_set_bit(i, map->map, shrinker_nr_max) {
598 struct shrink_control sc = {
599 .gfp_mask = gfp_mask,
600 .nid = nid,
601 .memcg = memcg,
602 };
603 struct shrinker *shrinker;
604
605 shrinker = idr_find(&shrinker_idr, i);
606 if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
607 if (!shrinker)
608 clear_bit(i, map->map);
609 continue;
610 }
611
612 ret = do_shrink_slab(&sc, shrinker, priority);
613 if (ret == SHRINK_EMPTY) {
614 clear_bit(i, map->map);
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630 smp_mb__after_atomic();
631 ret = do_shrink_slab(&sc, shrinker, priority);
632 if (ret == SHRINK_EMPTY)
633 ret = 0;
634 else
635 memcg_set_shrinker_bit(memcg, nid, i);
636 }
637 freed += ret;
638
639 if (rwsem_is_contended(&shrinker_rwsem)) {
640 freed = freed ? : 1;
641 break;
642 }
643 }
644unlock:
645 up_read(&shrinker_rwsem);
646 return freed;
647}
648#else
649static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
650 struct mem_cgroup *memcg, int priority)
651{
652 return 0;
653}
654#endif
655
/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by
 * priority in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
				 struct mem_cgroup *memcg,
				 int priority)
{
680 unsigned long ret, freed = 0;
681 struct shrinker *shrinker;
682
683 if (!mem_cgroup_is_root(memcg))
684 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
685
686 if (!down_read_trylock(&shrinker_rwsem))
687 goto out;
688
689 list_for_each_entry(shrinker, &shrinker_list, list) {
690 struct shrink_control sc = {
691 .gfp_mask = gfp_mask,
692 .nid = nid,
693 .memcg = memcg,
694 };
695
696 ret = do_shrink_slab(&sc, shrinker, priority);
697 if (ret == SHRINK_EMPTY)
698 ret = 0;
699 freed += ret;
700
701
702
703
704
705 if (rwsem_is_contended(&shrinker_rwsem)) {
706 freed = freed ? : 1;
707 break;
708 }
709 }
710
711 up_read(&shrinker_rwsem);
712out:
713 cond_resched();
714 return freed;
715}
716
717void drop_slab_node(int nid)
718{
719 unsigned long freed;
720
721 do {
722 struct mem_cgroup *memcg = NULL;
723
724 freed = 0;
725 memcg = mem_cgroup_iter(NULL, NULL, NULL);
726 do {
727 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
728 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
729 } while (freed > 10);
730}
731
732void drop_slab(void)
733{
734 int nid;
735
736 for_each_online_node(nid)
737 drop_slab_node(nid);
738}
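
/*
 * Note: drop_slab()/drop_slab_node() back the "echo 2/3 >
 * /proc/sys/vm/drop_caches" interface; each node is swept repeatedly,
 * walking every memcg, until a full pass frees no more than a handful
 * (10) of objects.
 */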
739
740static inline int is_page_cache_freeable(struct page *page)
741{
742
743
744
745
746
747 int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
748 HPAGE_PMD_NR : 1;
749 return page_count(page) - page_has_private(page) == 1 + radix_pins;
750}
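
/*
 * Reference-count sketch for the check above (plain 4K page cache page):
 * one reference is held by the page cache radix tree and one by the
 * isolating caller, so a freeable page has page_count() == 2 with no
 * private buffers; buffer heads add one more, accounted via
 * page_has_private().  A THP in swap cache holds HPAGE_PMD_NR radix tree
 * references instead of one, which is what radix_pins accounts for.
 */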
751
752static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
753{
754 if (current->flags & PF_SWAPWRITE)
755 return 1;
756 if (!inode_write_congested(inode))
757 return 1;
758 if (inode_to_bdi(inode) == current->backing_dev_info)
759 return 1;
760 return 0;
761}
762
763
764
765
766
767
768
769
770
771
772
773
774
775static void handle_write_error(struct address_space *mapping,
776 struct page *page, int error)
777{
778 lock_page(page);
779 if (page_mapping(page) == mapping)
780 mapping_set_error(mapping, error);
781 unlock_page(page);
782}
783
/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
795
796
797
798
799
800static pageout_t pageout(struct page *page, struct address_space *mapping,
801 struct scan_control *sc)
802{
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819 if (!is_page_cache_freeable(page))
820 return PAGE_KEEP;
821 if (!mapping) {
822
823
824
825
826 if (page_has_private(page)) {
827 if (try_to_free_buffers(page)) {
828 ClearPageDirty(page);
829 pr_info("%s: orphaned page\n", __func__);
830 return PAGE_CLEAN;
831 }
832 }
833 return PAGE_KEEP;
834 }
835 if (mapping->a_ops->writepage == NULL)
836 return PAGE_ACTIVATE;
837 if (!may_write_to_inode(mapping->host, sc))
838 return PAGE_KEEP;
839
840 if (clear_page_dirty_for_io(page)) {
841 int res;
842 struct writeback_control wbc = {
843 .sync_mode = WB_SYNC_NONE,
844 .nr_to_write = SWAP_CLUSTER_MAX,
845 .range_start = 0,
846 .range_end = LLONG_MAX,
847 .for_reclaim = 1,
848 };
849
850 SetPageReclaim(page);
851 res = mapping->a_ops->writepage(page, &wbc);
852 if (res < 0)
853 handle_write_error(mapping, page, res);
854 if (res == AOP_WRITEPAGE_ACTIVATE) {
855 ClearPageReclaim(page);
856 return PAGE_ACTIVATE;
857 }
858
859 if (!PageWriteback(page)) {
860
861 ClearPageReclaim(page);
862 }
863 trace_mm_vmscan_writepage(page);
864 inc_node_page_state(page, NR_VMSCAN_WRITE);
865 return PAGE_SUCCESS;
866 }
867
868 return PAGE_CLEAN;
869}
870
871
872
873
874
875static int __remove_mapping(struct address_space *mapping, struct page *page,
876 bool reclaimed)
877{
878 unsigned long flags;
879 int refcount;
880
881 BUG_ON(!PageLocked(page));
882 BUG_ON(mapping != page_mapping(page));
883
884 xa_lock_irqsave(&mapping->i_pages, flags);
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910 if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
911 refcount = 1 + HPAGE_PMD_NR;
912 else
913 refcount = 2;
914 if (!page_ref_freeze(page, refcount))
915 goto cannot_free;
916
917 if (unlikely(PageDirty(page))) {
918 page_ref_unfreeze(page, refcount);
919 goto cannot_free;
920 }
921
922 if (PageSwapCache(page)) {
923 swp_entry_t swap = { .val = page_private(page) };
924 mem_cgroup_swapout(page, swap);
925 __delete_from_swap_cache(page);
926 xa_unlock_irqrestore(&mapping->i_pages, flags);
927 put_swap_page(page, swap);
928 } else {
929 void (*freepage)(struct page *);
930 void *shadow = NULL;
931
932 freepage = mapping->a_ops->freepage;
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949 if (reclaimed && page_is_file_cache(page) &&
950 !mapping_exiting(mapping) && !dax_mapping(mapping))
951 shadow = workingset_eviction(mapping, page);
952 __delete_from_page_cache(page, shadow);
953 xa_unlock_irqrestore(&mapping->i_pages, flags);
954
955 if (freepage != NULL)
956 freepage(page);
957 }
958
959 return 1;
960
961cannot_free:
962 xa_unlock_irqrestore(&mapping->i_pages, flags);
963 return 0;
964}
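
/*
 * Worked example for the refcount freeze above (regular page cache page):
 * the page cache holds one reference and the isolating caller holds one,
 * so page_ref_freeze(page, 2) only succeeds when nobody else has a
 * transient reference (e.g. a concurrent find_get_page()).  A THP in swap
 * cache is pinned HPAGE_PMD_NR times by the swap cache plus once by the
 * caller, hence the 1 + HPAGE_PMD_NR refcount.
 */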
965
966
967
968
969
970
971
972int remove_mapping(struct address_space *mapping, struct page *page)
973{
974 if (__remove_mapping(mapping, page, false)) {
975
976
977
978
979
980 page_ref_unfreeze(page, 1);
981 return 1;
982 }
983 return 0;
984}
985
986
987
988
989
990
991
992
993
994
995void putback_lru_page(struct page *page)
996{
997 lru_cache_add(page);
998 put_page(page);
999}
1000
1001enum page_references {
1002 PAGEREF_RECLAIM,
1003 PAGEREF_RECLAIM_CLEAN,
1004 PAGEREF_KEEP,
1005 PAGEREF_ACTIVATE,
1006};
1007
1008static enum page_references page_check_references(struct page *page,
1009 struct scan_control *sc)
1010{
1011 int referenced_ptes, referenced_page;
1012 unsigned long vm_flags;
1013
1014 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
1015 &vm_flags);
1016 referenced_page = TestClearPageReferenced(page);
1017
1018
1019
1020
1021
1022 if (vm_flags & VM_LOCKED)
1023 return PAGEREF_RECLAIM;
1024
1025 if (referenced_ptes) {
1026 if (PageSwapBacked(page))
1027 return PAGEREF_ACTIVATE;
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042 SetPageReferenced(page);
1043
1044 if (referenced_page || referenced_ptes > 1)
1045 return PAGEREF_ACTIVATE;
1046
1047
1048
1049
1050 if (vm_flags & VM_EXEC)
1051 return PAGEREF_ACTIVATE;
1052
1053 return PAGEREF_KEEP;
1054 }
1055
1056
1057 if (referenced_page && !PageSwapBacked(page))
1058 return PAGEREF_RECLAIM_CLEAN;
1059
1060 return PAGEREF_RECLAIM;
1061}
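
/*
 * Informal summary of the policy above: a page with no pte references is
 * reclaimed (immediately if it also lacks PG_referenced, and without
 * writeout if it is a clean, referenced file page); any pte-referenced
 * anon page is activated; a pte-referenced file page gets one "second
 * chance" round on the inactive list (KEEP) and is activated if it is
 * referenced again, referenced through multiple ptes, or is executable.
 */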
1062
1063
1064static void page_check_dirty_writeback(struct page *page,
1065 bool *dirty, bool *writeback)
1066{
1067 struct address_space *mapping;
1068
1069
1070
1071
1072
1073 if (!page_is_file_cache(page) ||
1074 (PageAnon(page) && !PageSwapBacked(page))) {
1075 *dirty = false;
1076 *writeback = false;
1077 return;
1078 }
1079
1080
1081 *dirty = PageDirty(page);
1082 *writeback = PageWriteback(page);
1083
1084
1085 if (!page_has_private(page))
1086 return;
1087
1088 mapping = page_mapping(page);
1089 if (mapping && mapping->a_ops->is_dirty_writeback)
1090 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1091}
1092
1093
1094
1095
1096static unsigned long shrink_page_list(struct list_head *page_list,
1097 struct pglist_data *pgdat,
1098 struct scan_control *sc,
1099 enum ttu_flags ttu_flags,
1100 struct reclaim_stat *stat,
1101 bool force_reclaim)
1102{
1103 LIST_HEAD(ret_pages);
1104 LIST_HEAD(free_pages);
1105 int pgactivate = 0;
1106 unsigned nr_unqueued_dirty = 0;
1107 unsigned nr_dirty = 0;
1108 unsigned nr_congested = 0;
1109 unsigned nr_reclaimed = 0;
1110 unsigned nr_writeback = 0;
1111 unsigned nr_immediate = 0;
1112 unsigned nr_ref_keep = 0;
1113 unsigned nr_unmap_fail = 0;
1114
1115 cond_resched();
1116
1117 while (!list_empty(page_list)) {
1118 struct address_space *mapping;
1119 struct page *page;
1120 int may_enter_fs;
1121 enum page_references references = PAGEREF_RECLAIM_CLEAN;
1122 bool dirty, writeback;
1123
1124 cond_resched();
1125
1126 page = lru_to_page(page_list);
1127 list_del(&page->lru);
1128
1129 if (!trylock_page(page))
1130 goto keep;
1131
1132 VM_BUG_ON_PAGE(PageActive(page), page);
1133
1134 sc->nr_scanned++;
1135
1136 if (unlikely(!page_evictable(page)))
1137 goto activate_locked;
1138
1139 if (!sc->may_unmap && page_mapped(page))
1140 goto keep_locked;
1141
1142
1143 if ((page_mapped(page) || PageSwapCache(page)) &&
1144 !(PageAnon(page) && !PageSwapBacked(page)))
1145 sc->nr_scanned++;
1146
1147 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1148 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1149
1150
1151
1152
1153
1154
1155
1156 page_check_dirty_writeback(page, &dirty, &writeback);
1157 if (dirty || writeback)
1158 nr_dirty++;
1159
1160 if (dirty && !writeback)
1161 nr_unqueued_dirty++;
1162
1163
1164
1165
1166
1167
1168
1169 mapping = page_mapping(page);
1170 if (((dirty || writeback) && mapping &&
1171 inode_write_congested(mapping->host)) ||
1172 (writeback && PageReclaim(page)))
1173 nr_congested++;
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217 if (PageWriteback(page)) {
1218
1219 if (current_is_kswapd() &&
1220 PageReclaim(page) &&
1221 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1222 nr_immediate++;
1223 goto activate_locked;
1224
1225
1226 } else if (sane_reclaim(sc) ||
1227 !PageReclaim(page) || !may_enter_fs) {
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239 SetPageReclaim(page);
1240 nr_writeback++;
1241 goto activate_locked;
1242
1243
1244 } else {
1245 unlock_page(page);
1246 wait_on_page_writeback(page);
1247
1248 list_add_tail(&page->lru, page_list);
1249 continue;
1250 }
1251 }
1252
1253 if (!force_reclaim)
1254 references = page_check_references(page, sc);
1255
1256 switch (references) {
1257 case PAGEREF_ACTIVATE:
1258 goto activate_locked;
1259 case PAGEREF_KEEP:
1260 nr_ref_keep++;
1261 goto keep_locked;
1262 case PAGEREF_RECLAIM:
1263 case PAGEREF_RECLAIM_CLEAN:
1264 ;
1265 }
1266
1267
1268
1269
1270
1271
1272 if (PageAnon(page) && PageSwapBacked(page)) {
1273 if (!PageSwapCache(page)) {
1274 if (!(sc->gfp_mask & __GFP_IO))
1275 goto keep_locked;
1276 if (PageTransHuge(page)) {
1277
1278 if (!can_split_huge_page(page, NULL))
1279 goto activate_locked;
1280
1281
1282
1283
1284
1285 if (!compound_mapcount(page) &&
1286 split_huge_page_to_list(page,
1287 page_list))
1288 goto activate_locked;
1289 }
1290 if (!add_to_swap(page)) {
1291 if (!PageTransHuge(page))
1292 goto activate_locked;
1293
1294 if (split_huge_page_to_list(page,
1295 page_list))
1296 goto activate_locked;
1297#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1298 count_vm_event(THP_SWPOUT_FALLBACK);
1299#endif
1300 if (!add_to_swap(page))
1301 goto activate_locked;
1302 }
1303
1304 may_enter_fs = 1;
1305
1306
1307 mapping = page_mapping(page);
1308 }
1309 } else if (unlikely(PageTransHuge(page))) {
1310
1311 if (split_huge_page_to_list(page, page_list))
1312 goto keep_locked;
1313 }
1314
1315
1316
1317
1318
1319 if (page_mapped(page)) {
1320 enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1321
1322 if (unlikely(PageTransHuge(page)))
1323 flags |= TTU_SPLIT_HUGE_PMD;
1324 if (!try_to_unmap(page, flags)) {
1325 nr_unmap_fail++;
1326 goto activate_locked;
1327 }
1328 }
1329
1330 if (PageDirty(page)) {
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341 if (page_is_file_cache(page) &&
1342 (!current_is_kswapd() || !PageReclaim(page) ||
1343 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1344
1345
1346
1347
1348
1349
1350 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1351 SetPageReclaim(page);
1352
1353 goto activate_locked;
1354 }
1355
1356 if (references == PAGEREF_RECLAIM_CLEAN)
1357 goto keep_locked;
1358 if (!may_enter_fs)
1359 goto keep_locked;
1360 if (!sc->may_writepage)
1361 goto keep_locked;
1362
1363
1364
1365
1366
1367
1368 try_to_unmap_flush_dirty();
1369 switch (pageout(page, mapping, sc)) {
1370 case PAGE_KEEP:
1371 goto keep_locked;
1372 case PAGE_ACTIVATE:
1373 goto activate_locked;
1374 case PAGE_SUCCESS:
1375 if (PageWriteback(page))
1376 goto keep;
1377 if (PageDirty(page))
1378 goto keep;
1379
1380
1381
1382
1383
1384 if (!trylock_page(page))
1385 goto keep;
1386 if (PageDirty(page) || PageWriteback(page))
1387 goto keep_locked;
1388 mapping = page_mapping(page);
1389 case PAGE_CLEAN:
1390 ;
1391 }
1392 }
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415 if (page_has_private(page)) {
1416 if (!try_to_release_page(page, sc->gfp_mask))
1417 goto activate_locked;
1418 if (!mapping && page_count(page) == 1) {
1419 unlock_page(page);
1420 if (put_page_testzero(page))
1421 goto free_it;
1422 else {
1423
1424
1425
1426
1427
1428
1429
1430 nr_reclaimed++;
1431 continue;
1432 }
1433 }
1434 }
1435
1436 if (PageAnon(page) && !PageSwapBacked(page)) {
1437
1438 if (!page_ref_freeze(page, 1))
1439 goto keep_locked;
1440 if (PageDirty(page)) {
1441 page_ref_unfreeze(page, 1);
1442 goto keep_locked;
1443 }
1444
1445 count_vm_event(PGLAZYFREED);
1446 count_memcg_page_event(page, PGLAZYFREED);
1447 } else if (!mapping || !__remove_mapping(mapping, page, true))
1448 goto keep_locked;
1449
1450
1451
1452
1453
1454
1455
1456 __ClearPageLocked(page);
1457free_it:
1458 nr_reclaimed++;
1459
1460
1461
1462
1463
1464 if (unlikely(PageTransHuge(page))) {
1465 mem_cgroup_uncharge(page);
1466 (*get_compound_page_dtor(page))(page);
1467 } else
1468 list_add(&page->lru, &free_pages);
1469 continue;
1470
1471activate_locked:
1472
1473 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1474 PageMlocked(page)))
1475 try_to_free_swap(page);
1476 VM_BUG_ON_PAGE(PageActive(page), page);
1477 if (!PageMlocked(page)) {
1478 SetPageActive(page);
1479 pgactivate++;
1480 count_memcg_page_event(page, PGACTIVATE);
1481 }
1482keep_locked:
1483 unlock_page(page);
1484keep:
1485 list_add(&page->lru, &ret_pages);
1486 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1487 }
1488
1489 mem_cgroup_uncharge_list(&free_pages);
1490 try_to_unmap_flush();
1491 free_unref_page_list(&free_pages);
1492
1493 list_splice(&ret_pages, page_list);
1494 count_vm_events(PGACTIVATE, pgactivate);
1495
1496 if (stat) {
1497 stat->nr_dirty = nr_dirty;
1498 stat->nr_congested = nr_congested;
1499 stat->nr_unqueued_dirty = nr_unqueued_dirty;
1500 stat->nr_writeback = nr_writeback;
1501 stat->nr_immediate = nr_immediate;
1502 stat->nr_activate = pgactivate;
1503 stat->nr_ref_keep = nr_ref_keep;
1504 stat->nr_unmap_fail = nr_unmap_fail;
1505 }
1506 return nr_reclaimed;
1507}
1508
1509unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1510 struct list_head *page_list)
1511{
1512 struct scan_control sc = {
1513 .gfp_mask = GFP_KERNEL,
1514 .priority = DEF_PRIORITY,
1515 .may_unmap = 1,
1516 };
1517 unsigned long ret;
1518 struct page *page, *next;
1519 LIST_HEAD(clean_pages);
1520
1521 list_for_each_entry_safe(page, next, page_list, lru) {
1522 if (page_is_file_cache(page) && !PageDirty(page) &&
1523 !__PageMovable(page)) {
1524 ClearPageActive(page);
1525 list_move(&page->lru, &clean_pages);
1526 }
1527 }
1528
1529 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1530 TTU_IGNORE_ACCESS, NULL, true);
1531 list_splice(&clean_pages, page_list);
1532 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
1533 return ret;
1534}
1535
/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1547{
1548 int ret = -EINVAL;
1549
1550
1551 if (!PageLRU(page))
1552 return ret;
1553
1554
1555 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1556 return ret;
1557
1558 ret = -EBUSY;
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568 if (mode & ISOLATE_ASYNC_MIGRATE) {
1569
1570 if (PageWriteback(page))
1571 return ret;
1572
1573 if (PageDirty(page)) {
1574 struct address_space *mapping;
1575 bool migrate_dirty;
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586 if (!trylock_page(page))
1587 return ret;
1588
1589 mapping = page_mapping(page);
1590 migrate_dirty = !mapping || mapping->a_ops->migratepage;
1591 unlock_page(page);
1592 if (!migrate_dirty)
1593 return ret;
1594 }
1595 }
1596
1597 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1598 return ret;
1599
1600 if (likely(get_page_unless_zero(page))) {
1601
1602
1603
1604
1605
1606 ClearPageLRU(page);
1607 ret = 0;
1608 }
1609
1610 return ret;
1611}
1612
1613
1614
1615
1616
1617
1618static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1619 enum lru_list lru, unsigned long *nr_zone_taken)
1620{
1621 int zid;
1622
1623 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1624 if (!nr_zone_taken[zid])
1625 continue;
1626
1627 __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1628#ifdef CONFIG_MEMCG
1629 mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1630#endif
1631 }
1632
1633}
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1656 struct lruvec *lruvec, struct list_head *dst,
1657 unsigned long *nr_scanned, struct scan_control *sc,
1658 isolate_mode_t mode, enum lru_list lru)
1659{
1660 struct list_head *src = &lruvec->lists[lru];
1661 unsigned long nr_taken = 0;
1662 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1663 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1664 unsigned long skipped = 0;
1665 unsigned long scan, total_scan, nr_pages;
1666 LIST_HEAD(pages_skipped);
1667
1668 scan = 0;
1669 for (total_scan = 0;
1670 scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
1671 total_scan++) {
1672 struct page *page;
1673
1674 page = lru_to_page(src);
1675 prefetchw_prev_lru_page(page, src, flags);
1676
1677 VM_BUG_ON_PAGE(!PageLRU(page), page);
1678
1679 if (page_zonenum(page) > sc->reclaim_idx) {
1680 list_move(&page->lru, &pages_skipped);
1681 nr_skipped[page_zonenum(page)]++;
1682 continue;
1683 }
1684
1685
1686
1687
1688
1689
1690
1691 scan++;
1692 switch (__isolate_lru_page(page, mode)) {
1693 case 0:
1694 nr_pages = hpage_nr_pages(page);
1695 nr_taken += nr_pages;
1696 nr_zone_taken[page_zonenum(page)] += nr_pages;
1697 list_move(&page->lru, dst);
1698 break;
1699
1700 case -EBUSY:
1701
1702 list_move(&page->lru, src);
1703 continue;
1704
1705 default:
1706 BUG();
1707 }
1708 }
1709
1710
1711
1712
1713
1714
1715
1716
1717 if (!list_empty(&pages_skipped)) {
1718 int zid;
1719
1720 list_splice(&pages_skipped, src);
1721 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1722 if (!nr_skipped[zid])
1723 continue;
1724
1725 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1726 skipped += nr_skipped[zid];
1727 }
1728 }
1729 *nr_scanned = total_scan;
1730 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1731 total_scan, skipped, nr_taken, mode, lru);
1732 update_lru_sizes(lruvec, lru, nr_zone_taken);
1733 return nr_taken;
1734}
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762int isolate_lru_page(struct page *page)
1763{
1764 int ret = -EBUSY;
1765
1766 VM_BUG_ON_PAGE(!page_count(page), page);
1767 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1768
1769 if (PageLRU(page)) {
1770 struct zone *zone = page_zone(page);
1771 struct lruvec *lruvec;
1772
1773 spin_lock_irq(zone_lru_lock(zone));
1774 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
1775 if (PageLRU(page)) {
1776 int lru = page_lru(page);
1777 get_page(page);
1778 ClearPageLRU(page);
1779 del_page_from_lru_list(page, lruvec, lru);
1780 ret = 0;
1781 }
1782 spin_unlock_irq(zone_lru_lock(zone));
1783 }
1784 return ret;
1785}
1786
1787
1788
1789
1790
1791
1792
1793
1794static int too_many_isolated(struct pglist_data *pgdat, int file,
1795 struct scan_control *sc)
1796{
1797 unsigned long inactive, isolated;
1798
1799 if (current_is_kswapd())
1800 return 0;
1801
1802 if (!sane_reclaim(sc))
1803 return 0;
1804
1805 if (file) {
1806 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1807 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1808 } else {
1809 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1810 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1811 }
1812
1813
1814
1815
1816
1817
1818 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1819 inactive >>= 3;
1820
1821 return isolated > inactive;
1822}
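
/*
 * Example of the throttle above (illustrative numbers): with 80000
 * inactive file pages and a GFP_KERNEL (__GFP_IO|__GFP_FS) allocation,
 * the comparison is isolated > 80000 >> 3 = 10000, i.e. direct reclaim
 * starts stalling once more than ~12% of the inactive list is sitting
 * isolated.  A GFP_NOFS caller keeps the full 80000 threshold because it
 * must not be blocked behind reclaimers that may be waiting on locks it
 * itself holds.
 */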
1823
1824static noinline_for_stack void
1825putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1826{
1827 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1828 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1829 LIST_HEAD(pages_to_free);
1830
1831
1832
1833
1834 while (!list_empty(page_list)) {
1835 struct page *page = lru_to_page(page_list);
1836 int lru;
1837
1838 VM_BUG_ON_PAGE(PageLRU(page), page);
1839 list_del(&page->lru);
1840 if (unlikely(!page_evictable(page))) {
1841 spin_unlock_irq(&pgdat->lru_lock);
1842 putback_lru_page(page);
1843 spin_lock_irq(&pgdat->lru_lock);
1844 continue;
1845 }
1846
1847 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1848
1849 SetPageLRU(page);
1850 lru = page_lru(page);
1851 add_page_to_lru_list(page, lruvec, lru);
1852
1853 if (is_active_lru(lru)) {
1854 int file = is_file_lru(lru);
1855 int numpages = hpage_nr_pages(page);
1856 reclaim_stat->recent_rotated[file] += numpages;
1857 }
1858 if (put_page_testzero(page)) {
1859 __ClearPageLRU(page);
1860 __ClearPageActive(page);
1861 del_page_from_lru_list(page, lruvec, lru);
1862
1863 if (unlikely(PageCompound(page))) {
1864 spin_unlock_irq(&pgdat->lru_lock);
1865 mem_cgroup_uncharge(page);
1866 (*get_compound_page_dtor(page))(page);
1867 spin_lock_irq(&pgdat->lru_lock);
1868 } else
1869 list_add(&page->lru, &pages_to_free);
1870 }
1871 }
1872
1873
1874
1875
1876 list_splice(&pages_to_free, page_list);
1877}
1878
1879
1880
1881
1882
1883
1884
1885static int current_may_throttle(void)
1886{
1887 return !(current->flags & PF_LESS_THROTTLE) ||
1888 current->backing_dev_info == NULL ||
1889 bdi_write_congested(current->backing_dev_info);
1890}
1891
1892
1893
1894
1895
1896static noinline_for_stack unsigned long
1897shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1898 struct scan_control *sc, enum lru_list lru)
1899{
1900 LIST_HEAD(page_list);
1901 unsigned long nr_scanned;
1902 unsigned long nr_reclaimed = 0;
1903 unsigned long nr_taken;
1904 struct reclaim_stat stat = {};
1905 isolate_mode_t isolate_mode = 0;
1906 int file = is_file_lru(lru);
1907 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1908 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1909 bool stalled = false;
1910
1911 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1912 if (stalled)
1913 return 0;
1914
1915
1916 msleep(100);
1917 stalled = true;
1918
1919
1920 if (fatal_signal_pending(current))
1921 return SWAP_CLUSTER_MAX;
1922 }
1923
1924 lru_add_drain();
1925
1926 if (!sc->may_unmap)
1927 isolate_mode |= ISOLATE_UNMAPPED;
1928
1929 spin_lock_irq(&pgdat->lru_lock);
1930
1931 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1932 &nr_scanned, sc, isolate_mode, lru);
1933
1934 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1935 reclaim_stat->recent_scanned[file] += nr_taken;
1936
1937 if (current_is_kswapd()) {
1938 if (global_reclaim(sc))
1939 __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
1940 count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
1941 nr_scanned);
1942 } else {
1943 if (global_reclaim(sc))
1944 __count_vm_events(PGSCAN_DIRECT, nr_scanned);
1945 count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
1946 nr_scanned);
1947 }
1948 spin_unlock_irq(&pgdat->lru_lock);
1949
1950 if (nr_taken == 0)
1951 return 0;
1952
1953 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
1954 &stat, false);
1955
1956 spin_lock_irq(&pgdat->lru_lock);
1957
1958 if (current_is_kswapd()) {
1959 if (global_reclaim(sc))
1960 __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
1961 count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
1962 nr_reclaimed);
1963 } else {
1964 if (global_reclaim(sc))
1965 __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
1966 count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
1967 nr_reclaimed);
1968 }
1969
1970 putback_inactive_pages(lruvec, &page_list);
1971
1972 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1973
1974 spin_unlock_irq(&pgdat->lru_lock);
1975
1976 mem_cgroup_uncharge_list(&page_list);
1977 free_unref_page_list(&page_list);
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990 if (stat.nr_unqueued_dirty == nr_taken)
1991 wakeup_flusher_threads(WB_REASON_VMSCAN);
1992
1993 sc->nr.dirty += stat.nr_dirty;
1994 sc->nr.congested += stat.nr_congested;
1995 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
1996 sc->nr.writeback += stat.nr_writeback;
1997 sc->nr.immediate += stat.nr_immediate;
1998 sc->nr.taken += nr_taken;
1999 if (file)
2000 sc->nr.file_taken += nr_taken;
2001
2002 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
2003 nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2004 return nr_reclaimed;
2005}
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
2028 struct list_head *list,
2029 struct list_head *pages_to_free,
2030 enum lru_list lru)
2031{
2032 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2033 struct page *page;
2034 int nr_pages;
2035 int nr_moved = 0;
2036
2037 while (!list_empty(list)) {
2038 page = lru_to_page(list);
2039 lruvec = mem_cgroup_page_lruvec(page, pgdat);
2040
2041 VM_BUG_ON_PAGE(PageLRU(page), page);
2042 SetPageLRU(page);
2043
2044 nr_pages = hpage_nr_pages(page);
2045 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
2046 list_move(&page->lru, &lruvec->lists[lru]);
2047
2048 if (put_page_testzero(page)) {
2049 __ClearPageLRU(page);
2050 __ClearPageActive(page);
2051 del_page_from_lru_list(page, lruvec, lru);
2052
2053 if (unlikely(PageCompound(page))) {
2054 spin_unlock_irq(&pgdat->lru_lock);
2055 mem_cgroup_uncharge(page);
2056 (*get_compound_page_dtor(page))(page);
2057 spin_lock_irq(&pgdat->lru_lock);
2058 } else
2059 list_add(&page->lru, pages_to_free);
2060 } else {
2061 nr_moved += nr_pages;
2062 }
2063 }
2064
2065 if (!is_active_lru(lru)) {
2066 __count_vm_events(PGDEACTIVATE, nr_moved);
2067 count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
2068 nr_moved);
2069 }
2070
2071 return nr_moved;
2072}
2073
2074static void shrink_active_list(unsigned long nr_to_scan,
2075 struct lruvec *lruvec,
2076 struct scan_control *sc,
2077 enum lru_list lru)
2078{
2079 unsigned long nr_taken;
2080 unsigned long nr_scanned;
2081 unsigned long vm_flags;
2082 LIST_HEAD(l_hold);
2083 LIST_HEAD(l_active);
2084 LIST_HEAD(l_inactive);
2085 struct page *page;
2086 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2087 unsigned nr_deactivate, nr_activate;
2088 unsigned nr_rotated = 0;
2089 isolate_mode_t isolate_mode = 0;
2090 int file = is_file_lru(lru);
2091 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2092
2093 lru_add_drain();
2094
2095 if (!sc->may_unmap)
2096 isolate_mode |= ISOLATE_UNMAPPED;
2097
2098 spin_lock_irq(&pgdat->lru_lock);
2099
2100 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2101 &nr_scanned, sc, isolate_mode, lru);
2102
2103 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2104 reclaim_stat->recent_scanned[file] += nr_taken;
2105
2106 __count_vm_events(PGREFILL, nr_scanned);
2107 count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2108
2109 spin_unlock_irq(&pgdat->lru_lock);
2110
2111 while (!list_empty(&l_hold)) {
2112 cond_resched();
2113 page = lru_to_page(&l_hold);
2114 list_del(&page->lru);
2115
2116 if (unlikely(!page_evictable(page))) {
2117 putback_lru_page(page);
2118 continue;
2119 }
2120
2121 if (unlikely(buffer_heads_over_limit)) {
2122 if (page_has_private(page) && trylock_page(page)) {
2123 if (page_has_private(page))
2124 try_to_release_page(page, 0);
2125 unlock_page(page);
2126 }
2127 }
2128
2129 if (page_referenced(page, 0, sc->target_mem_cgroup,
2130 &vm_flags)) {
2131 nr_rotated += hpage_nr_pages(page);
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
2142 list_add(&page->lru, &l_active);
2143 continue;
2144 }
2145 }
2146
2147 ClearPageActive(page);
2148 list_add(&page->lru, &l_inactive);
2149 }
2150
2151
2152
2153
2154 spin_lock_irq(&pgdat->lru_lock);
2155
2156
2157
2158
2159
2160
2161 reclaim_stat->recent_rotated[file] += nr_rotated;
2162
2163 nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
2164 nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
2165 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2166 spin_unlock_irq(&pgdat->lru_lock);
2167
2168 mem_cgroup_uncharge_list(&l_hold);
2169 free_unref_page_list(&l_hold);
2170 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2171 nr_deactivate, nr_rotated, sc->priority, file);
2172}
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2203 struct mem_cgroup *memcg,
2204 struct scan_control *sc, bool actual_reclaim)
2205{
2206 enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
2207 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2208 enum lru_list inactive_lru = file * LRU_FILE;
2209 unsigned long inactive, active;
2210 unsigned long inactive_ratio;
2211 unsigned long refaults;
2212 unsigned long gb;
2213
2214
2215
2216
2217
2218 if (!file && !total_swap_pages)
2219 return false;
2220
2221 inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
2222 active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
2223
2224 if (memcg)
2225 refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
2226 else
2227 refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
2228
2229
2230
2231
2232
2233
2234 if (file && actual_reclaim && lruvec->refaults != refaults) {
2235 inactive_ratio = 0;
2236 } else {
2237 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2238 if (gb)
2239 inactive_ratio = int_sqrt(10 * gb);
2240 else
2241 inactive_ratio = 1;
2242 }
2243
2244 if (actual_reclaim)
2245 trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
2246 lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
2247 lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
2248 inactive_ratio, file);
2249
2250 return inactive * inactive_ratio < active;
2251}
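
/*
 * Worked examples for the ratio above (illustrative sizes): with
 * inactive + active = 1GB of pages, gb = 1 and the target ratio is
 * int_sqrt(10) = 3, i.e. the inactive list is considered low once it
 * drops below a third of the active list; at 100GB the ratio is
 * int_sqrt(1000) = 31, and below 1GB the ratio is simply 1 (equal lists).
 * Observed refaults on the file LRU force the ratio to 0, so the active
 * list becomes eligible for deactivation regardless of size.
 */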
2252
2253static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2254 struct lruvec *lruvec, struct mem_cgroup *memcg,
2255 struct scan_control *sc)
2256{
2257 if (is_active_lru(lru)) {
2258 if (inactive_list_is_low(lruvec, is_file_lru(lru),
2259 memcg, sc, true))
2260 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2261 return 0;
2262 }
2263
2264 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2265}
2266
2267enum scan_balance {
2268 SCAN_EQUAL,
2269 SCAN_FRACT,
2270 SCAN_ANON,
2271 SCAN_FILE,
2272};
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2284 struct scan_control *sc, unsigned long *nr,
2285 unsigned long *lru_pages)
2286{
2287 int swappiness = mem_cgroup_swappiness(memcg);
2288 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2289 u64 fraction[2];
2290 u64 denominator = 0;
2291 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2292 unsigned long anon_prio, file_prio;
2293 enum scan_balance scan_balance;
2294 unsigned long anon, file;
2295 unsigned long ap, fp;
2296 enum lru_list lru;
2297
2298
2299 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2300 scan_balance = SCAN_FILE;
2301 goto out;
2302 }
2303
2304
2305
2306
2307
2308
2309
2310
2311 if (!global_reclaim(sc) && !swappiness) {
2312 scan_balance = SCAN_FILE;
2313 goto out;
2314 }
2315
2316
2317
2318
2319
2320
2321 if (!sc->priority && swappiness) {
2322 scan_balance = SCAN_EQUAL;
2323 goto out;
2324 }
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335 if (global_reclaim(sc)) {
2336 unsigned long pgdatfile;
2337 unsigned long pgdatfree;
2338 int z;
2339 unsigned long total_high_wmark = 0;
2340
2341 pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2342 pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
2343 node_page_state(pgdat, NR_INACTIVE_FILE);
2344
2345 for (z = 0; z < MAX_NR_ZONES; z++) {
2346 struct zone *zone = &pgdat->node_zones[z];
2347 if (!managed_zone(zone))
2348 continue;
2349
2350 total_high_wmark += high_wmark_pages(zone);
2351 }
2352
2353 if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
2354
2355
2356
2357
2358
2359 if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
2360 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
2361 >> sc->priority) {
2362 scan_balance = SCAN_ANON;
2363 goto out;
2364 }
2365 }
2366 }
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377 if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
2378 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
2379 scan_balance = SCAN_FILE;
2380 goto out;
2381 }
2382
2383 scan_balance = SCAN_FRACT;
2384
2385
2386
2387
2388
2389 anon_prio = swappiness;
2390 file_prio = 200 - anon_prio;
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
2405 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
2406 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
2407 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
2408
2409 spin_lock_irq(&pgdat->lru_lock);
2410 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
2411 reclaim_stat->recent_scanned[0] /= 2;
2412 reclaim_stat->recent_rotated[0] /= 2;
2413 }
2414
2415 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
2416 reclaim_stat->recent_scanned[1] /= 2;
2417 reclaim_stat->recent_rotated[1] /= 2;
2418 }
2419
2420
2421
2422
2423
2424
2425 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
2426 ap /= reclaim_stat->recent_rotated[0] + 1;
2427
2428 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
2429 fp /= reclaim_stat->recent_rotated[1] + 1;
2430 spin_unlock_irq(&pgdat->lru_lock);
2431
2432 fraction[0] = ap;
2433 fraction[1] = fp;
2434 denominator = ap + fp + 1;
2435out:
2436 *lru_pages = 0;
2437 for_each_evictable_lru(lru) {
2438 int file = is_file_lru(lru);
2439 unsigned long size;
2440 unsigned long scan;
2441
2442 size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2443 scan = size >> sc->priority;
2444
2445
2446
2447
2448 if (!scan && !mem_cgroup_online(memcg))
2449 scan = min(size, SWAP_CLUSTER_MAX);
2450
2451 switch (scan_balance) {
2452 case SCAN_EQUAL:
2453
2454 break;
2455 case SCAN_FRACT:
2456
2457
2458
2459
2460 scan = div64_u64(scan * fraction[file],
2461 denominator);
2462 break;
2463 case SCAN_FILE:
2464 case SCAN_ANON:
2465
2466 if ((scan_balance == SCAN_FILE) != file) {
2467 size = 0;
2468 scan = 0;
2469 }
2470 break;
2471 default:
2472
2473 BUG();
2474 }
2475
2476 *lru_pages += size;
2477 nr[lru] = scan;
2478 }
2479}
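
/*
 * Worked example of the SCAN_FRACT math above (illustrative counters):
 * with swappiness = 60 (anon_prio = 60, file_prio = 140),
 * recent_scanned = {anon: 1000, file: 4000} and
 * recent_rotated = {anon: 900, file: 400},
 *
 *	ap = 60  * (1000 + 1) / (900 + 1) ≈ 66
 *	fp = 140 * (4000 + 1) / (400 + 1) ≈ 1396
 *
 * so roughly 66 / (66 + 1396 + 1) ≈ 4.5% of each per-priority scan target
 * goes to the anon LRUs and the rest to file, reflecting that file pages
 * have recently been reclaimed successfully while anon pages keep getting
 * rotated back to the active list.
 */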
2480
2481
2482
2483
2484static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
2485 struct scan_control *sc, unsigned long *lru_pages)
2486{
2487 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2488 unsigned long nr[NR_LRU_LISTS];
2489 unsigned long targets[NR_LRU_LISTS];
2490 unsigned long nr_to_scan;
2491 enum lru_list lru;
2492 unsigned long nr_reclaimed = 0;
2493 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2494 struct blk_plug plug;
2495 bool scan_adjusted;
2496
2497 get_scan_count(lruvec, memcg, sc, nr, lru_pages);
2498
2499
2500 memcpy(targets, nr, sizeof(nr));
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2514 sc->priority == DEF_PRIORITY);
2515
2516 blk_start_plug(&plug);
2517 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2518 nr[LRU_INACTIVE_FILE]) {
2519 unsigned long nr_anon, nr_file, percentage;
2520 unsigned long nr_scanned;
2521
2522 for_each_evictable_lru(lru) {
2523 if (nr[lru]) {
2524 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2525 nr[lru] -= nr_to_scan;
2526
2527 nr_reclaimed += shrink_list(lru, nr_to_scan,
2528 lruvec, memcg, sc);
2529 }
2530 }
2531
2532 cond_resched();
2533
2534 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2535 continue;
2536
2537
2538
2539
2540
2541
2542
2543
2544 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2545 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2546
2547
2548
2549
2550
2551
2552
2553 if (!nr_file || !nr_anon)
2554 break;
2555
2556 if (nr_file > nr_anon) {
2557 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2558 targets[LRU_ACTIVE_ANON] + 1;
2559 lru = LRU_BASE;
2560 percentage = nr_anon * 100 / scan_target;
2561 } else {
2562 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2563 targets[LRU_ACTIVE_FILE] + 1;
2564 lru = LRU_FILE;
2565 percentage = nr_file * 100 / scan_target;
2566 }
2567
2568
2569 nr[lru] = 0;
2570 nr[lru + LRU_ACTIVE] = 0;
2571
2572
2573
2574
2575
2576 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2577 nr_scanned = targets[lru] - nr[lru];
2578 nr[lru] = targets[lru] * (100 - percentage) / 100;
2579 nr[lru] -= min(nr[lru], nr_scanned);
2580
2581 lru += LRU_ACTIVE;
2582 nr_scanned = targets[lru] - nr[lru];
2583 nr[lru] = targets[lru] * (100 - percentage) / 100;
2584 nr[lru] -= min(nr[lru], nr_scanned);
2585
2586 scan_adjusted = true;
2587 }
2588 blk_finish_plug(&plug);
2589 sc->nr_reclaimed += nr_reclaimed;
2590
2591
2592
2593
2594
2595 if (inactive_list_is_low(lruvec, false, memcg, sc, true))
2596 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2597 sc, LRU_ACTIVE_ANON);
2598}
2599
2600
2601static bool in_reclaim_compaction(struct scan_control *sc)
2602{
2603 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2604 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2605 sc->priority < DEF_PRIORITY - 2))
2606 return true;
2607
2608 return false;
2609}
2610
2611
2612
2613
2614
2615
2616
2617
2618static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2619 unsigned long nr_reclaimed,
2620 unsigned long nr_scanned,
2621 struct scan_control *sc)
2622{
2623 unsigned long pages_for_compaction;
2624 unsigned long inactive_lru_pages;
2625 int z;
2626
2627
2628 if (!in_reclaim_compaction(sc))
2629 return false;
2630
2631
2632 if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
2633
2634
2635
2636
2637
2638
2639 if (!nr_reclaimed && !nr_scanned)
2640 return false;
2641 } else {
2642
2643
2644
2645
2646
2647
2648
2649
2650 if (!nr_reclaimed)
2651 return false;
2652 }
2653
2654
2655
2656
2657
2658 pages_for_compaction = compact_gap(sc->order);
2659 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2660 if (get_nr_swap_pages() > 0)
2661 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2662 if (sc->nr_reclaimed < pages_for_compaction &&
2663 inactive_lru_pages > pages_for_compaction)
2664 return true;
2665
2666
2667 for (z = 0; z <= sc->reclaim_idx; z++) {
2668 struct zone *zone = &pgdat->node_zones[z];
2669 if (!managed_zone(zone))
2670 continue;
2671
2672 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2673 case COMPACT_SUCCESS:
2674 case COMPACT_CONTINUE:
2675 return false;
2676 default:
2677
2678 ;
2679 }
2680 }
2681 return true;
2682}
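
/*
 * Illustrative numbers for the check above: for an order-9 request
 * (a 2MB THP with 4K pages) compact_gap() asks for on the order of twice
 * the allocation size, i.e. roughly 1024 free order-0 pages, so reclaim
 * keeps going while fewer pages than that have been reclaimed and the
 * inactive lists still hold more than that many candidates, unless
 * compaction already reports a zone as ready.
 */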
2683
2684static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
2685{
2686 return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
2687 (memcg && memcg_congested(pgdat, memcg));
2688}
2689
2690static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2691{
2692 struct reclaim_state *reclaim_state = current->reclaim_state;
2693 unsigned long nr_reclaimed, nr_scanned;
2694 bool reclaimable = false;
2695
2696 do {
2697 struct mem_cgroup *root = sc->target_mem_cgroup;
2698 struct mem_cgroup_reclaim_cookie reclaim = {
2699 .pgdat = pgdat,
2700 .priority = sc->priority,
2701 };
2702 unsigned long node_lru_pages = 0;
2703 struct mem_cgroup *memcg;
2704
2705 memset(&sc->nr, 0, sizeof(sc->nr));
2706
2707 nr_reclaimed = sc->nr_reclaimed;
2708 nr_scanned = sc->nr_scanned;
2709
2710 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2711 do {
2712 unsigned long lru_pages;
2713 unsigned long reclaimed;
2714 unsigned long scanned;
2715
2716 switch (mem_cgroup_protected(root, memcg)) {
2717 case MEMCG_PROT_MIN:
2718
2719
2720
2721
2722 continue;
2723 case MEMCG_PROT_LOW:
2724
2725
2726
2727
2728
2729
2730 if (!sc->memcg_low_reclaim) {
2731 sc->memcg_low_skipped = 1;
2732 continue;
2733 }
2734 memcg_memory_event(memcg, MEMCG_LOW);
2735 break;
2736 case MEMCG_PROT_NONE:
2737 break;
2738 }
2739
2740 reclaimed = sc->nr_reclaimed;
2741 scanned = sc->nr_scanned;
2742 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2743 node_lru_pages += lru_pages;
2744
2745 shrink_slab(sc->gfp_mask, pgdat->node_id,
2746 memcg, sc->priority);
2747
2748
2749 vmpressure(sc->gfp_mask, memcg, false,
2750 sc->nr_scanned - scanned,
2751 sc->nr_reclaimed - reclaimed);
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763 if (!global_reclaim(sc) &&
2764 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2765 mem_cgroup_iter_break(root, memcg);
2766 break;
2767 }
2768 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
2769
2770 if (reclaim_state) {
2771 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2772 reclaim_state->reclaimed_slab = 0;
2773 }
2774
2775
2776 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2777 sc->nr_scanned - nr_scanned,
2778 sc->nr_reclaimed - nr_reclaimed);
2779
2780 if (sc->nr_reclaimed - nr_reclaimed)
2781 reclaimable = true;
2782
2783 if (current_is_kswapd()) {
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
2802 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
2803
2804
2805
2806
2807
2808
2809 if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2810 set_bit(PGDAT_CONGESTED, &pgdat->flags);
2811
2812
2813 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
2814 set_bit(PGDAT_DIRTY, &pgdat->flags);
2815
2816
2817
2818
2819
2820
2821
2822 if (sc->nr.immediate)
2823 congestion_wait(BLK_RW_ASYNC, HZ/10);
2824 }
2825
2826
2827
2828
2829
2830 if (!global_reclaim(sc) && sane_reclaim(sc) &&
2831 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2832 set_memcg_congestion(pgdat, root, true);
2833
2834
2835
2836
2837
2838
2839
2840 if (!sc->hibernation_mode && !current_is_kswapd() &&
2841 current_may_throttle() && pgdat_memcg_congested(pgdat, root))
2842 wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2843
2844 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2845 sc->nr_scanned - nr_scanned, sc));
2846
2847
2848
2849
2850
2851
2852
2853 if (reclaimable)
2854 pgdat->kswapd_failures = 0;
2855
2856 return reclaimable;
2857}
2858
2859
2860
2861
2862
2863
2864static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2865{
2866 unsigned long watermark;
2867 enum compact_result suitable;
2868
2869 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2870 if (suitable == COMPACT_SUCCESS)
2871
2872 return true;
2873 if (suitable == COMPACT_SKIPPED)
2874
2875 return false;
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
2887
2888 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2889}
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2900{
2901 struct zoneref *z;
2902 struct zone *zone;
2903 unsigned long nr_soft_reclaimed;
2904 unsigned long nr_soft_scanned;
2905 gfp_t orig_mask;
2906 pg_data_t *last_pgdat = NULL;
2907
 /*
  * If the number of buffer_heads in the machine exceeds the
  * maximum allowed level, force direct reclaim to scan the
  * highmem zone as highmem pages could be pinning lowmem pages
  * storing buffer_heads.
  */
2913 orig_mask = sc->gfp_mask;
2914 if (buffer_heads_over_limit) {
2915 sc->gfp_mask |= __GFP_HIGHMEM;
2916 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2917 }
2918
2919 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2920 sc->reclaim_idx, sc->nodemask) {
 /*
  * Only global reclaim honours cpusets, checks whether
  * compaction is ready and performs soft limit reclaim; memcg
  * limit reclaim skips all of this so that it has little
  * influence on the global LRU.
  */
2925 if (global_reclaim(sc)) {
2926 if (!cpuset_zone_allowed(zone,
2927 GFP_KERNEL | __GFP_HARDWALL))
2928 continue;

 /*
  * If we already have plenty of memory free for compaction
  * in this zone, don't free any more. Even though compaction
  * is invoked for any non-zero order, only frequent costly
  * order reclamation is disruptive enough to become a
  * noticeable problem, like transparent huge page allocations.
  */
2939 if (IS_ENABLED(CONFIG_COMPACTION) &&
2940 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2941 compaction_ready(zone, sc)) {
2942 sc->compaction_ready = true;
2943 continue;
2944 }

 /*
  * Shrink each node in the zonelist once. If the zonelist is
  * ordered by zone (not the default) then a node may be shrunk
  * multiple times, but in that case the user prefers lower
  * zones being preserved.
  */
2952 if (zone->zone_pgdat == last_pgdat)
2953 continue;

 /*
  * This steals pages from memory cgroups over their soft limit
  * and returns the number of reclaimed and scanned pages. It
  * is used for global memory pressure and balancing, not for
  * a memcg's limit.
  */
2961 nr_soft_scanned = 0;
2962 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2963 sc->order, sc->gfp_mask,
2964 &nr_soft_scanned);
2965 sc->nr_reclaimed += nr_soft_reclaimed;
2966 sc->nr_scanned += nr_soft_scanned;
2967
2968 }
2969
2970
2971 if (zone->zone_pgdat == last_pgdat)
2972 continue;
2973 last_pgdat = zone->zone_pgdat;
2974 shrink_node(zone->zone_pgdat, sc);
2975 }
2976
 /*
  * Restore the original mask to avoid affecting the caller if
  * we promoted it to __GFP_HIGHMEM above.
  */
2981 sc->gfp_mask = orig_mask;
2982}
2983
2984static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
2985{
2986 struct mem_cgroup *memcg;
2987
2988 memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
2989 do {
2990 unsigned long refaults;
2991 struct lruvec *lruvec;
2992
2993 if (memcg)
2994 refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
2995 else
2996 refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
2997
2998 lruvec = mem_cgroup_lruvec(pgdat, memcg);
2999 lruvec->refaults = refaults;
3000 } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
3001}
3002
/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take
 * explicit naps in the hope that some of these pages can be written.  But if
 * the allocating task holds filesystem locks which prevent writeout this
 * might not work, and the allocation attempt will fail.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */
3019static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3020 struct scan_control *sc)
3021{
3022 int initial_priority = sc->priority;
3023 pg_data_t *last_pgdat;
3024 struct zoneref *z;
3025 struct zone *zone;
3026retry:
3027 delayacct_freepages_start();
3028
3029 if (global_reclaim(sc))
3030 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3031
3032 do {
3033 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3034 sc->priority);
3035 sc->nr_scanned = 0;
3036 shrink_zones(zonelist, sc);
3037
3038 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3039 break;
3040
3041 if (sc->compaction_ready)
3042 break;
3043
 /*
  * If we're having trouble reclaiming, start doing
  * writepage even in laptop mode.
  */
3048 if (sc->priority < DEF_PRIORITY - 2)
3049 sc->may_writepage = 1;
3050 } while (--sc->priority >= 0);
3051
3052 last_pgdat = NULL;
3053 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3054 sc->nodemask) {
3055 if (zone->zone_pgdat == last_pgdat)
3056 continue;
3057 last_pgdat = zone->zone_pgdat;
3058 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3059 set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
3060 }
3061
3062 delayacct_freepages_end();
3063
3064 if (sc->nr_reclaimed)
3065 return sc->nr_reclaimed;
3066
3067
3068 if (sc->compaction_ready)
3069 return 1;
3070
3071
3072 if (sc->memcg_low_skipped) {
3073 sc->priority = initial_priority;
3074 sc->memcg_low_reclaim = 1;
3075 sc->memcg_low_skipped = 0;
3076 goto retry;
3077 }
3078
3079 return 0;
3080}
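
/*
 * A note on the priority loop above: each pass hands sc->priority down to
 * the per-node scan, where the LRU lists are scanned in chunks of roughly
 * lru_size >> priority. Starting from DEF_PRIORITY (12), the first pass
 * therefore looks at about 1/4096th of each list, and every unsuccessful
 * pass doubles the scan window until priority reaches 0 and whole lists
 * are considered.
 */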
3081
3082static bool allow_direct_reclaim(pg_data_t *pgdat)
3083{
3084 struct zone *zone;
3085 unsigned long pfmemalloc_reserve = 0;
3086 unsigned long free_pages = 0;
3087 int i;
3088 bool wmark_ok;
3089
3090 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3091 return true;
3092
3093 for (i = 0; i <= ZONE_NORMAL; i++) {
3094 zone = &pgdat->node_zones[i];
3095 if (!managed_zone(zone))
3096 continue;
3097
3098 if (!zone_reclaimable_pages(zone))
3099 continue;
3100
3101 pfmemalloc_reserve += min_wmark_pages(zone);
3102 free_pages += zone_page_state(zone, NR_FREE_PAGES);
3103 }
3104
3105
3106 if (!pfmemalloc_reserve)
3107 return true;
3108
3109 wmark_ok = free_pages > pfmemalloc_reserve / 2;
3110
3111
3112 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3113 pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
3114 (enum zone_type)ZONE_NORMAL);
3115 wake_up_interruptible(&pgdat->kswapd_wait);
3116 }
3117
3118 return wmark_ok;
3119}
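
/*
 * Example of the 50% rule above, with made-up numbers: if ZONE_DMA32 and
 * ZONE_NORMAL have min watermarks of 2000 and 6000 pages, direct reclaimers
 * are throttled once the combined NR_FREE_PAGES of those zones drops to
 * (2000 + 6000) / 2 = 4000 pages or below, and kswapd is woken to refill
 * them.
 */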
3120
/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */
3130static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3131 nodemask_t *nodemask)
3132{
3133 struct zoneref *z;
3134 struct zone *zone;
3135 pg_data_t *pgdat = NULL;
3136
 /*
  * Kernel threads should not be throttled as they may be
  * indirectly responsible for cleaning pages necessary for
  * reclaim to make forward progress (kjournald, for example,
  * may be committing a transaction that other reclaimers are
  * waiting on).
  */
3144 if (current->flags & PF_KTHREAD)
3145 goto out;
3146
 /*
  * If a fatal signal is pending, this process should not
  * throttle; it should return quickly so it can exit and free
  * its memory.
  */
3151 if (fatal_signal_pending(current))
3152 goto out;
3153
 /*
  * Check whether the pfmemalloc reserves are OK by finding the
  * first node with a usable ZONE_NORMAL or lower zone; GFP_KERNEL
  * memory is what is needed for e.g. network buffers when swapping
  * over the network, so ZONE_HIGHMEM is not usable here.
  *
  * Throttling is based on that first usable node: throttled
  * processes wait on its queue until kswapd makes progress and
  * wakes them. The throttling decision is only taken after the
  * zone of interest has been examined so the loop is not entered
  * prematurely.
  */
3168 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3169 gfp_zone(gfp_mask), nodemask) {
3170 if (zone_idx(zone) > ZONE_NORMAL)
3171 continue;
3172
3173
3174 pgdat = zone->zone_pgdat;
3175 if (allow_direct_reclaim(pgdat))
3176 goto out;
3177 break;
3178 }
3179
3180
3181 if (!pgdat)
3182 goto out;
3183
3184
3185 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3186

 /*
  * If the caller cannot enter the filesystem, it may be holding
  * an FS lock or a journal transaction (ext3/4 for instance).
  * Blocking on pfmemalloc_wait is then unsafe as kswapd could be
  * waiting on that same lock, so only throttle for up to a
  * second before continuing.
  */
3195 if (!(gfp_mask & __GFP_FS)) {
3196 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3197 allow_direct_reclaim(pgdat), HZ);
3198
3199 goto check_pending;
3200 }
3201
3202
3203 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3204 allow_direct_reclaim(pgdat));
3205
3206check_pending:
3207 if (fatal_signal_pending(current))
3208 return true;
3209
3210out:
3211 return false;
3212}
3213
3214unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3215 gfp_t gfp_mask, nodemask_t *nodemask)
3216{
3217 unsigned long nr_reclaimed;
3218 struct scan_control sc = {
3219 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3220 .gfp_mask = current_gfp_context(gfp_mask),
3221 .reclaim_idx = gfp_zone(gfp_mask),
3222 .order = order,
3223 .nodemask = nodemask,
3224 .priority = DEF_PRIORITY,
3225 .may_writepage = !laptop_mode,
3226 .may_unmap = 1,
3227 .may_swap = 1,
3228 };
3229
 /*
  * scan_control uses s8 fields for order, priority, and reclaim_idx.
  * Confirm they are large enough for max values.
  */
3234 BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3235 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3236 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3237
 /*
  * Do not enter reclaim if a fatal signal was delivered while
  * throttled; 1 is returned so the page allocator does not OOM
  * kill at this point.
  */
3243 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3244 return 1;
3245
3246 trace_mm_vmscan_direct_reclaim_begin(order,
3247 sc.may_writepage,
3248 sc.gfp_mask,
3249 sc.reclaim_idx);
3250
3251 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3252
3253 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3254
3255 return nr_reclaimed;
3256}
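
/*
 * Sketch of how a caller is expected to use try_to_free_pages(); the page
 * allocator's direct-reclaim slow path does something along these lines
 * (illustrative only, not a verbatim copy of mm/page_alloc.c):
 *
 *	progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
 *	if (!progress)
 *		... consider compaction / OOM ...
 *	else
 *		... retry the allocation ...
 */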
3257
3258#ifdef CONFIG_MEMCG
3259
3260unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3261 gfp_t gfp_mask, bool noswap,
3262 pg_data_t *pgdat,
3263 unsigned long *nr_scanned)
3264{
3265 struct scan_control sc = {
3266 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3267 .target_mem_cgroup = memcg,
3268 .may_writepage = !laptop_mode,
3269 .may_unmap = 1,
3270 .reclaim_idx = MAX_NR_ZONES - 1,
3271 .may_swap = !noswap,
3272 };
3273 unsigned long lru_pages;
3274
3275 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3276 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3277
3278 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3279 sc.may_writepage,
3280 sc.gfp_mask,
3281 sc.reclaim_idx);
3282
 /*
  * NOTE: sc.priority is deliberately left at zero. Limiting the
  * scan here would be counter-productive; anything not reclaimed
  * now is picked up later by shrink_node() from balance_pgdat()
  * or direct reclaim.
  */
3290 shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
3291
3292 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3293
3294 *nr_scanned = sc.nr_scanned;
3295 return sc.nr_reclaimed;
3296}
3297
3298unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3299 unsigned long nr_pages,
3300 gfp_t gfp_mask,
3301 bool may_swap)
3302{
3303 struct zonelist *zonelist;
3304 unsigned long nr_reclaimed;
3305 int nid;
3306 unsigned int noreclaim_flag;
3307 struct scan_control sc = {
3308 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3309 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3310 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3311 .reclaim_idx = MAX_NR_ZONES - 1,
3312 .target_mem_cgroup = memcg,
3313 .priority = DEF_PRIORITY,
3314 .may_writepage = !laptop_mode,
3315 .may_unmap = 1,
3316 .may_swap = may_swap,
3317 };
3318
 /*
  * Unlike direct reclaim via alloc_pages(), memcg reclaim is not
  * tied to the allocating node, so start scanning from a victim
  * node chosen by mem_cgroup_select_victim_node().
  */
3324 nid = mem_cgroup_select_victim_node(memcg);
3325
3326 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
3327
3328 trace_mm_vmscan_memcg_reclaim_begin(0,
3329 sc.may_writepage,
3330 sc.gfp_mask,
3331 sc.reclaim_idx);
3332
3333 noreclaim_flag = memalloc_noreclaim_save();
3334 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3335 memalloc_noreclaim_restore(noreclaim_flag);
3336
3337 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3338
3339 return nr_reclaimed;
3340}
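
/*
 * try_to_free_mem_cgroup_pages() is the reclaim entry point used by the
 * memcg charge path when a cgroup hits its limit: the charging code asks
 * for at least the number of pages it is trying to charge and retries the
 * charge if some progress was made. The call site lives in mm/memcontrol.c
 * (try_charge()) and may differ between kernel versions.
 */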
3341#endif
3342
3343static void age_active_anon(struct pglist_data *pgdat,
3344 struct scan_control *sc)
3345{
3346 struct mem_cgroup *memcg;
3347
3348 if (!total_swap_pages)
3349 return;
3350
3351 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3352 do {
3353 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
3354
3355 if (inactive_list_is_low(lruvec, false, memcg, sc, true))
3356 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3357 sc, LRU_ACTIVE_ANON);
3358
3359 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3360 } while (memcg);
3361}
3362
/*
 * Returns true if there is an eligible zone balanced for the request order
 * and classzone_idx
 */
3367static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3368{
3369 int i;
3370 unsigned long mark = -1;
3371 struct zone *zone;
3372
3373 for (i = 0; i <= classzone_idx; i++) {
3374 zone = pgdat->node_zones + i;
3375
3376 if (!managed_zone(zone))
3377 continue;
3378
3379 mark = high_wmark_pages(zone);
3380 if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
3381 return true;
3382 }
3383
 /*
  * If a node has no populated zone within classzone_idx, it does
  * not need balancing by definition. This can happen if a
  * zone-restricted allocation tries to wake a remote kswapd.
  */
3389 if (mark == -1)
3390 return true;
3391
3392 return false;
3393}
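
/*
 * Example for pgdat_balanced() above: for an order-0 request with
 * classzone_idx = ZONE_NORMAL, the node counts as balanced as soon as any
 * one of its populated zones up to ZONE_NORMAL sits above its high
 * watermark; it is not required that every eligible zone does.
 */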
3394
3395
3396static void clear_pgdat_congested(pg_data_t *pgdat)
3397{
3398 clear_bit(PGDAT_CONGESTED, &pgdat->flags);
3399 clear_bit(PGDAT_DIRTY, &pgdat->flags);
3400 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3401}
3402
/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */
3409static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3410{
 /*
  * Throttled processes are normally woken from balance_pgdat()
  * once allow_direct_reclaim() is true, but races with kswapd's
  * watermark checks can leave tasks sleeping on pfmemalloc_wait
  * when kswapd decides to sleep. Wake them here; a premature
  * wake-up only means they will wake kswapd and get throttled
  * again.
  */
3424 if (waitqueue_active(&pgdat->pfmemalloc_wait))
3425 wake_up_all(&pgdat->pfmemalloc_wait);
3426
3427
3428 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3429 return true;
3430
3431 if (pgdat_balanced(pgdat, order, classzone_idx)) {
3432 clear_pgdat_congested(pgdat);
3433 return true;
3434 }
3435
3436 return false;
3437}
3438
/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim; this is used to decide whether the scanning priority needs to be
 * raised.
 */
3447static bool kswapd_shrink_node(pg_data_t *pgdat,
3448 struct scan_control *sc)
3449{
3450 struct zone *zone;
3451 int z;
3452
3453
3454 sc->nr_to_reclaim = 0;
3455 for (z = 0; z <= sc->reclaim_idx; z++) {
3456 zone = pgdat->node_zones + z;
3457 if (!managed_zone(zone))
3458 continue;
3459
3460 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3461 }
3462
 /*
  * Historically care was taken to put equal pressure on all
  * zones, but now pressure is applied based on node LRU order.
  */
3467 shrink_node(pgdat, sc);
3468
 /*
  * Fragmentation may mean that the system cannot be rebalanced
  * for high-order allocations. If twice the allocation size has
  * been reclaimed, recheck watermarks only at order-0 to prevent
  * excessive reclaim; assume the requester can also direct
  * reclaim/compact.
  */
3476 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3477 sc->order = 0;
3478
3479 return sc->nr_scanned >= sc->nr_to_reclaim;
3480}
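
/*
 * Note on the order-0 fallback above: compact_gap() is about twice the
 * request size (2UL << order in mm/internal.h at the time of writing), so
 * for a hypothetical order-4 wakeup kswapd drops back to order-0 watermark
 * checks once 32 or more pages have been reclaimed in this cycle, leaving
 * further assembly of the high-order page to compaction.
 */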
3481
/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
 * or lower is eligible for reclaim until at least one usable zone is
 * balanced.
 */
3495static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3496{
3497 int i;
3498 unsigned long nr_soft_reclaimed;
3499 unsigned long nr_soft_scanned;
3500 struct zone *zone;
3501 struct scan_control sc = {
3502 .gfp_mask = GFP_KERNEL,
3503 .order = order,
3504 .priority = DEF_PRIORITY,
3505 .may_writepage = !laptop_mode,
3506 .may_unmap = 1,
3507 .may_swap = 1,
3508 };
3509
3510 __fs_reclaim_acquire();
3511
3512 count_vm_event(PAGEOUTRUN);
3513
3514 do {
3515 unsigned long nr_reclaimed = sc.nr_reclaimed;
3516 bool raise_priority = true;
3517 bool ret;
3518
3519 sc.reclaim_idx = classzone_idx;
3520
 /*
  * If the number of buffer_heads exceeds the maximum allowed
  * then consider reclaiming from all zones. This has a dual
  * purpose -- on 64-bit systems it is expected that
  * buffer_heads are stripped during active rotation; on 32-bit
  * systems, highmem pages can pin lowmem memory and shrinking
  * buffers can relieve lowmem pressure.
  */
3531 if (buffer_heads_over_limit) {
3532 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3533 zone = pgdat->node_zones + i;
3534 if (!managed_zone(zone))
3535 continue;
3536
3537 sc.reclaim_idx = i;
3538 break;
3539 }
3540 }
3541
 /*
  * Only reclaim if there are no eligible zones. Note that
  * sc.reclaim_idx is not used here as buffer_heads_over_limit
  * may have adjusted it.
  */
3547 if (pgdat_balanced(pgdat, sc.order, classzone_idx))
3548 goto out;
3549
 /*
  * Do some background aging of the anon list, to give pages a
  * chance to be referenced before reclaiming. All pages are
  * rotated regardless of classzone as this is about consistent
  * aging.
  */
3556 age_active_anon(pgdat, &sc);
3557
 /*
  * If we're having trouble reclaiming, start doing writepage
  * even in laptop mode.
  */
3562 if (sc.priority < DEF_PRIORITY - 2)
3563 sc.may_writepage = 1;
3564
3565
3566 sc.nr_scanned = 0;
3567 nr_soft_scanned = 0;
3568 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3569 sc.gfp_mask, &nr_soft_scanned);
3570 sc.nr_reclaimed += nr_soft_reclaimed;
3571
 /*
  * There should be no need to raise the scanning priority if
  * enough pages are already being scanned that the high
  * watermark would be met at 100% efficiency.
  */
3577 if (kswapd_shrink_node(pgdat, &sc))
3578 raise_priority = false;
3579
 /*
  * If the low watermark is met there is no need for processes
  * to stay throttled on pfmemalloc_wait; they can now make
  * forward progress, so wake them.
  */
3585 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3586 allow_direct_reclaim(pgdat))
3587 wake_up_all(&pgdat->pfmemalloc_wait);
3588
3589
3590 __fs_reclaim_release();
3591 ret = try_to_freeze();
3592 __fs_reclaim_acquire();
3593 if (ret || kthread_should_stop())
3594 break;
3595
 /*
  * Raise priority if the scanning rate is too low or there was
  * no progress reclaiming pages.
  */
3600 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3601 if (raise_priority || !nr_reclaimed)
3602 sc.priority--;
3603 } while (sc.priority >= 1);
3604
3605 if (!sc.nr_reclaimed)
3606 pgdat->kswapd_failures++;
3607
3608out:
3609 snapshot_refaults(NULL, pgdat);
3610 __fs_reclaim_release();
3611
 /*
  * Return the order kswapd stopped reclaiming at, as
  * prepare_kswapd_sleep() takes it into account. If another
  * caller entered the allocator slow path while kswapd was
  * awake, order will remain at the higher level.
  */
3617 return sc.order;
3618}
3619
/*
 * pgdat->kswapd_classzone_idx records the highest zone index a recent
 * wakeup asked kswapd to reclaim for; MAX_NR_ZONES means no pending request.
 * Return the caller's classzone_idx in that case, otherwise the higher of
 * the two values.
 */
3627static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
3628 enum zone_type classzone_idx)
3629{
3630 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3631 return classzone_idx;
3632
3633 return max(pgdat->kswapd_classzone_idx, classzone_idx);
3634}
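
/*
 * Example: if kswapd last went to sleep with no pending request
 * (kswapd_classzone_idx == MAX_NR_ZONES) and a caller passes
 * classzone_idx = ZONE_NORMAL, ZONE_NORMAL is used as-is; if a concurrent
 * wakeup already recorded ZONE_MOVABLE, the higher of the two indices wins.
 */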
3635
3636static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3637 unsigned int classzone_idx)
3638{
3639 long remaining = 0;
3640 DEFINE_WAIT(wait);
3641
3642 if (freezing(current) || kthread_should_stop())
3643 return;
3644
3645 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3646
 /*
  * Try to sleep for a short interval. Note that kcompactd will
  * only be woken if it is possible to sleep for a short
  * interval. This is deliberate: if reclaim cannot keep an
  * eligible zone balanced, compaction is unlikely to succeed
  * either.
  */
3654 if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
 /*
  * Compaction records which pageblocks it recently failed to
  * isolate pages from and skips them in future scans. When
  * kswapd is about to sleep it is reasonable to assume that
  * compaction may succeed, so reset that cache.
  */
3661 reset_isolation_suitable(pgdat);
3662
 /*
  * We have freed the memory, now we should compact it to make
  * allocation of the requested order possible.
  */
3667 wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
3668
3669 remaining = schedule_timeout(HZ/10);
3670
 /*
  * If woken prematurely then reset kswapd_classzone_idx and
  * order. The values will either be from a wakeup request or
  * the previous request that slept prematurely.
  */
3676 if (remaining) {
3677 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3678 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
3679 }
3680
3681 finish_wait(&pgdat->kswapd_wait, &wait);
3682 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3683 }
3684
 /*
  * After a short sleep, check if it was a premature sleep. If
  * not, then go fully to sleep until explicitly woken up.
  */
3689 if (!remaining &&
3690 prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3691 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3692
 /*
  * vmstat counters are not perfectly accurate and the estimated
  * value for counters such as NR_FREE_PAGES can deviate from
  * the true value by nr_online_cpus * threshold. To avoid
  * breaching watermarks while under pressure, the per-cpu
  * thresholds are lowered while kswapd is awake and restored
  * before it goes back to sleep.
  */
3701 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3702
3703 if (!kthread_should_stop())
3704 schedule();
3705
3706 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3707 } else {
3708 if (remaining)
3709 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3710 else
3711 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3712 }
3713 finish_wait(&pgdat->kswapd_wait, &wait);
3714}
3715
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_ free memory
 * available even if there is no other activity that frees anything up.
 * This is needed for things like routing etc, where we otherwise might
 * have all activity going on in asynchronous contexts that cannot page
 * things out.
 *
 * If there are applications that are active memory-allocators (most normal
 * use), this basically shouldn't matter.
 */
3729static int kswapd(void *p)
3730{
3731 unsigned int alloc_order, reclaim_order;
3732 unsigned int classzone_idx = MAX_NR_ZONES - 1;
3733 pg_data_t *pgdat = (pg_data_t*)p;
3734 struct task_struct *tsk = current;
3735
3736 struct reclaim_state reclaim_state = {
3737 .reclaimed_slab = 0,
3738 };
3739 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3740
3741 if (!cpumask_empty(cpumask))
3742 set_cpus_allowed_ptr(tsk, cpumask);
3743 current->reclaim_state = &reclaim_state;
3744
 /*
  * Tell the memory management code that we're a "memory
  * allocator", and that if we need more memory we should get
  * access to it regardless (see __alloc_pages()). "kswapd"
  * should never get caught in the normal page freeing logic.
  *
  * Kswapd normally doesn't need memory itself, but sometimes a
  * small amount is needed in order to page something else out,
  * and PF_MEMALLOC protects us from recursing into reclaim
  * while doing so.
  */
3757 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3758 set_freezable();
3759
3760 pgdat->kswapd_order = 0;
3761 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3762 for ( ; ; ) {
3763 bool ret;
3764
3765 alloc_order = reclaim_order = pgdat->kswapd_order;
3766 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3767
3768kswapd_try_sleep:
3769 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3770 classzone_idx);
3771
3772
3773 alloc_order = reclaim_order = pgdat->kswapd_order;
3774 classzone_idx = kswapd_classzone_idx(pgdat, 0);
3775 pgdat->kswapd_order = 0;
3776 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3777
3778 ret = try_to_freeze();
3779 if (kthread_should_stop())
3780 break;
3781
 /*
  * We can speed up thawing tasks if we don't call
  * balance_pgdat after returning from the refrigerator.
  */
3786 if (ret)
3787 continue;
3788
 /*
  * Reclaim begins at the requested order but if a high-order
  * reclaim fails then kswapd falls back to reclaiming for
  * order-0. If that happens, kswapd will consider sleeping
  * for the order it finished reclaiming at (reclaim_order)
  * but kcompactd is woken to compact for the original
  * request (alloc_order).
  */
3797 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3798 alloc_order);
3799 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3800 if (reclaim_order < alloc_order)
3801 goto kswapd_try_sleep;
3802 }
3803
3804 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3805 current->reclaim_state = NULL;
3806
3807 return 0;
3808}
3809
/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the
 * zone's pgdat.  It will wake up kcompactd after reclaiming memory.  If
 * kswapd reclaim has failed or is not needed, still wake up kcompactd if
 * only compaction is needed.
 */
3817void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3818 enum zone_type classzone_idx)
3819{
3820 pg_data_t *pgdat;
3821
3822 if (!managed_zone(zone))
3823 return;
3824
3825 if (!cpuset_zone_allowed(zone, gfp_flags))
3826 return;
3827 pgdat = zone->zone_pgdat;
3828 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
3829 classzone_idx);
3830 pgdat->kswapd_order = max(pgdat->kswapd_order, order);
3831 if (!waitqueue_active(&pgdat->kswapd_wait))
3832 return;
3833
3834
3835 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3836 pgdat_balanced(pgdat, order, classzone_idx)) {
 /*
  * The node is already balanced (or hopeless), so kswapd has
  * nothing to do. If the caller cannot perform direct reclaim
  * itself, wake kcompactd in case only compaction is needed for
  * the high-order allocation.
  */
3844 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3845 wakeup_kcompactd(pgdat, order, classzone_idx);
3846 return;
3847 }
3848
3849 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
3850 gfp_flags);
3851 wake_up_interruptible(&pgdat->kswapd_wait);
3852}
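
/*
 * wakeup_kswapd() is meant to be called from the allocator's slow path when
 * a watermark check fails, e.g. (illustrative only, not copied from
 * mm/page_alloc.c):
 *
 *	wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
 *
 * The wakeup is a no-op for unmanaged zones, zones outside the caller's
 * cpuset, and nodes that are already balanced or deemed hopeless.
 */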
3853
3854#ifdef CONFIG_HIBERNATION
3855
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number
 * of freed pages. Used by hibernation to shrink memory before creating the
 * suspend image.
 */
3863unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3864{
3865 struct reclaim_state reclaim_state;
3866 struct scan_control sc = {
3867 .nr_to_reclaim = nr_to_reclaim,
3868 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3869 .reclaim_idx = MAX_NR_ZONES - 1,
3870 .priority = DEF_PRIORITY,
3871 .may_writepage = 1,
3872 .may_unmap = 1,
3873 .may_swap = 1,
3874 .hibernation_mode = 1,
3875 };
3876 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3877 struct task_struct *p = current;
3878 unsigned long nr_reclaimed;
3879 unsigned int noreclaim_flag;
3880
3881 fs_reclaim_acquire(sc.gfp_mask);
3882 noreclaim_flag = memalloc_noreclaim_save();
3883 reclaim_state.reclaimed_slab = 0;
3884 p->reclaim_state = &reclaim_state;
3885
3886 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3887
3888 p->reclaim_state = NULL;
3889 memalloc_noreclaim_restore(noreclaim_flag);
3890 fs_reclaim_release(sc.gfp_mask);
3891
3892 return nr_reclaimed;
3893}
3894#endif
3895
/*
 * It's optimal to keep kswapds on the same CPUs as their memory, but not
 * required for correctness. So if the last CPU in a node goes away, we get
 * changed to run anywhere; as the first one comes back, restore their CPU
 * bindings.
 */
3900static int kswapd_cpu_online(unsigned int cpu)
3901{
3902 int nid;
3903
3904 for_each_node_state(nid, N_MEMORY) {
3905 pg_data_t *pgdat = NODE_DATA(nid);
3906 const struct cpumask *mask;
3907
3908 mask = cpumask_of_node(pgdat->node_id);
3909
3910 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
 /* One of our CPUs came back online: restore the cpumask */
3912 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3913 }
3914 return 0;
3915}
3916
/*
 * This kswapd start function is called by init and by node hot-add. On
 * node hot-add, kswapd is moved to the proper CPUs once they come online.
 */
3921int kswapd_run(int nid)
3922{
3923 pg_data_t *pgdat = NODE_DATA(nid);
3924 int ret = 0;
3925
3926 if (pgdat->kswapd)
3927 return 0;
3928
3929 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3930 if (IS_ERR(pgdat->kswapd)) {
 /* Failure at boot is fatal */
3932 BUG_ON(system_state < SYSTEM_RUNNING);
3933 pr_err("Failed to start kswapd on node %d\n", nid);
3934 ret = PTR_ERR(pgdat->kswapd);
3935 pgdat->kswapd = NULL;
3936 }
3937 return ret;
3938}
3939
/*
 * Called by memory hotplug when all memory in a node is offlined.  Caller
 * must hold mem_hotplug_begin/end().
 */
3944void kswapd_stop(int nid)
3945{
3946 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3947
3948 if (kswapd) {
3949 kthread_stop(kswapd);
3950 NODE_DATA(nid)->kswapd = NULL;
3951 }
3952}
3953
3954static int __init kswapd_init(void)
3955{
3956 int nid, ret;
3957
3958 swap_setup();
3959 for_each_node_state(nid, N_MEMORY)
3960 kswapd_run(nid);
3961 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
3962 "mm/vmscan:online", kswapd_cpu_online,
3963 NULL);
3964 WARN_ON(ret < 0);
3965 return 0;
3966}
3967
3968module_init(kswapd_init)
3969
3970#ifdef CONFIG_NUMA
3971
/*
 * Node reclaim mode
 *
 * If non-zero, node_reclaim() is tried when a node runs low on memory
 * before falling back to allocating from other nodes.
 */
3977int node_reclaim_mode __read_mostly;
3978
3979#define RECLAIM_OFF 0
3980#define RECLAIM_ZONE (1<<0)
3981#define RECLAIM_WRITE (1<<1)
3982#define RECLAIM_UNMAP (1<<2)
3983
/*
 * Priority for NODE_RECLAIM. This determines the fraction of pages of a
 * node considered for each reclaim pass: priority 4 scans 1/16th of the
 * LRU lists.
 */
3989#define NODE_RECLAIM_PRIORITY 4
3990
/*
 * Percentage of pages in a node that must be unmapped file pages for node
 * reclaim to occur (vm.min_unmapped_ratio).
 */
3995int sysctl_min_unmapped_ratio = 1;
3996
/*
 * If the number of reclaimable slab pages in a node grows beyond this
 * percentage, slab reclaim needs to occur (vm.min_slab_ratio).
 */
4001int sysctl_min_slab_ratio = 5;
4002
4003static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4004{
4005 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4006 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4007 node_page_state(pgdat, NR_ACTIVE_FILE);
4008
 /*
  * It's possible for there to be more file-mapped pages than
  * pages on the file LRU lists because tmpfs pages accounted as
  * ANON can also be FILE_MAPPED.
  */
4014 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4015}
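
/*
 * Example with made-up counters: NR_INACTIVE_FILE + NR_ACTIVE_FILE = 10000
 * and NR_FILE_MAPPED = 3000 yields 7000 "unmapped" file pages; if mapped
 * pages (e.g. tmpfs counted as anon) exceed the file LRU total, the helper
 * clamps the result to 0 rather than underflowing.
 */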
4016
4017
4018static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4019{
4020 unsigned long nr_pagecache_reclaimable;
4021 unsigned long delta = 0;
4022
 /*
  * If RECLAIM_UNMAP is set, all file pages are considered
  * potentially reclaimable. Otherwise, pages such as swapcache
  * complicate things and node_unmapped_file_pages() provides a
  * better estimate.
  */
4029 if (node_reclaim_mode & RECLAIM_UNMAP)
4030 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4031 else
4032 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4033
4034
4035 if (!(node_reclaim_mode & RECLAIM_WRITE))
4036 delta += node_page_state(pgdat, NR_FILE_DIRTY);
4037
4038
4039 if (unlikely(delta > nr_pagecache_reclaimable))
4040 delta = nr_pagecache_reclaimable;
4041
4042 return nr_pagecache_reclaimable - delta;
4043}
4044
/*
 * Try to free up some pages from this node through reclaim.
 */
4048static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4049{
4050
4051 const unsigned long nr_pages = 1 << order;
4052 struct task_struct *p = current;
4053 struct reclaim_state reclaim_state;
4054 unsigned int noreclaim_flag;
4055 struct scan_control sc = {
4056 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4057 .gfp_mask = current_gfp_context(gfp_mask),
4058 .order = order,
4059 .priority = NODE_RECLAIM_PRIORITY,
4060 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4061 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4062 .may_swap = 1,
4063 .reclaim_idx = gfp_zone(gfp_mask),
4064 };
4065
4066 cond_resched();
4067 fs_reclaim_acquire(sc.gfp_mask);
4068
 /*
  * We need to be able to allocate from the reserves for
  * RECLAIM_UNMAP and to write out pages for RECLAIM_WRITE and
  * RECLAIM_UNMAP.
  */
4073 noreclaim_flag = memalloc_noreclaim_save();
4074 p->flags |= PF_SWAPWRITE;
4075 reclaim_state.reclaimed_slab = 0;
4076 p->reclaim_state = &reclaim_state;
4077
4078 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4079
 /*
  * Free memory by calling shrink_node() with increasing
  * priorities until enough pages have been reclaimed.
  */
4083 do {
4084 shrink_node(pgdat, &sc);
4085 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4086 }
4087
4088 p->reclaim_state = NULL;
4089 current->flags &= ~PF_SWAPWRITE;
4090 memalloc_noreclaim_restore(noreclaim_flag);
4091 fs_reclaim_release(sc.gfp_mask);
4092 return sc.nr_reclaimed >= nr_pages;
4093}
4094
4095int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4096{
4097 int ret;
4098
 /*
  * Node reclaim reclaims unmapped file-backed pages and slab
  * pages if we are over the defined limits.
  *
  * A small amount of unmapped file-backed page cache is kept
  * for file I/O; otherwise pages read by file I/O would be
  * thrown out immediately on an overallocated node.
  */
4109 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4110 node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
4111 return NODE_RECLAIM_FULL;
4112
 /*
  * Do not scan if the allocation should not be delayed.
  */
4116 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4117 return NODE_RECLAIM_NOSCAN;
4118
 /*
  * Only run node reclaim on the local node or on nodes that do
  * not have associated processors. This favours the local
  * processor and spreads off-node memory allocations as wide as
  * possible.
  */
4125 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4126 return NODE_RECLAIM_NOSCAN;
4127
4128 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4129 return NODE_RECLAIM_NOSCAN;
4130
4131 ret = __node_reclaim(pgdat, gfp_mask, order);
4132 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4133
4134 if (!ret)
4135 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4136
4137 return ret;
4138}
4139#endif
4140
/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable, i.e. should be placed on the
 * active/inactive lists rather than the unevictable list.
 *
 * Reasons a page might not be evictable:
 * (1) the page's mapping is marked unevictable
 * (2) the page is part of an mlocked VMA
 */
4153int page_evictable(struct page *page)
4154{
4155 int ret;
4156
4157
4158 rcu_read_lock();
4159 ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
4160 rcu_read_unlock();
4161 return ret;
4162}
4163
4164#ifdef CONFIG_SHMEM
4165
/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * appropriate zone lru list
 * @pages:	array of pages to check
 * @nr_pages:	number of pages to check
 *
 * Checks pages for evictability and moves any page that has become
 * evictable from the unevictable list to the appropriate LRU list,
 * clearing PG_unevictable.
 */
4174void check_move_unevictable_pages(struct page **pages, int nr_pages)
4175{
4176 struct lruvec *lruvec;
4177 struct pglist_data *pgdat = NULL;
4178 int pgscanned = 0;
4179 int pgrescued = 0;
4180 int i;
4181
4182 for (i = 0; i < nr_pages; i++) {
4183 struct page *page = pages[i];
4184 struct pglist_data *pagepgdat = page_pgdat(page);
4185
4186 pgscanned++;
4187 if (pagepgdat != pgdat) {
4188 if (pgdat)
4189 spin_unlock_irq(&pgdat->lru_lock);
4190 pgdat = pagepgdat;
4191 spin_lock_irq(&pgdat->lru_lock);
4192 }
4193 lruvec = mem_cgroup_page_lruvec(page, pgdat);
4194
4195 if (!PageLRU(page) || !PageUnevictable(page))
4196 continue;
4197
4198 if (page_evictable(page)) {
4199 enum lru_list lru = page_lru_base_type(page);
4200
4201 VM_BUG_ON_PAGE(PageActive(page), page);
4202 ClearPageUnevictable(page);
4203 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
4204 add_page_to_lru_list(page, lruvec, lru);
4205 pgrescued++;
4206 }
4207 }
4208
4209 if (pgdat) {
4210 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4211 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4212 spin_unlock_irq(&pgdat->lru_lock);
4213 }
4214}
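
/*
 * The main user of check_move_unevictable_pages() is the shmem code, which
 * calls it when SHM_UNLOCK makes previously locked-in-core pages evictable
 * again (see shmem_unlock_mapping()); hence the CONFIG_SHMEM guard around
 * this helper.
 */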
4215#endif
4216