// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Kanoj Sarcar.
 */

15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17#include <linux/mm.h>
18#include <linux/sched/mm.h>
19#include <linux/module.h>
20#include <linux/gfp.h>
21#include <linux/kernel_stat.h>
22#include <linux/swap.h>
23#include <linux/pagemap.h>
24#include <linux/init.h>
25#include <linux/highmem.h>
26#include <linux/vmpressure.h>
27#include <linux/vmstat.h>
28#include <linux/file.h>
29#include <linux/writeback.h>
30#include <linux/blkdev.h>
31#include <linux/buffer_head.h>
32
33#include <linux/mm_inline.h>
34#include <linux/backing-dev.h>
35#include <linux/rmap.h>
36#include <linux/topology.h>
37#include <linux/cpu.h>
38#include <linux/cpuset.h>
39#include <linux/compaction.h>
40#include <linux/notifier.h>
41#include <linux/rwsem.h>
42#include <linux/delay.h>
43#include <linux/kthread.h>
44#include <linux/freezer.h>
45#include <linux/memcontrol.h>
46#include <linux/delayacct.h>
47#include <linux/sysctl.h>
48#include <linux/oom.h>
49#include <linux/pagevec.h>
50#include <linux/prefetch.h>
51#include <linux/printk.h>
52#include <linux/dax.h>
53#include <linux/psi.h>
54
55#include <asm/tlbflush.h>
56#include <asm/div64.h>
57
58#include <linux/swapops.h>
59#include <linux/balloon_compaction.h>
60
61#include "internal.h"
62
63#define CREATE_TRACE_POINTS
64#include <trace/events/vmscan.h>
65
struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t	*nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/* Can slab caches be shrunk as part of reclaim? */
	unsigned int may_shrinkslab:1;

	/*
	 * Cgroups are not reclaimed below their configured memory.low,
	 * unless we threaten to OOM. If any cgroups are skipped due to
	 * memory.low and nothing was reclaimed, go back for memory.low.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate pages for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* Per-call reclaim feedback used to adjust throttling/writeback */
	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;
};
135
136#ifdef ARCH_HAS_PREFETCH
137#define prefetch_prev_lru_page(_page, _base, _field) \
138 do { \
139 if ((_page)->lru.prev != _base) { \
140 struct page *prev; \
141 \
142 prev = lru_to_page(&(_page->lru)); \
143 prefetch(&prev->_field); \
144 } \
145 } while (0)
146#else
147#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
148#endif
149
150#ifdef ARCH_HAS_PREFETCHW
151#define prefetchw_prev_lru_page(_page, _base, _field) \
152 do { \
153 if ((_page)->lru.prev != _base) { \
154 struct page *prev; \
155 \
156 prev = lru_to_page(&(_page->lru)); \
157 prefetchw(&prev->_field); \
158 } \
159 } while (0)
160#else
161#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
162#endif
163
/*
 * From 0 .. 100.  Higher means more swappy.
 */
167int vm_swappiness = 60;
168
/*
 * The total number of pages which are beyond the high watermark within all
 * zones.
 */
172unsigned long vm_total_pages;
173
174static LIST_HEAD(shrinker_list);
175static DECLARE_RWSEM(shrinker_rwsem);
176
177#ifdef CONFIG_MEMCG_KMEM
/*
 * We allow subsystems to populate their shrinker-related
 * LRU lists before register_shrinker_prepared() is called
 * for the shrinker, because we don't want to impose
 * restrictions on their internal registration order.
 * In this case shrink_slab_memcg() may find the corresponding
 * bit already set in the per-memcg shrinker map.
 *
 * This value is used by shrink_slab_memcg() to detect shrinkers
 * that are still registering and to skip do_shrink_slab() calls
 * for them.
 */
190#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
191
192static DEFINE_IDR(shrinker_idr);
193static int shrinker_nr_max;
194
195static int prealloc_memcg_shrinker(struct shrinker *shrinker)
196{
197 int id, ret = -ENOMEM;
198
199 down_write(&shrinker_rwsem);
200
201 id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
202 if (id < 0)
203 goto unlock;
204
205 if (id >= shrinker_nr_max) {
206 if (memcg_expand_shrinker_maps(id)) {
207 idr_remove(&shrinker_idr, id);
208 goto unlock;
209 }
210
211 shrinker_nr_max = id + 1;
212 }
213 shrinker->id = id;
214 ret = 0;
215unlock:
216 up_write(&shrinker_rwsem);
217 return ret;
218}
219
220static void unregister_memcg_shrinker(struct shrinker *shrinker)
221{
222 int id = shrinker->id;
223
224 BUG_ON(id < 0);
225
226 down_write(&shrinker_rwsem);
227 idr_remove(&shrinker_idr, id);
228 up_write(&shrinker_rwsem);
229}
230#else
231static int prealloc_memcg_shrinker(struct shrinker *shrinker)
232{
233 return 0;
234}
235
236static void unregister_memcg_shrinker(struct shrinker *shrinker)
237{
238}
239#endif
240
241#ifdef CONFIG_MEMCG
242static bool global_reclaim(struct scan_control *sc)
243{
244 return !sc->target_mem_cgroup;
245}
246
/**
 * sane_reclaim - is the usual dirty throttling mechanism operational?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
260static bool sane_reclaim(struct scan_control *sc)
261{
262 struct mem_cgroup *memcg = sc->target_mem_cgroup;
263
264 if (!memcg)
265 return true;
266#ifdef CONFIG_CGROUP_WRITEBACK
267 if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
268 return true;
269#endif
270 return false;
271}
272
273static void set_memcg_congestion(pg_data_t *pgdat,
274 struct mem_cgroup *memcg,
275 bool congested)
276{
277 struct mem_cgroup_per_node *mn;
278
279 if (!memcg)
280 return;
281
282 mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
283 WRITE_ONCE(mn->congested, congested);
284}
285
286static bool memcg_congested(pg_data_t *pgdat,
287 struct mem_cgroup *memcg)
288{
289 struct mem_cgroup_per_node *mn;
290
291 mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
292 return READ_ONCE(mn->congested);
293
294}
295#else
296static bool global_reclaim(struct scan_control *sc)
297{
298 return true;
299}
300
301static bool sane_reclaim(struct scan_control *sc)
302{
303 return true;
304}
305
306static inline void set_memcg_congestion(struct pglist_data *pgdat,
307 struct mem_cgroup *memcg, bool congested)
308{
309}
310
311static inline bool memcg_congested(struct pglist_data *pgdat,
312 struct mem_cgroup *memcg)
313{
314 return false;
315
316}
317#endif
318
/*
 * This misses isolated pages which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
324unsigned long zone_reclaimable_pages(struct zone *zone)
325{
326 unsigned long nr;
327
328 nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
329 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
330 if (get_nr_swap_pages() > 0)
331 nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
332 zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
333
334 return nr;
335}
336
/**
 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 */
343unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
344{
345 unsigned long lru_size;
346 int zid;
347
348 if (!mem_cgroup_disabled())
349 lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
350 else
351 lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
352
353 for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
354 struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
355 unsigned long size;
356
357 if (!managed_zone(zone))
358 continue;
359
360 if (!mem_cgroup_disabled())
361 size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
362 else
363 size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
364 NR_ZONE_LRU_BASE + lru);
365 lru_size -= min(size, lru_size);
366 }
367
368 return lru_size;
369
370}
371
/*
 * Add a shrinker callback to be called from the vm.
 */
375int prealloc_shrinker(struct shrinker *shrinker)
376{
377 unsigned int size = sizeof(*shrinker->nr_deferred);
378
379 if (shrinker->flags & SHRINKER_NUMA_AWARE)
380 size *= nr_node_ids;
381
382 shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
383 if (!shrinker->nr_deferred)
384 return -ENOMEM;
385
386 if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
387 if (prealloc_memcg_shrinker(shrinker))
388 goto free_deferred;
389 }
390
391 return 0;
392
393free_deferred:
394 kfree(shrinker->nr_deferred);
395 shrinker->nr_deferred = NULL;
396 return -ENOMEM;
397}
398
399void free_prealloced_shrinker(struct shrinker *shrinker)
400{
401 if (!shrinker->nr_deferred)
402 return;
403
404 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
405 unregister_memcg_shrinker(shrinker);
406
407 kfree(shrinker->nr_deferred);
408 shrinker->nr_deferred = NULL;
409}
410
411void register_shrinker_prepared(struct shrinker *shrinker)
412{
413 down_write(&shrinker_rwsem);
414 list_add_tail(&shrinker->list, &shrinker_list);
415#ifdef CONFIG_MEMCG_KMEM
416 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
417 idr_replace(&shrinker_idr, shrinker, shrinker->id);
418#endif
419 up_write(&shrinker_rwsem);
420}
421
422int register_shrinker(struct shrinker *shrinker)
423{
424 int err = prealloc_shrinker(shrinker);
425
426 if (err)
427 return err;
428 register_shrinker_prepared(shrinker);
429 return 0;
430}
431EXPORT_SYMBOL(register_shrinker);
432
/*
 * Remove one shrinker.
 */
436void unregister_shrinker(struct shrinker *shrinker)
437{
438 if (!shrinker->nr_deferred)
439 return;
440 if (shrinker->flags & SHRINKER_MEMCG_AWARE)
441 unregister_memcg_shrinker(shrinker);
442 down_write(&shrinker_rwsem);
443 list_del(&shrinker->list);
444 up_write(&shrinker_rwsem);
445 kfree(shrinker->nr_deferred);
446 shrinker->nr_deferred = NULL;
447}
448EXPORT_SYMBOL(unregister_shrinker);
449
450#define SHRINK_BATCH 128
451
452static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
453 struct shrinker *shrinker, int priority)
454{
455 unsigned long freed = 0;
456 unsigned long long delta;
457 long total_scan;
458 long freeable;
459 long nr;
460 long new_nr;
461 int nid = shrinkctl->nid;
462 long batch_size = shrinker->batch ? shrinker->batch
463 : SHRINK_BATCH;
464 long scanned = 0, next_deferred;
465
466 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
467 nid = 0;
468
469 freeable = shrinker->count_objects(shrinker, shrinkctl);
470 if (freeable == 0 || freeable == SHRINK_EMPTY)
471 return freeable;
472
	/*
	 * copy the current shrinker scan count into a local variable
	 * and zero it so that other concurrent shrinker invocations
	 * don't also do this scanning work.
	 */
478 nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
479
480 total_scan = nr;
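	/*
	 * The scan delta is proportional to the reclaim priority:
	 * freeable >> priority objects, scaled by 4 / shrinker->seeks so
	 * that objects which are cheaper to recreate are trimmed harder.
	 */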
481 if (shrinker->seeks) {
482 delta = freeable >> priority;
483 delta *= 4;
484 do_div(delta, shrinker->seeks);
485 } else {
486
487
488
489
490
491 delta = freeable / 2;
492 }
493
494 total_scan += delta;
495 if (total_scan < 0) {
496 pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
497 shrinker->scan_objects, total_scan);
498 total_scan = freeable;
499 next_deferred = nr;
500 } else
501 next_deferred = total_scan;
502
503
504
505
506
507
508
509
510
511
512
513
514
515 if (delta < freeable / 4)
516 total_scan = min(total_scan, freeable / 2);
517
	/*
	 * Avoid risking looping forever due to too large nr value:
	 * never try to free more than twice the estimate number of
	 * freeable entries.
	 */
523 if (total_scan > freeable * 2)
524 total_scan = freeable * 2;
525
526 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
527 freeable, delta, total_scan, priority);
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544 while (total_scan >= batch_size ||
545 total_scan >= freeable) {
546 unsigned long ret;
547 unsigned long nr_to_scan = min(batch_size, total_scan);
548
549 shrinkctl->nr_to_scan = nr_to_scan;
550 shrinkctl->nr_scanned = nr_to_scan;
551 ret = shrinker->scan_objects(shrinker, shrinkctl);
552 if (ret == SHRINK_STOP)
553 break;
554 freed += ret;
555
556 count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
557 total_scan -= shrinkctl->nr_scanned;
558 scanned += shrinkctl->nr_scanned;
559
560 cond_resched();
561 }
562
563 if (next_deferred >= scanned)
564 next_deferred -= scanned;
565 else
566 next_deferred = 0;
567
	/*
	 * move the unused scan count back into the shrinker in a
	 * manner that handles concurrent updates.
	 */
572 if (next_deferred > 0)
573 new_nr = atomic_long_add_return(next_deferred,
574 &shrinker->nr_deferred[nid]);
575 else
576 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
577
578 trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
579 return freed;
580}
581
582#ifdef CONFIG_MEMCG_KMEM
583static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
584 struct mem_cgroup *memcg, int priority)
585{
586 struct memcg_shrinker_map *map;
587 unsigned long ret, freed = 0;
588 int i;
589
590 if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
591 return 0;
592
593 if (!down_read_trylock(&shrinker_rwsem))
594 return 0;
595
596 map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
597 true);
598 if (unlikely(!map))
599 goto unlock;
600
601 for_each_set_bit(i, map->map, shrinker_nr_max) {
602 struct shrink_control sc = {
603 .gfp_mask = gfp_mask,
604 .nid = nid,
605 .memcg = memcg,
606 };
607 struct shrinker *shrinker;
608
609 shrinker = idr_find(&shrinker_idr, i);
610 if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
611 if (!shrinker)
612 clear_bit(i, map->map);
613 continue;
614 }
615
616 ret = do_shrink_slab(&sc, shrinker, priority);
617 if (ret == SHRINK_EMPTY) {
618 clear_bit(i, map->map);
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634 smp_mb__after_atomic();
635 ret = do_shrink_slab(&sc, shrinker, priority);
636 if (ret == SHRINK_EMPTY)
637 ret = 0;
638 else
639 memcg_set_shrinker_bit(memcg, nid, i);
640 }
641 freed += ret;
642
643 if (rwsem_is_contended(&shrinker_rwsem)) {
644 freed = freed ? : 1;
645 break;
646 }
647 }
648unlock:
649 up_read(&shrinker_rwsem);
650 return freed;
651}
652#else
653static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
654 struct mem_cgroup *memcg, int priority)
655{
656 return 0;
657}
658#endif
659
/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @priority: the reclaim priority
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority, we take the number of objects and >> by priority
 * in order to get the scan target.
 *
 * Returns the number of reclaimed slab objects.
 */
680static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
681 struct mem_cgroup *memcg,
682 int priority)
683{
684 unsigned long ret, freed = 0;
685 struct shrinker *shrinker;
686
687 if (!mem_cgroup_is_root(memcg))
688 return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
689
690 if (!down_read_trylock(&shrinker_rwsem))
691 goto out;
692
693 list_for_each_entry(shrinker, &shrinker_list, list) {
694 struct shrink_control sc = {
695 .gfp_mask = gfp_mask,
696 .nid = nid,
697 .memcg = memcg,
698 };
699
700 ret = do_shrink_slab(&sc, shrinker, priority);
701 if (ret == SHRINK_EMPTY)
702 ret = 0;
703 freed += ret;
704
705
706
707
708
709 if (rwsem_is_contended(&shrinker_rwsem)) {
710 freed = freed ? : 1;
711 break;
712 }
713 }
714
715 up_read(&shrinker_rwsem);
716out:
717 cond_resched();
718 return freed;
719}
720
721void drop_slab_node(int nid)
722{
723 unsigned long freed;
724
725 do {
726 struct mem_cgroup *memcg = NULL;
727
728 freed = 0;
729 memcg = mem_cgroup_iter(NULL, NULL, NULL);
730 do {
731 freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
732 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
733 } while (freed > 10);
734}
735
736void drop_slab(void)
737{
738 int nid;
739
740 for_each_online_node(nid)
741 drop_slab_node(nid);
742}
743
744static inline int is_page_cache_freeable(struct page *page)
745{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache and optional buffer
	 * heads at page->private.
	 */
751 int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
752 HPAGE_PMD_NR : 1;
753 return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
754}
755
756static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
757{
758 if (current->flags & PF_SWAPWRITE)
759 return 1;
760 if (!inode_write_congested(inode))
761 return 1;
762 if (inode_to_bdi(inode) == current->backing_dev_info)
763 return 1;
764 return 0;
765}
766
/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller
 * has __GFP_FS.
 */
779static void handle_write_error(struct address_space *mapping,
780 struct page *page, int error)
781{
782 lock_page(page);
783 if (page_mapping(page) == mapping)
784 mapping_set_error(mapping, error);
785 unlock_page(page);
786}
787
/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
799
/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
804static pageout_t pageout(struct page *page, struct address_space *mapping,
805 struct scan_control *sc)
806{
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823 if (!is_page_cache_freeable(page))
824 return PAGE_KEEP;
825 if (!mapping) {
826
827
828
829
830 if (page_has_private(page)) {
831 if (try_to_free_buffers(page)) {
832 ClearPageDirty(page);
833 pr_info("%s: orphaned page\n", __func__);
834 return PAGE_CLEAN;
835 }
836 }
837 return PAGE_KEEP;
838 }
839 if (mapping->a_ops->writepage == NULL)
840 return PAGE_ACTIVATE;
841 if (!may_write_to_inode(mapping->host, sc))
842 return PAGE_KEEP;
843
844 if (clear_page_dirty_for_io(page)) {
845 int res;
846 struct writeback_control wbc = {
847 .sync_mode = WB_SYNC_NONE,
848 .nr_to_write = SWAP_CLUSTER_MAX,
849 .range_start = 0,
850 .range_end = LLONG_MAX,
851 .for_reclaim = 1,
852 };
853
854 SetPageReclaim(page);
855 res = mapping->a_ops->writepage(page, &wbc);
856 if (res < 0)
857 handle_write_error(mapping, page, res);
858 if (res == AOP_WRITEPAGE_ACTIVATE) {
859 ClearPageReclaim(page);
860 return PAGE_ACTIVATE;
861 }
862
863 if (!PageWriteback(page)) {
864
865 ClearPageReclaim(page);
866 }
867 trace_mm_vmscan_writepage(page);
868 inc_node_page_state(page, NR_VMSCAN_WRITE);
869 return PAGE_SUCCESS;
870 }
871
872 return PAGE_CLEAN;
873}
874
/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
879static int __remove_mapping(struct address_space *mapping, struct page *page,
880 bool reclaimed)
881{
882 unsigned long flags;
883 int refcount;
884
885 BUG_ON(!PageLocked(page));
886 BUG_ON(mapping != page_mapping(page));
887
888 xa_lock_irqsave(&mapping->i_pages, flags);
	/*
	 * The non racy check for a busy page.
	 *
	 * Must be careful with the order of the tests. When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference. So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_refcount.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under the i_pages lock, then this ordering is not required.
	 */
914 if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
915 refcount = 1 + HPAGE_PMD_NR;
916 else
917 refcount = 2;
918 if (!page_ref_freeze(page, refcount))
919 goto cannot_free;
920
921 if (unlikely(PageDirty(page))) {
922 page_ref_unfreeze(page, refcount);
923 goto cannot_free;
924 }
925
926 if (PageSwapCache(page)) {
927 swp_entry_t swap = { .val = page_private(page) };
928 mem_cgroup_swapout(page, swap);
929 __delete_from_swap_cache(page, swap);
930 xa_unlock_irqrestore(&mapping->i_pages, flags);
931 put_swap_page(page, swap);
932 } else {
933 void (*freepage)(struct page *);
934 void *shadow = NULL;
935
936 freepage = mapping->a_ops->freepage;
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953 if (reclaimed && page_is_file_cache(page) &&
954 !mapping_exiting(mapping) && !dax_mapping(mapping))
955 shadow = workingset_eviction(page);
956 __delete_from_page_cache(page, shadow);
957 xa_unlock_irqrestore(&mapping->i_pages, flags);
958
959 if (freepage != NULL)
960 freepage(page);
961 }
962
963 return 1;
964
965cannot_free:
966 xa_unlock_irqrestore(&mapping->i_pages, flags);
967 return 0;
968}
969
/*
 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 * someone else has a ref on the page, abort and return 0.  If it was
 * successfully detached, return 1.  Assumes the caller has a single ref on
 * this page.
 */
976int remove_mapping(struct address_space *mapping, struct page *page)
977{
978 if (__remove_mapping(mapping, page, false)) {
979
980
981
982
983
984 page_ref_unfreeze(page, 1);
985 return 1;
986 }
987 return 0;
988}
989
/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
999void putback_lru_page(struct page *page)
1000{
1001 lru_cache_add(page);
1002 put_page(page);
1003}
1004
1005enum page_references {
1006 PAGEREF_RECLAIM,
1007 PAGEREF_RECLAIM_CLEAN,
1008 PAGEREF_KEEP,
1009 PAGEREF_ACTIVATE,
1010};
1011
1012static enum page_references page_check_references(struct page *page,
1013 struct scan_control *sc)
1014{
1015 int referenced_ptes, referenced_page;
1016 unsigned long vm_flags;
1017
1018 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
1019 &vm_flags);
1020 referenced_page = TestClearPageReferenced(page);
1021
1022
1023
1024
1025
1026 if (vm_flags & VM_LOCKED)
1027 return PAGEREF_RECLAIM;
1028
1029 if (referenced_ptes) {
1030 if (PageSwapBacked(page))
1031 return PAGEREF_ACTIVATE;
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list.  Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
1046 SetPageReferenced(page);
1047
1048 if (referenced_page || referenced_ptes > 1)
1049 return PAGEREF_ACTIVATE;
1050
1051
1052
1053
1054 if (vm_flags & VM_EXEC)
1055 return PAGEREF_ACTIVATE;
1056
1057 return PAGEREF_KEEP;
1058 }
1059
1060
1061 if (referenced_page && !PageSwapBacked(page))
1062 return PAGEREF_RECLAIM_CLEAN;
1063
1064 return PAGEREF_RECLAIM;
1065}
1066
1067
1068static void page_check_dirty_writeback(struct page *page,
1069 bool *dirty, bool *writeback)
1070{
1071 struct address_space *mapping;
1072
1073
1074
1075
1076
1077 if (!page_is_file_cache(page) ||
1078 (PageAnon(page) && !PageSwapBacked(page))) {
1079 *dirty = false;
1080 *writeback = false;
1081 return;
1082 }
1083
1084
1085 *dirty = PageDirty(page);
1086 *writeback = PageWriteback(page);
1087
1088
1089 if (!page_has_private(page))
1090 return;
1091
1092 mapping = page_mapping(page);
1093 if (mapping && mapping->a_ops->is_dirty_writeback)
1094 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1095}
1096
/*
 * shrink_page_list() returns the number of reclaimed pages
 */
1100static unsigned long shrink_page_list(struct list_head *page_list,
1101 struct pglist_data *pgdat,
1102 struct scan_control *sc,
1103 enum ttu_flags ttu_flags,
1104 struct reclaim_stat *stat,
1105 bool force_reclaim)
1106{
1107 LIST_HEAD(ret_pages);
1108 LIST_HEAD(free_pages);
1109 unsigned nr_reclaimed = 0;
1110 unsigned pgactivate = 0;
1111
1112 memset(stat, 0, sizeof(*stat));
1113 cond_resched();
1114
1115 while (!list_empty(page_list)) {
1116 struct address_space *mapping;
1117 struct page *page;
1118 int may_enter_fs;
1119 enum page_references references = PAGEREF_RECLAIM_CLEAN;
1120 bool dirty, writeback;
1121
1122 cond_resched();
1123
1124 page = lru_to_page(page_list);
1125 list_del(&page->lru);
1126
1127 if (!trylock_page(page))
1128 goto keep;
1129
1130 VM_BUG_ON_PAGE(PageActive(page), page);
1131
1132 sc->nr_scanned++;
1133
1134 if (unlikely(!page_evictable(page)))
1135 goto activate_locked;
1136
1137 if (!sc->may_unmap && page_mapped(page))
1138 goto keep_locked;
1139
1140
1141 if ((page_mapped(page) || PageSwapCache(page)) &&
1142 !(PageAnon(page) && !PageSwapBacked(page)))
1143 sc->nr_scanned++;
1144
1145 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1146 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1147
1148
1149
1150
1151
1152
1153
1154 page_check_dirty_writeback(page, &dirty, &writeback);
1155 if (dirty || writeback)
1156 stat->nr_dirty++;
1157
1158 if (dirty && !writeback)
1159 stat->nr_unqueued_dirty++;
1160
1161
1162
1163
1164
1165
1166
1167 mapping = page_mapping(page);
1168 if (((dirty || writeback) && mapping &&
1169 inode_write_congested(mapping->host)) ||
1170 (writeback && PageReclaim(page)))
1171 stat->nr_congested++;
1172
		/*
		 * If a page at the tail of the LRU is under writeback, there
		 * are three cases to consider.
		 *
		 * 1) If reclaim is encountering an excessive number of pages
		 *    under writeback and this page has both the writeback and
		 *    reclaim flags set, pages are being queued for IO but are
		 *    being recycled through the LRU before the IO can
		 *    complete.  Only kswapd stalls here (counting the page in
		 *    nr_immediate) so that it throttles without risking a
		 *    deadlock for direct reclaimers.
		 *
		 * 2) Global or new memcg reclaim encounters a page that is
		 *    not marked for immediate reclaim, or the caller does not
		 *    have __GFP_FS (or __GFP_IO if it will swap).  Waiting
		 *    here could deadlock against the flusher, so mark the
		 *    page for immediate reclaim and keep scanning.
		 *
		 * 3) Legacy memcg encounters a page that already has the
		 *    reclaim flag set.  memcg does not have any dirty page
		 *    throttling, so it could easily OOM just because too many
		 *    pages are in writeback and there is nothing else to
		 *    reclaim.  Wait for the writeback to complete.
		 */
1215 if (PageWriteback(page)) {
1216
1217 if (current_is_kswapd() &&
1218 PageReclaim(page) &&
1219 test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1220 stat->nr_immediate++;
1221 goto activate_locked;
1222
1223
1224 } else if (sane_reclaim(sc) ||
1225 !PageReclaim(page) || !may_enter_fs) {
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237 SetPageReclaim(page);
1238 stat->nr_writeback++;
1239 goto activate_locked;
1240
1241
1242 } else {
1243 unlock_page(page);
1244 wait_on_page_writeback(page);
1245
1246 list_add_tail(&page->lru, page_list);
1247 continue;
1248 }
1249 }
1250
1251 if (!force_reclaim)
1252 references = page_check_references(page, sc);
1253
1254 switch (references) {
1255 case PAGEREF_ACTIVATE:
1256 goto activate_locked;
1257 case PAGEREF_KEEP:
1258 stat->nr_ref_keep++;
1259 goto keep_locked;
1260 case PAGEREF_RECLAIM:
1261 case PAGEREF_RECLAIM_CLEAN:
1262 ;
1263 }
1264
1265
1266
1267
1268
1269
1270 if (PageAnon(page) && PageSwapBacked(page)) {
1271 if (!PageSwapCache(page)) {
1272 if (!(sc->gfp_mask & __GFP_IO))
1273 goto keep_locked;
1274 if (PageTransHuge(page)) {
1275
1276 if (!can_split_huge_page(page, NULL))
1277 goto activate_locked;
1278
1279
1280
1281
1282
1283 if (!compound_mapcount(page) &&
1284 split_huge_page_to_list(page,
1285 page_list))
1286 goto activate_locked;
1287 }
1288 if (!add_to_swap(page)) {
1289 if (!PageTransHuge(page))
1290 goto activate_locked;
1291
1292 if (split_huge_page_to_list(page,
1293 page_list))
1294 goto activate_locked;
1295#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1296 count_vm_event(THP_SWPOUT_FALLBACK);
1297#endif
1298 if (!add_to_swap(page))
1299 goto activate_locked;
1300 }
1301
1302 may_enter_fs = 1;
1303
1304
1305 mapping = page_mapping(page);
1306 }
1307 } else if (unlikely(PageTransHuge(page))) {
1308
1309 if (split_huge_page_to_list(page, page_list))
1310 goto keep_locked;
1311 }
1312
1313
1314
1315
1316
1317 if (page_mapped(page)) {
1318 enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
1319
1320 if (unlikely(PageTransHuge(page)))
1321 flags |= TTU_SPLIT_HUGE_PMD;
1322 if (!try_to_unmap(page, flags)) {
1323 stat->nr_unmap_fail++;
1324 goto activate_locked;
1325 }
1326 }
1327
1328 if (PageDirty(page)) {
			/*
			 * Only kswapd can writeback filesystem pages
			 * to avoid risk of stack overflow.  But avoid
			 * injecting inefficient single-page IO into
			 * flusher writeback as much as possible: only
			 * write pages when we've encountered many
			 * dirty pages, and when we've already scanned
			 * the rest of the LRU for clean pages and see
			 * the same dirty pages again (PageReclaim).
			 */
1339 if (page_is_file_cache(page) &&
1340 (!current_is_kswapd() || !PageReclaim(page) ||
1341 !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1342
1343
1344
1345
1346
1347
1348 inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1349 SetPageReclaim(page);
1350
1351 goto activate_locked;
1352 }
1353
1354 if (references == PAGEREF_RECLAIM_CLEAN)
1355 goto keep_locked;
1356 if (!may_enter_fs)
1357 goto keep_locked;
1358 if (!sc->may_writepage)
1359 goto keep_locked;
1360
1361
1362
1363
1364
1365
1366 try_to_unmap_flush_dirty();
1367 switch (pageout(page, mapping, sc)) {
1368 case PAGE_KEEP:
1369 goto keep_locked;
1370 case PAGE_ACTIVATE:
1371 goto activate_locked;
1372 case PAGE_SUCCESS:
1373 if (PageWriteback(page))
1374 goto keep;
1375 if (PageDirty(page))
1376 goto keep;
1377
1378
1379
1380
1381
1382 if (!trylock_page(page))
1383 goto keep;
1384 if (PageDirty(page) || PageWriteback(page))
1385 goto keep_locked;
1386 mapping = page_mapping(page);
1387 case PAGE_CLEAN:
1388 ;
1389 }
1390 }
1391
		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page.  If we succeed we try to free
		 * the page as well.
		 *
		 * We do this even if the page is PageDirty().
		 * try_to_release_page() does not perform I/O, but it is
		 * possible for a page to have PageDirty set, but it is
		 * actually clean (all its buffers are clean).  This happens
		 * if the buffers were written out directly, with submit_bh():
		 * the page can then be found clean, its buffers dropped, and
		 * freed.
		 *
		 * Rarely, pages can have buffers and no ->mapping.  These are
		 * the pages which were not successfully invalidated in
		 * truncate_complete_page().  We try to drop those buffers
		 * here and if that worked, and the page is no longer mapped
		 * into process address space (page_count == 1) it can be
		 * freed.  Otherwise, leave the page on the LRU so it is
		 * swappable.
		 */
1413 if (page_has_private(page)) {
1414 if (!try_to_release_page(page, sc->gfp_mask))
1415 goto activate_locked;
1416 if (!mapping && page_count(page) == 1) {
1417 unlock_page(page);
1418 if (put_page_testzero(page))
1419 goto free_it;
1420 else {
1421
1422
1423
1424
1425
1426
1427
1428 nr_reclaimed++;
1429 continue;
1430 }
1431 }
1432 }
1433
1434 if (PageAnon(page) && !PageSwapBacked(page)) {
1435
1436 if (!page_ref_freeze(page, 1))
1437 goto keep_locked;
1438 if (PageDirty(page)) {
1439 page_ref_unfreeze(page, 1);
1440 goto keep_locked;
1441 }
1442
1443 count_vm_event(PGLAZYFREED);
1444 count_memcg_page_event(page, PGLAZYFREED);
1445 } else if (!mapping || !__remove_mapping(mapping, page, true))
1446 goto keep_locked;
1447
1448 unlock_page(page);
1449free_it:
1450 nr_reclaimed++;
1451
1452
1453
1454
1455
1456 if (unlikely(PageTransHuge(page))) {
1457 mem_cgroup_uncharge(page);
1458 (*get_compound_page_dtor(page))(page);
1459 } else
1460 list_add(&page->lru, &free_pages);
1461 continue;
1462
1463activate_locked:
1464
1465 if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1466 PageMlocked(page)))
1467 try_to_free_swap(page);
1468 VM_BUG_ON_PAGE(PageActive(page), page);
1469 if (!PageMlocked(page)) {
1470 int type = page_is_file_cache(page);
1471 SetPageActive(page);
1472 pgactivate++;
1473 stat->nr_activate[type] += hpage_nr_pages(page);
1474 count_memcg_page_event(page, PGACTIVATE);
1475 }
1476keep_locked:
1477 unlock_page(page);
1478keep:
1479 list_add(&page->lru, &ret_pages);
1480 VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1481 }
1482
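	/*
	 * Finish off the pages we decided to free: complete any deferred
	 * TLB flushes from try_to_unmap() and release them in one batch.
	 */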
1483 mem_cgroup_uncharge_list(&free_pages);
1484 try_to_unmap_flush();
1485 free_unref_page_list(&free_pages);
1486
1487 list_splice(&ret_pages, page_list);
1488 count_vm_events(PGACTIVATE, pgactivate);
1489
1490 return nr_reclaimed;
1491}
1492
1493unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1494 struct list_head *page_list)
1495{
1496 struct scan_control sc = {
1497 .gfp_mask = GFP_KERNEL,
1498 .priority = DEF_PRIORITY,
1499 .may_unmap = 1,
1500 };
1501 struct reclaim_stat dummy_stat;
1502 unsigned long ret;
1503 struct page *page, *next;
1504 LIST_HEAD(clean_pages);
1505
1506 list_for_each_entry_safe(page, next, page_list, lru) {
1507 if (page_is_file_cache(page) && !PageDirty(page) &&
1508 !__PageMovable(page) && !PageUnevictable(page)) {
1509 ClearPageActive(page);
1510 list_move(&page->lru, &clean_pages);
1511 }
1512 }
1513
1514 ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1515 TTU_IGNORE_ACCESS, &dummy_stat, true);
1516 list_splice(&clean_pages, page_list);
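	/* The pages that shrink_page_list() freed are no longer isolated. */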
1517 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
1518 return ret;
1519}
1520
/*
 * Attempt to remove the specified page from its LRU.  Only take this page
 * if it is of the appropriate PageActive status.  Pages which are being
 * freed elsewhere are also ignored.
 *
 * page:	page to consider
 * mode:	one of the LRU isolation modes defined above
 *
 * returns 0 on success, -ve errno on failure.
 */
1531int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1532{
1533 int ret = -EINVAL;
1534
1535
1536 if (!PageLRU(page))
1537 return ret;
1538
1539
1540 if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1541 return ret;
1542
1543 ret = -EBUSY;
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553 if (mode & ISOLATE_ASYNC_MIGRATE) {
1554
1555 if (PageWriteback(page))
1556 return ret;
1557
1558 if (PageDirty(page)) {
1559 struct address_space *mapping;
1560 bool migrate_dirty;
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571 if (!trylock_page(page))
1572 return ret;
1573
1574 mapping = page_mapping(page);
1575 migrate_dirty = !mapping || mapping->a_ops->migratepage;
1576 unlock_page(page);
1577 if (!migrate_dirty)
1578 return ret;
1579 }
1580 }
1581
1582 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1583 return ret;
1584
1585 if (likely(get_page_unless_zero(page))) {
1586
1587
1588
1589
1590
1591 ClearPageLRU(page);
1592 ret = 0;
1593 }
1594
1595 return ret;
1596}
1597
1598
1599
1600
1601
1602
1603static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1604 enum lru_list lru, unsigned long *nr_zone_taken)
1605{
1606 int zid;
1607
1608 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1609 if (!nr_zone_taken[zid])
1610 continue;
1611
1612 __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1613#ifdef CONFIG_MEMCG
1614 mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1615#endif
1616 }
1617
1618}
1619
/*
 * pgdat->lru_lock is heavily contended.  Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * Appropriate locks must be held before calling this function.
 *
 * @nr_to_scan:	The number of eligible pages to look through on the list.
 * @lruvec:	The LRU vector to pull pages from.
 * @dst:	The temp list to put pages on to.
 * @nr_scanned:	The number of pages that were scanned.
 * @sc:		The scan_control struct for this reclaim session
 * @lru:	LRU list id for isolating
 *
 * returns how many pages were moved onto *@dst.
 */
1640static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1641 struct lruvec *lruvec, struct list_head *dst,
1642 unsigned long *nr_scanned, struct scan_control *sc,
1643 enum lru_list lru)
1644{
1645 struct list_head *src = &lruvec->lists[lru];
1646 unsigned long nr_taken = 0;
1647 unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1648 unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1649 unsigned long skipped = 0;
1650 unsigned long scan, total_scan, nr_pages;
1651 LIST_HEAD(pages_skipped);
1652 isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
1653
1654 scan = 0;
1655 for (total_scan = 0;
1656 scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
1657 total_scan++) {
1658 struct page *page;
1659
1660 page = lru_to_page(src);
1661 prefetchw_prev_lru_page(page, src, flags);
1662
1663 VM_BUG_ON_PAGE(!PageLRU(page), page);
1664
1665 if (page_zonenum(page) > sc->reclaim_idx) {
1666 list_move(&page->lru, &pages_skipped);
1667 nr_skipped[page_zonenum(page)]++;
1668 continue;
1669 }
1670
1671
1672
1673
1674
1675
1676
1677 scan++;
1678 switch (__isolate_lru_page(page, mode)) {
1679 case 0:
1680 nr_pages = hpage_nr_pages(page);
1681 nr_taken += nr_pages;
1682 nr_zone_taken[page_zonenum(page)] += nr_pages;
1683 list_move(&page->lru, dst);
1684 break;
1685
1686 case -EBUSY:
1687
1688 list_move(&page->lru, src);
1689 continue;
1690
1691 default:
1692 BUG();
1693 }
1694 }
1695
	/*
	 * Splice any skipped pages to the start of the LRU list. Note that
	 * this disrupts the LRU order when reclaiming for lower zones but
	 * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
	 * scanning would soon rescan the same pages to skip and put the
	 * system at risk of premature OOM.
	 */
1703 if (!list_empty(&pages_skipped)) {
1704 int zid;
1705
1706 list_splice(&pages_skipped, src);
1707 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1708 if (!nr_skipped[zid])
1709 continue;
1710
1711 __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1712 skipped += nr_skipped[zid];
1713 }
1714 }
1715 *nr_scanned = total_scan;
1716 trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1717 total_scan, skipped, nr_taken, mode, lru);
1718 update_lru_sizes(lruvec, lru, nr_zone_taken);
1719 return nr_taken;
1720}
1721
/**
 * isolate_lru_page - tries to isolate a page from its LRU list
 * @page: page to isolate from its LRU list
 *
 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
 * vmstat statistic corresponding to whatever LRU list the page was on.
 *
 * Returns 0 if the page was removed from an LRU list.
 * Returns -EBUSY if the page was not on an LRU list.
 *
 * The returned page will have PageLRU() cleared.  If it was found on
 * the active list, it will have PageActive set.  If it was found on
 * the unevictable list, it will have the PageUnevictable bit set. That flag
 * may need to be cleared by the caller before letting the page go.
 *
 * The vmstat statistic corresponding to the list on which the page was
 * found will be decremented.
 *
 * Restrictions:
 *
 * (1) Must be called with an elevated refcount on the page. This is a
 *     fundamental difference from isolate_lru_pages (which is called
 *     without a stable reference).
 * (2) the lru_lock must not be held.
 * (3) interrupts must be enabled.
 */
1748int isolate_lru_page(struct page *page)
1749{
1750 int ret = -EBUSY;
1751
1752 VM_BUG_ON_PAGE(!page_count(page), page);
1753 WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1754
1755 if (PageLRU(page)) {
1756 pg_data_t *pgdat = page_pgdat(page);
1757 struct lruvec *lruvec;
1758
1759 spin_lock_irq(&pgdat->lru_lock);
1760 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1761 if (PageLRU(page)) {
1762 int lru = page_lru(page);
1763 get_page(page);
1764 ClearPageLRU(page);
1765 del_page_from_lru_list(page, lruvec, lru);
1766 ret = 0;
1767 }
1768 spin_unlock_irq(&pgdat->lru_lock);
1769 }
1770 return ret;
1771}
1772
/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there are massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
 * the LRU list will go small and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
1780static int too_many_isolated(struct pglist_data *pgdat, int file,
1781 struct scan_control *sc)
1782{
1783 unsigned long inactive, isolated;
1784
1785 if (current_is_kswapd())
1786 return 0;
1787
1788 if (!sane_reclaim(sc))
1789 return 0;
1790
1791 if (file) {
1792 inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1793 isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1794 } else {
1795 inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1796 isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1797 }
1798
1799
1800
1801
1802
1803
1804 if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1805 inactive >>= 3;
1806
1807 return isolated > inactive;
1808}
1809
/*
 * This moves pages from @list to corresponding LRU list.
 *
 * We move them the other way if the page is referenced by one or more
 * processes, from rmap.
 *
 * If the pages are mostly unmapped, the processing is fast and it is
 * appropriate to hold lru_lock across the whole operation.  But if
 * the pages are mapped, the processing is slow (page_referenced()) so we
 * should drop lru_lock around each page.  It's impossible to balance
 * this, so instead, we remove the pages from the LRU while processing them.
 * It is safe to rely on PG_active against the non-LRU pages in here because
 * nobody will play with that bit on a non-LRU page.
 *
 * The downside is that we have to touch page->_refcount against each page.
 * But we had to alter page->flags anyway.
 *
 * Returns the number of pages moved to the given lruvec.
 */
1830static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
1831 struct list_head *list)
1832{
1833 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1834 int nr_pages, nr_moved = 0;
1835 LIST_HEAD(pages_to_free);
1836 struct page *page;
1837 enum lru_list lru;
1838
1839 while (!list_empty(list)) {
1840 page = lru_to_page(list);
1841 VM_BUG_ON_PAGE(PageLRU(page), page);
1842 if (unlikely(!page_evictable(page))) {
1843 list_del(&page->lru);
1844 spin_unlock_irq(&pgdat->lru_lock);
1845 putback_lru_page(page);
1846 spin_lock_irq(&pgdat->lru_lock);
1847 continue;
1848 }
1849 lruvec = mem_cgroup_page_lruvec(page, pgdat);
1850
1851 SetPageLRU(page);
1852 lru = page_lru(page);
1853
1854 nr_pages = hpage_nr_pages(page);
1855 update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
1856 list_move(&page->lru, &lruvec->lists[lru]);
1857
1858 if (put_page_testzero(page)) {
1859 __ClearPageLRU(page);
1860 __ClearPageActive(page);
1861 del_page_from_lru_list(page, lruvec, lru);
1862
1863 if (unlikely(PageCompound(page))) {
1864 spin_unlock_irq(&pgdat->lru_lock);
1865 mem_cgroup_uncharge(page);
1866 (*get_compound_page_dtor(page))(page);
1867 spin_lock_irq(&pgdat->lru_lock);
1868 } else
1869 list_add(&page->lru, &pages_to_free);
1870 } else {
1871 nr_moved += nr_pages;
1872 }
1873 }
1874
1875
1876
1877
1878 list_splice(&pages_to_free, list);
1879
1880 return nr_moved;
1881}
1882
1883
1884
1885
1886
1887
1888
1889static int current_may_throttle(void)
1890{
1891 return !(current->flags & PF_LESS_THROTTLE) ||
1892 current->backing_dev_info == NULL ||
1893 bdi_write_congested(current->backing_dev_info);
1894}
1895
/*
 * shrink_inactive_list() is a helper for shrink_node().  It returns the number
 * of reclaimed pages.
 */
1900static noinline_for_stack unsigned long
1901shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1902 struct scan_control *sc, enum lru_list lru)
1903{
1904 LIST_HEAD(page_list);
1905 unsigned long nr_scanned;
1906 unsigned long nr_reclaimed = 0;
1907 unsigned long nr_taken;
1908 struct reclaim_stat stat;
1909 int file = is_file_lru(lru);
1910 enum vm_event_item item;
1911 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1912 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1913 bool stalled = false;
1914
1915 while (unlikely(too_many_isolated(pgdat, file, sc))) {
1916 if (stalled)
1917 return 0;
1918
1919
1920 msleep(100);
1921 stalled = true;
1922
1923
1924 if (fatal_signal_pending(current))
1925 return SWAP_CLUSTER_MAX;
1926 }
1927
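	/* Flush the per-cpu pagevecs so the pages we want are on the LRU. */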
1928 lru_add_drain();
1929
1930 spin_lock_irq(&pgdat->lru_lock);
1931
1932 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1933 &nr_scanned, sc, lru);
1934
1935 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1936 reclaim_stat->recent_scanned[file] += nr_taken;
1937
1938 item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
1939 if (global_reclaim(sc))
1940 __count_vm_events(item, nr_scanned);
1941 __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
1942 spin_unlock_irq(&pgdat->lru_lock);
1943
1944 if (nr_taken == 0)
1945 return 0;
1946
1947 nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
1948 &stat, false);
1949
1950 spin_lock_irq(&pgdat->lru_lock);
1951
1952 item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
1953 if (global_reclaim(sc))
1954 __count_vm_events(item, nr_reclaimed);
1955 __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
1956 reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
1957 reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
1958
1959 move_pages_to_lru(lruvec, &page_list);
1960
1961 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1962
1963 spin_unlock_irq(&pgdat->lru_lock);
1964
1965 mem_cgroup_uncharge_list(&page_list);
1966 free_unref_page_list(&page_list);
1967
	/*
	 * If dirty pages are scanned that are not queued for IO, it
	 * implies that flushers are not doing their job. This can
	 * happen when memory pressure pushes dirty pages to the end of
	 * the LRU before the dirty limits are breached and the dirty
	 * data has expired. It can also happen when the proportion of
	 * dirty pages grows not through writes but through memory
	 * pressure reclaiming all the clean cache. And in some cases,
	 * the flushers simply cannot keep up with the allocation
	 * rate. Nudge the flusher threads in case they are asleep.
	 */
1979 if (stat.nr_unqueued_dirty == nr_taken)
1980 wakeup_flusher_threads(WB_REASON_VMSCAN);
1981
1982 sc->nr.dirty += stat.nr_dirty;
1983 sc->nr.congested += stat.nr_congested;
1984 sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
1985 sc->nr.writeback += stat.nr_writeback;
1986 sc->nr.immediate += stat.nr_immediate;
1987 sc->nr.taken += nr_taken;
1988 if (file)
1989 sc->nr.file_taken += nr_taken;
1990
1991 trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
1992 nr_scanned, nr_reclaimed, &stat, sc->priority, file);
1993 return nr_reclaimed;
1994}
1995
1996static void shrink_active_list(unsigned long nr_to_scan,
1997 struct lruvec *lruvec,
1998 struct scan_control *sc,
1999 enum lru_list lru)
2000{
2001 unsigned long nr_taken;
2002 unsigned long nr_scanned;
2003 unsigned long vm_flags;
2004 LIST_HEAD(l_hold);
2005 LIST_HEAD(l_active);
2006 LIST_HEAD(l_inactive);
2007 struct page *page;
2008 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2009 unsigned nr_deactivate, nr_activate;
2010 unsigned nr_rotated = 0;
2011 int file = is_file_lru(lru);
2012 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2013
2014 lru_add_drain();
2015
2016 spin_lock_irq(&pgdat->lru_lock);
2017
2018 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2019 &nr_scanned, sc, lru);
2020
2021 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2022 reclaim_stat->recent_scanned[file] += nr_taken;
2023
2024 __count_vm_events(PGREFILL, nr_scanned);
2025 __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2026
2027 spin_unlock_irq(&pgdat->lru_lock);
2028
2029 while (!list_empty(&l_hold)) {
2030 cond_resched();
2031 page = lru_to_page(&l_hold);
2032 list_del(&page->lru);
2033
2034 if (unlikely(!page_evictable(page))) {
2035 putback_lru_page(page);
2036 continue;
2037 }
2038
2039 if (unlikely(buffer_heads_over_limit)) {
2040 if (page_has_private(page) && trylock_page(page)) {
2041 if (page_has_private(page))
2042 try_to_release_page(page, 0);
2043 unlock_page(page);
2044 }
2045 }
2046
2047 if (page_referenced(page, 0, sc->target_mem_cgroup,
2048 &vm_flags)) {
2049 nr_rotated += hpage_nr_pages(page);
			/*
			 * Identify referenced, file-backed active pages and
			 * give them one more trip around the active list. So
			 * that executable code gets better chances to stay in
			 * memory under moderate memory pressure.  Anon pages
			 * are not likely to be evicted by use-once streaming
			 * IO, plus JVM can create lots of anon VM_EXEC pages,
			 * so we ignore them here.
			 */
2059 if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
2060 list_add(&page->lru, &l_active);
2061 continue;
2062 }
2063 }
2064
2065 ClearPageActive(page);
2066 SetPageWorkingset(page);
2067 list_add(&page->lru, &l_inactive);
2068 }
2069
2070
2071
2072
2073 spin_lock_irq(&pgdat->lru_lock);
2074
2075
2076
2077
2078
2079
2080 reclaim_stat->recent_rotated[file] += nr_rotated;
2081
2082 nr_activate = move_pages_to_lru(lruvec, &l_active);
2083 nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2084
2085 list_splice(&l_inactive, &l_active);
2086
2087 __count_vm_events(PGDEACTIVATE, nr_deactivate);
2088 __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2089
2090 __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2091 spin_unlock_irq(&pgdat->lru_lock);
2092
2093 mem_cgroup_uncharge_list(&l_active);
2094 free_unref_page_list(&l_active);
2095 trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2096 nr_deactivate, nr_rotated, sc->priority, file);
2097}
2098
/*
 * The inactive anon list should be small enough that the VM never has
 * to do too much work.
 *
 * The inactive file list should be small enough to leave most memory
 * to the established workingset on the scan-resistant active list,
 * but large enough to avoid thrashing the aggregate readahead window.
 *
 * Both inactive lists should also be large enough that each inactive
 * page has a chance to be referenced again before it is reclaimed.
 *
 * If that fails and refaulting is observed, the inactive list grows.
 *
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
 * on this LRU, maintained by the pageout code. An inactive_ratio
 * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
 *
 * total     target    max
 * memory    ratio     inactive
 * -------------------------------------
 *   10MB       1         5MB
 *  100MB       1        50MB
 *    1GB       3       250MB
 *   10GB      10       0.9GB
 *  100GB      31       3GB
 *    1TB     101      10GB
 *   10TB     320      32GB
 */
2127static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
2128 struct scan_control *sc, bool actual_reclaim)
2129{
2130 enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
2131 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2132 enum lru_list inactive_lru = file * LRU_FILE;
2133 unsigned long inactive, active;
2134 unsigned long inactive_ratio;
2135 unsigned long refaults;
2136 unsigned long gb;
2137
2138
2139
2140
2141
2142 if (!file && !total_swap_pages)
2143 return false;
2144
2145 inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
2146 active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
2147
2148
2149
2150
2151
2152
2153 refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
2154 if (file && actual_reclaim && lruvec->refaults != refaults) {
2155 inactive_ratio = 0;
2156 } else {
2157 gb = (inactive + active) >> (30 - PAGE_SHIFT);
2158 if (gb)
2159 inactive_ratio = int_sqrt(10 * gb);
2160 else
2161 inactive_ratio = 1;
2162 }
2163
2164 if (actual_reclaim)
2165 trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
2166 lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
2167 lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
2168 inactive_ratio, file);
2169
2170 return inactive * inactive_ratio < active;
2171}
2172
2173static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2174 struct lruvec *lruvec, struct scan_control *sc)
2175{
2176 if (is_active_lru(lru)) {
2177 if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
2178 shrink_active_list(nr_to_scan, lruvec, sc, lru);
2179 return 0;
2180 }
2181
2182 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2183}
2184
2185enum scan_balance {
2186 SCAN_EQUAL,
2187 SCAN_FRACT,
2188 SCAN_ANON,
2189 SCAN_FILE,
2190};
2191
/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned.  The relative value of each set of LRU lists is determined
 * by looking at the fraction of the pages scanned we did rotate back
 * onto the active list instead of evict.
 *
 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
 */
2201static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
2202 struct scan_control *sc, unsigned long *nr,
2203 unsigned long *lru_pages)
2204{
2205 int swappiness = mem_cgroup_swappiness(memcg);
2206 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
2207 u64 fraction[2];
2208 u64 denominator = 0;
2209 struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2210 unsigned long anon_prio, file_prio;
2211 enum scan_balance scan_balance;
2212 unsigned long anon, file;
2213 unsigned long ap, fp;
2214 enum lru_list lru;
2215
2216
2217 if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2218 scan_balance = SCAN_FILE;
2219 goto out;
2220 }
2221
2222
2223
2224
2225
2226
2227
2228
2229 if (!global_reclaim(sc) && !swappiness) {
2230 scan_balance = SCAN_FILE;
2231 goto out;
2232 }
2233
2234
2235
2236
2237
2238
2239 if (!sc->priority && swappiness) {
2240 scan_balance = SCAN_EQUAL;
2241 goto out;
2242 }
2243
	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
2253 if (global_reclaim(sc)) {
2254 unsigned long pgdatfile;
2255 unsigned long pgdatfree;
2256 int z;
2257 unsigned long total_high_wmark = 0;
2258
2259 pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2260 pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
2261 node_page_state(pgdat, NR_INACTIVE_FILE);
2262
2263 for (z = 0; z < MAX_NR_ZONES; z++) {
2264 struct zone *zone = &pgdat->node_zones[z];
2265 if (!managed_zone(zone))
2266 continue;
2267
2268 total_high_wmark += high_wmark_pages(zone);
2269 }
2270
2271 if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
2272
2273
2274
2275
2276
2277 if (!inactive_list_is_low(lruvec, false, sc, false) &&
2278 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
2279 >> sc->priority) {
2280 scan_balance = SCAN_ANON;
2281 goto out;
2282 }
2283 }
2284 }
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295 if (!inactive_list_is_low(lruvec, true, sc, false) &&
2296 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
2297 scan_balance = SCAN_FILE;
2298 goto out;
2299 }
2300
2301 scan_balance = SCAN_FRACT;
2302
2303
2304
2305
2306
2307 anon_prio = swappiness;
2308 file_prio = 200 - anon_prio;
2309
	/*
	 * OK, so we have swap space and a fair amount of page cache
	 * pages.  We use the recently rotated / recently scanned
	 * ratios to determine how valuable each cache is.
	 *
	 * Because workloads change over time (and to avoid overflow)
	 * we keep these statistics as a floating average, which ends
	 * up weighing recent references more than old ones.
	 *
	 * anon in [0], file in [1]
	 */
2322 anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
2323 lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
2324 file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
2325 lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
2326
2327 spin_lock_irq(&pgdat->lru_lock);
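	/*
	 * Decay the recent_scanned/recent_rotated averages once they grow
	 * large, so that recent reclaim activity dominates the ratio.
	 */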
2328 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
2329 reclaim_stat->recent_scanned[0] /= 2;
2330 reclaim_stat->recent_rotated[0] /= 2;
2331 }
2332
2333 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
2334 reclaim_stat->recent_scanned[1] /= 2;
2335 reclaim_stat->recent_rotated[1] /= 2;
2336 }
2337
2338
2339
2340
2341
2342
2343 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
2344 ap /= reclaim_stat->recent_rotated[0] + 1;
2345
2346 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
2347 fp /= reclaim_stat->recent_rotated[1] + 1;
2348 spin_unlock_irq(&pgdat->lru_lock);
2349
2350 fraction[0] = ap;
2351 fraction[1] = fp;
2352 denominator = ap + fp + 1;
2353out:
2354 *lru_pages = 0;
2355 for_each_evictable_lru(lru) {
2356 int file = is_file_lru(lru);
2357 unsigned long size;
2358 unsigned long scan;
2359
2360 size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2361 scan = size >> sc->priority;
2362
2363
2364
2365
2366 if (!scan && !mem_cgroup_online(memcg))
2367 scan = min(size, SWAP_CLUSTER_MAX);
2368
2369 switch (scan_balance) {
2370 case SCAN_EQUAL:
2371
2372 break;
2373 case SCAN_FRACT:
2374
2375
2376
2377
2378
2379
2380 scan = DIV64_U64_ROUND_UP(scan * fraction[file],
2381 denominator);
2382 break;
2383 case SCAN_FILE:
2384 case SCAN_ANON:
2385
2386 if ((scan_balance == SCAN_FILE) != file) {
2387 size = 0;
2388 scan = 0;
2389 }
2390 break;
2391 default:
2392
2393 BUG();
2394 }
2395
2396 *lru_pages += size;
2397 nr[lru] = scan;
2398 }
2399}
2400
/*
 * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
 */
2404static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
2405 struct scan_control *sc, unsigned long *lru_pages)
2406{
2407 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
2408 unsigned long nr[NR_LRU_LISTS];
2409 unsigned long targets[NR_LRU_LISTS];
2410 unsigned long nr_to_scan;
2411 enum lru_list lru;
2412 unsigned long nr_reclaimed = 0;
2413 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2414 struct blk_plug plug;
2415 bool scan_adjusted;
2416
2417 get_scan_count(lruvec, memcg, sc, nr, lru_pages);
2418
2419
2420 memcpy(targets, nr, sizeof(nr));
2421
	/*
	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a
	 * normal event that can occur when there is little memory pressure
	 * e.g. multiple streaming readers/writers.  Hence, we do not abort
	 * scanning when the requested number of pages are reclaimed while
	 * scanning at DEF_PRIORITY, on the assumption that the fact we are
	 * direct reclaiming implies that kswapd is not keeping up and it is
	 * best to do a batch of work at once.  For memcg reclaim one check
	 * is made to abort proportional reclaim if either the file or anon
	 * lru has already dropped to zero at the first pass.
	 */
2433 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
2434 sc->priority == DEF_PRIORITY);
2435
2436 blk_start_plug(&plug);
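	/*
	 * Scan the LRU lists in SWAP_CLUSTER_MAX sized batches until the
	 * inactive anon, inactive file and active file scan targets are met.
	 */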
2437 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2438 nr[LRU_INACTIVE_FILE]) {
2439 unsigned long nr_anon, nr_file, percentage;
2440 unsigned long nr_scanned;
2441
2442 for_each_evictable_lru(lru) {
2443 if (nr[lru]) {
2444 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2445 nr[lru] -= nr_to_scan;
2446
2447 nr_reclaimed += shrink_list(lru, nr_to_scan,
2448 lruvec, sc);
2449 }
2450 }
2451
2452 cond_resched();
2453
2454 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2455 continue;
2456
2457
2458
2459
2460
2461
2462
2463
2464 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2465 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2466
2467
2468
2469
2470
2471
2472
2473 if (!nr_file || !nr_anon)
2474 break;
2475
2476 if (nr_file > nr_anon) {
2477 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2478 targets[LRU_ACTIVE_ANON] + 1;
2479 lru = LRU_BASE;
2480 percentage = nr_anon * 100 / scan_target;
2481 } else {
2482 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2483 targets[LRU_ACTIVE_FILE] + 1;
2484 lru = LRU_FILE;
2485 percentage = nr_file * 100 / scan_target;
2486 }
2487
2488
2489 nr[lru] = 0;
2490 nr[lru + LRU_ACTIVE] = 0;
2491
2492
2493
2494
2495
2496 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2497 nr_scanned = targets[lru] - nr[lru];
2498 nr[lru] = targets[lru] * (100 - percentage) / 100;
2499 nr[lru] -= min(nr[lru], nr_scanned);
2500
2501 lru += LRU_ACTIVE;
2502 nr_scanned = targets[lru] - nr[lru];
2503 nr[lru] = targets[lru] * (100 - percentage) / 100;
2504 nr[lru] -= min(nr[lru], nr_scanned);
2505
2506 scan_adjusted = true;
2507 }
2508 blk_finish_plug(&plug);
2509 sc->nr_reclaimed += nr_reclaimed;
2510
2511
2512
2513
2514
2515 if (inactive_list_is_low(lruvec, false, sc, true))
2516 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2517 sc, LRU_ACTIVE_ANON);
2518}
2519
2520
2521static bool in_reclaim_compaction(struct scan_control *sc)
2522{
2523 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2524 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2525 sc->priority < DEF_PRIORITY - 2))
2526 return true;
2527
2528 return false;
2529}
2530
/*
 * Reclaim/compaction is used for high-order allocation requests.  It reclaims
 * order-0 pages before compacting the node.  should_continue_reclaim() returns
 * true if more pages should be reclaimed such that when compaction is invoked
 * by the page allocator it will have enough free pages to succeed.
 * It will give up earlier than that if there is difficulty reclaiming pages.
 */
2538static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2539 unsigned long nr_reclaimed,
2540 unsigned long nr_scanned,
2541 struct scan_control *sc)
2542{
2543 unsigned long pages_for_compaction;
2544 unsigned long inactive_lru_pages;
2545 int z;
2546
2547
2548 if (!in_reclaim_compaction(sc))
2549 return false;
2550
2551
2552 if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
2553
2554
2555
2556
2557
2558
2559 if (!nr_reclaimed && !nr_scanned)
2560 return false;
2561 } else {
2562
2563
2564
2565
2566
2567
2568
2569
2570 if (!nr_reclaimed)
2571 return false;
2572 }
2573
2574
2575
2576
2577
2578 pages_for_compaction = compact_gap(sc->order);
2579 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2580 if (get_nr_swap_pages() > 0)
2581 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2582 if (sc->nr_reclaimed < pages_for_compaction &&
2583 inactive_lru_pages > pages_for_compaction)
2584 return true;
2585
2586
2587 for (z = 0; z <= sc->reclaim_idx; z++) {
2588 struct zone *zone = &pgdat->node_zones[z];
2589 if (!managed_zone(zone))
2590 continue;
2591
2592 switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2593 case COMPACT_SUCCESS:
2594 case COMPACT_CONTINUE:
2595 return false;
2596 default:
2597
2598 ;
2599 }
2600 }
2601 return true;
2602}
2603
2604static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
2605{
2606 return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
2607 (memcg && memcg_congested(pgdat, memcg));
2608}
2609
2610static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2611{
2612 struct reclaim_state *reclaim_state = current->reclaim_state;
2613 unsigned long nr_reclaimed, nr_scanned;
2614 bool reclaimable = false;
2615
2616 do {
2617 struct mem_cgroup *root = sc->target_mem_cgroup;
2618 struct mem_cgroup_reclaim_cookie reclaim = {
2619 .pgdat = pgdat,
2620 .priority = sc->priority,
2621 };
2622 unsigned long node_lru_pages = 0;
2623 struct mem_cgroup *memcg;
2624
2625 memset(&sc->nr, 0, sizeof(sc->nr));
2626
2627 nr_reclaimed = sc->nr_reclaimed;
2628 nr_scanned = sc->nr_scanned;
2629
2630 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2631 do {
2632 unsigned long lru_pages;
2633 unsigned long reclaimed;
2634 unsigned long scanned;
2635
2636 switch (mem_cgroup_protected(root, memcg)) {
2637 case MEMCG_PROT_MIN:
2638
2639
2640
2641
2642 continue;
2643 case MEMCG_PROT_LOW:
2644
2645
2646
2647
2648
2649
2650 if (!sc->memcg_low_reclaim) {
2651 sc->memcg_low_skipped = 1;
2652 continue;
2653 }
2654 memcg_memory_event(memcg, MEMCG_LOW);
2655 break;
2656 case MEMCG_PROT_NONE:
2657 break;
2658 }
2659
2660 reclaimed = sc->nr_reclaimed;
2661 scanned = sc->nr_scanned;
2662 shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
2663 node_lru_pages += lru_pages;
2664
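			/* Also shrink slab caches charged to this memcg on this node. */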
2665 if (sc->may_shrinkslab) {
2666 shrink_slab(sc->gfp_mask, pgdat->node_id,
2667 memcg, sc->priority);
2668 }
2669
2670
2671 vmpressure(sc->gfp_mask, memcg, false,
2672 sc->nr_scanned - scanned,
2673 sc->nr_reclaimed - reclaimed);
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684 if (!current_is_kswapd() &&
2685 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2686 mem_cgroup_iter_break(root, memcg);
2687 break;
2688 }
2689 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
2690
2691 if (reclaim_state) {
2692 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2693 reclaim_state->reclaimed_slab = 0;
2694 }
2695
2696
2697 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2698 sc->nr_scanned - nr_scanned,
2699 sc->nr_reclaimed - nr_reclaimed);
2700
2701 if (sc->nr_reclaimed - nr_reclaimed)
2702 reclaimable = true;
2703
2704 if (current_is_kswapd()) {
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722 if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
2723 set_bit(PGDAT_WRITEBACK, &pgdat->flags);
2724
2725
2726
2727
2728
2729
2730 if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2731 set_bit(PGDAT_CONGESTED, &pgdat->flags);
2732
2733
2734 if (sc->nr.unqueued_dirty == sc->nr.file_taken)
2735 set_bit(PGDAT_DIRTY, &pgdat->flags);
2736
2737
2738
2739
2740
2741
2742
2743 if (sc->nr.immediate)
2744 congestion_wait(BLK_RW_ASYNC, HZ/10);
2745 }
2746
2747
2748
2749
2750
2751 if (!global_reclaim(sc) && sane_reclaim(sc) &&
2752 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2753 set_memcg_congestion(pgdat, root, true);
2754
2755
2756
2757
2758
2759
2760
2761 if (!sc->hibernation_mode && !current_is_kswapd() &&
2762 current_may_throttle() && pgdat_memcg_congested(pgdat, root))
2763 wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2764
2765 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2766 sc->nr_scanned - nr_scanned, sc));
2767
2768
2769
2770
2771
2772
2773
2774 if (reclaimable)
2775 pgdat->kswapd_failures = 0;
2776
2777 return reclaimable;
2778}
2779
/*
 * Returns true if compaction should go ahead for a costly-order request, or
 * the allocation would already succeed without compaction. Return false if we
 * should reclaim first.
 */
2785static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2786{
2787 unsigned long watermark;
2788 enum compact_result suitable;
2789
2790 suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2791 if (suitable == COMPACT_SUCCESS)
2792
2793 return true;
2794 if (suitable == COMPACT_SKIPPED)
2795
2796 return false;
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807 watermark = high_wmark_pages(zone) + compact_gap(sc->order);
2808
2809 return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2810}
2811
/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
2820static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2821{
2822 struct zoneref *z;
2823 struct zone *zone;
2824 unsigned long nr_soft_reclaimed;
2825 unsigned long nr_soft_scanned;
2826 gfp_t orig_mask;
2827 pg_data_t *last_pgdat = NULL;
2828
2829
2830
2831
2832
2833
2834 orig_mask = sc->gfp_mask;
2835 if (buffer_heads_over_limit) {
2836 sc->gfp_mask |= __GFP_HIGHMEM;
2837 sc->reclaim_idx = gfp_zone(sc->gfp_mask);
2838 }
2839
2840 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2841 sc->reclaim_idx, sc->nodemask) {
2842
2843
2844
2845
2846 if (global_reclaim(sc)) {
2847 if (!cpuset_zone_allowed(zone,
2848 GFP_KERNEL | __GFP_HARDWALL))
2849 continue;
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860 if (IS_ENABLED(CONFIG_COMPACTION) &&
2861 sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2862 compaction_ready(zone, sc)) {
2863 sc->compaction_ready = true;
2864 continue;
2865 }
2866
2867
2868
2869
2870
2871
2872
2873 if (zone->zone_pgdat == last_pgdat)
2874 continue;
2875
2876
2877
2878
2879
2880
2881
2882 nr_soft_scanned = 0;
2883 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2884 sc->order, sc->gfp_mask,
2885 &nr_soft_scanned);
2886 sc->nr_reclaimed += nr_soft_reclaimed;
2887 sc->nr_scanned += nr_soft_scanned;
2888
2889 }
2890
2891
2892 if (zone->zone_pgdat == last_pgdat)
2893 continue;
2894 last_pgdat = zone->zone_pgdat;
2895 shrink_node(zone->zone_pgdat, sc);
2896 }
2897
2898
2899
2900
2901
2902 sc->gfp_mask = orig_mask;
2903}
2904
2905static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
2906{
2907 struct mem_cgroup *memcg;
2908
2909 memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
2910 do {
2911 unsigned long refaults;
2912 struct lruvec *lruvec;
2913
2914 lruvec = mem_cgroup_lruvec(pgdat, memcg);
2915 refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
2916 lruvec->refaults = refaults;
2917 } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
2918}
2919
2935
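/*
 * This is the main entry point to direct page reclaim.
 *
 * If a full scan of the inactive list fails to free enough memory then we
 * are "out of memory" and something needs to be killed.
 *
 * If the caller is !__GFP_FS then the probability of a failure is reasonably
 * high - the zone may be full of dirty or under-writeback pages, which this
 * caller can't do much about.  We kick the writeback threads and take
 * explicit naps in the hope that some of these pages can be written.
 *
 * returns:	0, if no pages reclaimed
 *		else, the number of pages reclaimed
 */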
2936static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2937 struct scan_control *sc)
2938{
2939 int initial_priority = sc->priority;
2940 pg_data_t *last_pgdat;
2941 struct zoneref *z;
2942 struct zone *zone;
2943retry:
2944 delayacct_freepages_start();
2945
2946 if (global_reclaim(sc))
2947 __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
2948
2949 do {
2950 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
2951 sc->priority);
2952 sc->nr_scanned = 0;
2953 shrink_zones(zonelist, sc);
2954
2955 if (sc->nr_reclaimed >= sc->nr_to_reclaim)
2956 break;
2957
2958 if (sc->compaction_ready)
2959 break;
2960
2961
2962
2963
2964
2965 if (sc->priority < DEF_PRIORITY - 2)
2966 sc->may_writepage = 1;
2967 } while (--sc->priority >= 0);
2968
2969 last_pgdat = NULL;
2970 for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
2971 sc->nodemask) {
2972 if (zone->zone_pgdat == last_pgdat)
2973 continue;
2974 last_pgdat = zone->zone_pgdat;
2975 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
2976 set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
2977 }
2978
2979 delayacct_freepages_end();
2980
2981 if (sc->nr_reclaimed)
2982 return sc->nr_reclaimed;
2983
2984
2985 if (sc->compaction_ready)
2986 return 1;
2987
2988
2989 if (sc->memcg_low_skipped) {
2990 sc->priority = initial_priority;
2991 sc->memcg_low_reclaim = 1;
2992 sc->memcg_low_skipped = 0;
2993 goto retry;
2994 }
2995
2996 return 0;
2997}
2998
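/*
 * Returns true if direct reclaim may proceed: either kswapd has already given
 * up on this node, or the free pages in the lower zones are above half of
 * their combined min watermarks.  If the watermark check fails, kswapd is
 * woken to start refilling the reserves.
 */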
2999static bool allow_direct_reclaim(pg_data_t *pgdat)
3000{
3001 struct zone *zone;
3002 unsigned long pfmemalloc_reserve = 0;
3003 unsigned long free_pages = 0;
3004 int i;
3005 bool wmark_ok;
3006
3007 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3008 return true;
3009
3010 for (i = 0; i <= ZONE_NORMAL; i++) {
3011 zone = &pgdat->node_zones[i];
3012 if (!managed_zone(zone))
3013 continue;
3014
3015 if (!zone_reclaimable_pages(zone))
3016 continue;
3017
3018 pfmemalloc_reserve += min_wmark_pages(zone);
3019 free_pages += zone_page_state(zone, NR_FREE_PAGES);
3020 }
3021
3022
3023 if (!pfmemalloc_reserve)
3024 return true;
3025
3026 wmark_ok = free_pages > pfmemalloc_reserve / 2;
3027
3028
3029 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3030 pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
3031 (enum zone_type)ZONE_NORMAL);
3032 wake_up_interruptible(&pgdat->kswapd_wait);
3033 }
3034
3035 return wmark_ok;
3036}
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
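/*
 * Throttle direct reclaimers if backing storage is backed by the network
 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
 * depleted. kswapd will continue to make progress and wake the processes
 * when the low watermark is reached.
 *
 * Returns true if a fatal signal was delivered during throttling. If this
 * happens, the page allocator should not consider triggering the OOM killer.
 */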
3047static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3048 nodemask_t *nodemask)
3049{
3050 struct zoneref *z;
3051 struct zone *zone;
3052 pg_data_t *pgdat = NULL;
3053
3054
3055
3056
3057
3058
3059
3060
3061 if (current->flags & PF_KTHREAD)
3062 goto out;
3063
3064
3065
3066
3067
3068 if (fatal_signal_pending(current))
3069 goto out;
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085 for_each_zone_zonelist_nodemask(zone, z, zonelist,
3086 gfp_zone(gfp_mask), nodemask) {
3087 if (zone_idx(zone) > ZONE_NORMAL)
3088 continue;
3089
3090
3091 pgdat = zone->zone_pgdat;
3092 if (allow_direct_reclaim(pgdat))
3093 goto out;
3094 break;
3095 }
3096
3097
3098 if (!pgdat)
3099 goto out;
3100
3101
3102 count_vm_event(PGSCAN_DIRECT_THROTTLE);
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112 if (!(gfp_mask & __GFP_FS)) {
3113 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3114 allow_direct_reclaim(pgdat), HZ);
3115
3116 goto check_pending;
3117 }
3118
3119
3120 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3121 allow_direct_reclaim(pgdat));
3122
3123check_pending:
3124 if (fatal_signal_pending(current))
3125 return true;
3126
3127out:
3128 return false;
3129}
3130
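/*
 * Entry point for direct reclaim from the page allocator: set up the
 * scan_control, throttle if the pfmemalloc reserves are depleted, then hand
 * off to do_try_to_free_pages().
 */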
3131unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3132 gfp_t gfp_mask, nodemask_t *nodemask)
3133{
3134 unsigned long nr_reclaimed;
3135 struct scan_control sc = {
3136 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3137 .gfp_mask = current_gfp_context(gfp_mask),
3138 .reclaim_idx = gfp_zone(gfp_mask),
3139 .order = order,
3140 .nodemask = nodemask,
3141 .priority = DEF_PRIORITY,
3142 .may_writepage = !laptop_mode,
3143 .may_unmap = 1,
3144 .may_swap = 1,
3145 .may_shrinkslab = 1,
3146 };
3147
3148
3149
3150
3151
3152 BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3153 BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3154 BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3155
3156
3157
3158
3159
3160
3161 if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3162 return 1;
3163
3164 trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
3165
3166 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3167
3168 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3169
3170 return nr_reclaimed;
3171}
3172
3173#ifdef CONFIG_MEMCG
3174
3175unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3176 gfp_t gfp_mask, bool noswap,
3177 pg_data_t *pgdat,
3178 unsigned long *nr_scanned)
3179{
3180 struct scan_control sc = {
3181 .nr_to_reclaim = SWAP_CLUSTER_MAX,
3182 .target_mem_cgroup = memcg,
3183 .may_writepage = !laptop_mode,
3184 .may_unmap = 1,
3185 .reclaim_idx = MAX_NR_ZONES - 1,
3186 .may_swap = !noswap,
3187 .may_shrinkslab = 1,
3188 };
3189 unsigned long lru_pages;
3190
3191 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3192 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3193
3194 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3195 sc.gfp_mask);
3196
3197
3198
3199
3200
3201
3202
3203
3204 shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
3205
3206 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3207
3208 *nr_scanned = sc.nr_scanned;
3209 return sc.nr_reclaimed;
3210}
3211
3212unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3213 unsigned long nr_pages,
3214 gfp_t gfp_mask,
3215 bool may_swap)
3216{
3217 struct zonelist *zonelist;
3218 unsigned long nr_reclaimed;
3219 unsigned long pflags;
3220 int nid;
3221 unsigned int noreclaim_flag;
3222 struct scan_control sc = {
3223 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3224 .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3225 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3226 .reclaim_idx = MAX_NR_ZONES - 1,
3227 .target_mem_cgroup = memcg,
3228 .priority = DEF_PRIORITY,
3229 .may_writepage = !laptop_mode,
3230 .may_unmap = 1,
3231 .may_swap = may_swap,
3232 .may_shrinkslab = 1,
3233 };
3234
3235
3236
3237
3238
3239
3240 nid = mem_cgroup_select_victim_node(memcg);
3241
3242 zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
3243
3244 trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
3245
3246 psi_memstall_enter(&pflags);
3247 noreclaim_flag = memalloc_noreclaim_save();
3248
3249 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3250
3251 memalloc_noreclaim_restore(noreclaim_flag);
3252 psi_memstall_leave(&pflags);
3253
3254 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3255
3256 return nr_reclaimed;
3257}
3258#endif
3259
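/*
 * Do some background aging of the anon list, to give pages a chance to be
 * referenced before reclaiming them.  Only useful when swap is available.
 */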
3260static void age_active_anon(struct pglist_data *pgdat,
3261 struct scan_control *sc)
3262{
3263 struct mem_cgroup *memcg;
3264
3265 if (!total_swap_pages)
3266 return;
3267
3268 memcg = mem_cgroup_iter(NULL, NULL, NULL);
3269 do {
3270 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
3271
3272 if (inactive_list_is_low(lruvec, false, sc, true))
3273 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3274 sc, LRU_ACTIVE_ANON);
3275
3276 memcg = mem_cgroup_iter(NULL, memcg, NULL);
3277 } while (memcg);
3278}
3279
3280static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
3281{
3282 int i;
3283 struct zone *zone;
3284
3285
3286
3287
3288
3289
3290
3291
3292 for (i = classzone_idx; i >= 0; i--) {
3293 zone = pgdat->node_zones + i;
3294 if (!managed_zone(zone))
3295 continue;
3296
3297 if (zone->watermark_boost)
3298 return true;
3299 }
3300
3301 return false;
3302}
3303
3304
3305
3306
3307
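/*
 * Returns true if there is an eligible zone balanced for the request order
 * and classzone_idx
 */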
3308static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
3309{
3310 int i;
3311 unsigned long mark = -1;
3312 struct zone *zone;
3313
3314
3315
3316
3317
3318 for (i = 0; i <= classzone_idx; i++) {
3319 zone = pgdat->node_zones + i;
3320
3321 if (!managed_zone(zone))
3322 continue;
3323
3324 mark = high_wmark_pages(zone);
3325 if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
3326 return true;
3327 }
3328
3329
3330
3331
3332
3333
3334 if (mark == -1)
3335 return true;
3336
3337 return false;
3338}
3339
3340
3341static void clear_pgdat_congested(pg_data_t *pgdat)
3342{
3343 clear_bit(PGDAT_CONGESTED, &pgdat->flags);
3344 clear_bit(PGDAT_DIRTY, &pgdat->flags);
3345 clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3346}
3347
3348
3349
3350
3351
3352
3353
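/*
 * Prepare kswapd for sleeping. This verifies that there are no processes
 * waiting in throttle_direct_reclaim() and that watermarks have been met.
 *
 * Returns true if kswapd is ready to sleep
 */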
3354static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3355{
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369 if (waitqueue_active(&pgdat->pfmemalloc_wait))
3370 wake_up_all(&pgdat->pfmemalloc_wait);
3371
3372
3373 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3374 return true;
3375
3376 if (pgdat_balanced(pgdat, order, classzone_idx)) {
3377 clear_pgdat_congested(pgdat);
3378 return true;
3379 }
3380
3381 return false;
3382}
3383
3384
3385
3386
3387
3388
3389
3390
3391
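/*
 * kswapd shrinks a node of pages that are at or below the highest usable
 * zone that is currently unbalanced.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim.  This is used to determine if the scanning priority needs to be
 * raised.
 */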
3392static bool kswapd_shrink_node(pg_data_t *pgdat,
3393 struct scan_control *sc)
3394{
3395 struct zone *zone;
3396 int z;
3397
3398
3399 sc->nr_to_reclaim = 0;
3400 for (z = 0; z <= sc->reclaim_idx; z++) {
3401 zone = pgdat->node_zones + z;
3402 if (!managed_zone(zone))
3403 continue;
3404
3405 sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3406 }
3407
3408
3409
3410
3411
3412 shrink_node(pgdat, sc);
3413
3414
3415
3416
3417
3418
3419
3420
3421 if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3422 sc->order = 0;
3423
3424 return sc->nr_scanned >= sc->nr_to_reclaim;
3425}
3426
3439
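/*
 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
 * that are eligible for use by the caller until at least one zone is
 * balanced.
 *
 * Returns the order kswapd finished reclaiming at.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to be unbalanced, the whole node is reclaimed from.
 */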
3440static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
3441{
3442 int i;
3443 unsigned long nr_soft_reclaimed;
3444 unsigned long nr_soft_scanned;
3445 unsigned long pflags;
3446 unsigned long nr_boost_reclaim;
3447 unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
3448 bool boosted;
3449 struct zone *zone;
3450 struct scan_control sc = {
3451 .gfp_mask = GFP_KERNEL,
3452 .order = order,
3453 .may_unmap = 1,
3454 };
3455
3456 psi_memstall_enter(&pflags);
3457 __fs_reclaim_acquire();
3458
3459 count_vm_event(PAGEOUTRUN);
3460
3461
3462
3463
3464
3465
3466 nr_boost_reclaim = 0;
3467 for (i = 0; i <= classzone_idx; i++) {
3468 zone = pgdat->node_zones + i;
3469 if (!managed_zone(zone))
3470 continue;
3471
3472 nr_boost_reclaim += zone->watermark_boost;
3473 zone_boosts[i] = zone->watermark_boost;
3474 }
3475 boosted = nr_boost_reclaim;
3476
3477restart:
3478 sc.priority = DEF_PRIORITY;
3479 do {
3480 unsigned long nr_reclaimed = sc.nr_reclaimed;
3481 bool raise_priority = true;
3482 bool balanced;
3483 bool ret;
3484
3485 sc.reclaim_idx = classzone_idx;
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497 if (buffer_heads_over_limit) {
3498 for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3499 zone = pgdat->node_zones + i;
3500 if (!managed_zone(zone))
3501 continue;
3502
3503 sc.reclaim_idx = i;
3504 break;
3505 }
3506 }
3507
3508
3509
3510
3511
3512
3513
3514
3515 balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
3516 if (!balanced && nr_boost_reclaim) {
3517 nr_boost_reclaim = 0;
3518 goto restart;
3519 }
3520
3521
3522
3523
3524
3525
3526 if (!nr_boost_reclaim && balanced)
3527 goto out;
3528
3529
3530 if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
3531 raise_priority = false;
3532
3533
3534
3535
3536
3537
3538
3539 sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
3540 sc.may_swap = !nr_boost_reclaim;
3541 sc.may_shrinkslab = !nr_boost_reclaim;
3542
3543
3544
3545
3546
3547
3548
3549 age_active_anon(pgdat, &sc);
3550
3551
3552
3553
3554
3555 if (sc.priority < DEF_PRIORITY - 2)
3556 sc.may_writepage = 1;
3557
3558
3559 sc.nr_scanned = 0;
3560 nr_soft_scanned = 0;
3561 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3562 sc.gfp_mask, &nr_soft_scanned);
3563 sc.nr_reclaimed += nr_soft_reclaimed;
3564
3565
3566
3567
3568
3569
3570 if (kswapd_shrink_node(pgdat, &sc))
3571 raise_priority = false;
3572
3573
3574
3575
3576
3577
3578 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3579 allow_direct_reclaim(pgdat))
3580 wake_up_all(&pgdat->pfmemalloc_wait);
3581
3582
3583 __fs_reclaim_release();
3584 ret = try_to_freeze();
3585 __fs_reclaim_acquire();
3586 if (ret || kthread_should_stop())
3587 break;
3588
3589
3590
3591
3592
3593 nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3594 nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
3595
3596
3597
3598
3599
3600
3601 if (nr_boost_reclaim && !nr_reclaimed)
3602 break;
3603
3604 if (raise_priority || !nr_reclaimed)
3605 sc.priority--;
3606 } while (sc.priority >= 1);
3607
3608 if (!sc.nr_reclaimed)
3609 pgdat->kswapd_failures++;
3610
3611out:
3612
3613 if (boosted) {
3614 unsigned long flags;
3615
3616 for (i = 0; i <= classzone_idx; i++) {
3617 if (!zone_boosts[i])
3618 continue;
3619
3620
3621 zone = pgdat->node_zones + i;
3622 spin_lock_irqsave(&zone->lock, flags);
3623 zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
3624 spin_unlock_irqrestore(&zone->lock, flags);
3625 }
3626
3627
3628
3629
3630
3631 wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
3632 }
3633
3634 snapshot_refaults(NULL, pgdat);
3635 __fs_reclaim_release();
3636 psi_memstall_leave(&pflags);
3637
3638
3639
3640
3641
3642
3643 return sc.order;
3644}
3645
3646
3647
3648
3649
3650
3651
3652
3653static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
3654 enum zone_type prev_classzone_idx)
3655{
3656 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3657 return prev_classzone_idx;
3658 return pgdat->kswapd_classzone_idx;
3659}
3660
3661static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3662 unsigned int classzone_idx)
3663{
3664 long remaining = 0;
3665 DEFINE_WAIT(wait);
3666
3667 if (freezing(current) || kthread_should_stop())
3668 return;
3669
3670 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3671
3672
3673
3674
3675
3676
3677
3678
3679 if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3680
3681
3682
3683
3684
3685
3686 reset_isolation_suitable(pgdat);
3687
3688
3689
3690
3691
3692 wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
3693
3694 remaining = schedule_timeout(HZ/10);
3695
3696
3697
3698
3699
3700
3701 if (remaining) {
3702 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3703 pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
3704 }
3705
3706 finish_wait(&pgdat->kswapd_wait, &wait);
3707 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3708 }
3709
3710
3711
3712
3713
3714 if (!remaining &&
3715 prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
3716 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3727
3728 if (!kthread_should_stop())
3729 schedule();
3730
3731 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3732 } else {
3733 if (remaining)
3734 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3735 else
3736 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3737 }
3738 finish_wait(&pgdat->kswapd_wait, &wait);
3739}
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
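/*
 * The background pageout daemon, started as a kernel thread from the init
 * process.
 *
 * This basically trickles out pages so that we have _some_ free memory
 * available even if there is no other activity that frees anything up.
 * This is needed for sluggish VM responsiveness in low-memory situations.
 *
 * If there are applications that are active memory-allocators (most normal
 * use), this basically shouldn't matter.
 */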
3754static int kswapd(void *p)
3755{
3756 unsigned int alloc_order, reclaim_order;
3757 unsigned int classzone_idx = MAX_NR_ZONES - 1;
3758 pg_data_t *pgdat = (pg_data_t*)p;
3759 struct task_struct *tsk = current;
3760
3761 struct reclaim_state reclaim_state = {
3762 .reclaimed_slab = 0,
3763 };
3764 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3765
3766 if (!cpumask_empty(cpumask))
3767 set_cpus_allowed_ptr(tsk, cpumask);
3768 current->reclaim_state = &reclaim_state;
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3783 set_freezable();
3784
3785 pgdat->kswapd_order = 0;
3786 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3787 for ( ; ; ) {
3788 bool ret;
3789
3790 alloc_order = reclaim_order = pgdat->kswapd_order;
3791 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3792
3793kswapd_try_sleep:
3794 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3795 classzone_idx);
3796
3797
3798 alloc_order = reclaim_order = pgdat->kswapd_order;
3799 classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
3800 pgdat->kswapd_order = 0;
3801 pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
3802
3803 ret = try_to_freeze();
3804 if (kthread_should_stop())
3805 break;
3806
3807
3808
3809
3810
3811 if (ret)
3812 continue;
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822 trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
3823 alloc_order);
3824 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
3825 if (reclaim_order < alloc_order)
3826 goto kswapd_try_sleep;
3827 }
3828
3829 tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3830 current->reclaim_state = NULL;
3831
3832 return 0;
3833}
3834
3835
3836
3837
3838
3839
3840
3841
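/*
 * A zone is low on free memory or too fragmented for high-order memory.  If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the
 * zone's pgdat.  It will wake up kcompactd after reclaiming memory.  If
 * kswapd reclaim has failed or is not needed, still wake up kcompactd if
 * only compaction is needed.
 */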
3842void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3843 enum zone_type classzone_idx)
3844{
3845 pg_data_t *pgdat;
3846
3847 if (!managed_zone(zone))
3848 return;
3849
3850 if (!cpuset_zone_allowed(zone, gfp_flags))
3851 return;
3852 pgdat = zone->zone_pgdat;
3853
3854 if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
3855 pgdat->kswapd_classzone_idx = classzone_idx;
3856 else
3857 pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
3858 classzone_idx);
3859 pgdat->kswapd_order = max(pgdat->kswapd_order, order);
3860 if (!waitqueue_active(&pgdat->kswapd_wait))
3861 return;
3862
3863
3864 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3865 (pgdat_balanced(pgdat, order, classzone_idx) &&
3866 !pgdat_watermark_boosted(pgdat, classzone_idx))) {
3867
3868
3869
3870
3871
3872
3873
3874 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3875 wakeup_kcompactd(pgdat, order, classzone_idx);
3876 return;
3877 }
3878
3879 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
3880 gfp_flags);
3881 wake_up_interruptible(&pgdat->kswapd_wait);
3882}
3883
3884#ifdef CONFIG_HIBERNATION
3885
3886
3887
3888
3889
3890
3891
3892
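/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number
 * of reclaimed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall LRU
 * order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */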
3893unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3894{
3895 struct reclaim_state reclaim_state;
3896 struct scan_control sc = {
3897 .nr_to_reclaim = nr_to_reclaim,
3898 .gfp_mask = GFP_HIGHUSER_MOVABLE,
3899 .reclaim_idx = MAX_NR_ZONES - 1,
3900 .priority = DEF_PRIORITY,
3901 .may_writepage = 1,
3902 .may_unmap = 1,
3903 .may_swap = 1,
3904 .hibernation_mode = 1,
3905 };
3906 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3907 struct task_struct *p = current;
3908 unsigned long nr_reclaimed;
3909 unsigned int noreclaim_flag;
3910
3911 fs_reclaim_acquire(sc.gfp_mask);
3912 noreclaim_flag = memalloc_noreclaim_save();
3913 reclaim_state.reclaimed_slab = 0;
3914 p->reclaim_state = &reclaim_state;
3915
3916 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3917
3918 p->reclaim_state = NULL;
3919 memalloc_noreclaim_restore(noreclaim_flag);
3920 fs_reclaim_release(sc.gfp_mask);
3921
3922 return nr_reclaimed;
3923}
3924#endif
3925
3926
3927
3928
3929
3930static int kswapd_cpu_online(unsigned int cpu)
3931{
3932 int nid;
3933
3934 for_each_node_state(nid, N_MEMORY) {
3935 pg_data_t *pgdat = NODE_DATA(nid);
3936 const struct cpumask *mask;
3937
3938 mask = cpumask_of_node(pgdat->node_id);
3939
3940 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
3941
3942 set_cpus_allowed_ptr(pgdat->kswapd, mask);
3943 }
3944 return 0;
3945}
3946
3947
3948
3949
3950
3951int kswapd_run(int nid)
3952{
3953 pg_data_t *pgdat = NODE_DATA(nid);
3954 int ret = 0;
3955
3956 if (pgdat->kswapd)
3957 return 0;
3958
3959 pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
3960 if (IS_ERR(pgdat->kswapd)) {
3961
3962 BUG_ON(system_state < SYSTEM_RUNNING);
3963 pr_err("Failed to start kswapd on node %d\n", nid);
3964 ret = PTR_ERR(pgdat->kswapd);
3965 pgdat->kswapd = NULL;
3966 }
3967 return ret;
3968}
3969
3970
3971
3972
3973
3974void kswapd_stop(int nid)
3975{
3976 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3977
3978 if (kswapd) {
3979 kthread_stop(kswapd);
3980 NODE_DATA(nid)->kswapd = NULL;
3981 }
3982}
3983
3984static int __init kswapd_init(void)
3985{
3986 int nid, ret;
3987
3988 swap_setup();
3989 for_each_node_state(nid, N_MEMORY)
3990 kswapd_run(nid);
3991 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
3992 "mm/vmscan:online", kswapd_cpu_online,
3993 NULL);
3994 WARN_ON(ret < 0);
3995 return 0;
3996}
3997
3998module_init(kswapd_init)
3999
4000#ifdef CONFIG_NUMA
4001
4002
4003
4004
4005
4006
4007int node_reclaim_mode __read_mostly;
4008
4009#define RECLAIM_OFF 0
4010#define RECLAIM_ZONE (1<<0)
4011#define RECLAIM_WRITE (1<<1)
4012#define RECLAIM_UNMAP (1<<2)
4013
4014
4015
4016
4017
4018
4019#define NODE_RECLAIM_PRIORITY 4
4020
4021
4022
4023
4024
4025int sysctl_min_unmapped_ratio = 1;
4026
4027
4028
4029
4030
4031int sysctl_min_slab_ratio = 5;
4032
4033static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4034{
4035 unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4036 unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4037 node_page_state(pgdat, NR_ACTIVE_FILE);
4038
4039
4040
4041
4042
4043
4044 return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4045}
4046
4047
4048static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4049{
4050 unsigned long nr_pagecache_reclaimable;
4051 unsigned long delta = 0;
4052
4053
4054
4055
4056
4057
4058
4059 if (node_reclaim_mode & RECLAIM_UNMAP)
4060 nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4061 else
4062 nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4063
4064
4065 if (!(node_reclaim_mode & RECLAIM_WRITE))
4066 delta += node_page_state(pgdat, NR_FILE_DIRTY);
4067
4068
4069 if (unlikely(delta > nr_pagecache_reclaimable))
4070 delta = nr_pagecache_reclaimable;
4071
4072 return nr_pagecache_reclaimable - delta;
4073}
4074
4075
4076
4077
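/*
 * Try to free up some pages from this node through reclaim.
 */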
4078static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4079{
4080
4081 const unsigned long nr_pages = 1 << order;
4082 struct task_struct *p = current;
4083 struct reclaim_state reclaim_state;
4084 unsigned int noreclaim_flag;
4085 struct scan_control sc = {
4086 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4087 .gfp_mask = current_gfp_context(gfp_mask),
4088 .order = order,
4089 .priority = NODE_RECLAIM_PRIORITY,
4090 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4091 .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4092 .may_swap = 1,
4093 .reclaim_idx = gfp_zone(gfp_mask),
4094 };
4095
4096 trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4097 sc.gfp_mask);
4098
4099 cond_resched();
4100 fs_reclaim_acquire(sc.gfp_mask);
4101
4102
4103
4104
4105
4106 noreclaim_flag = memalloc_noreclaim_save();
4107 p->flags |= PF_SWAPWRITE;
4108 reclaim_state.reclaimed_slab = 0;
4109 p->reclaim_state = &reclaim_state;
4110
4111 if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4112
4113
4114
4115
4116 do {
4117 shrink_node(pgdat, &sc);
4118 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4119 }
4120
4121 p->reclaim_state = NULL;
4122 current->flags &= ~PF_SWAPWRITE;
4123 memalloc_noreclaim_restore(noreclaim_flag);
4124 fs_reclaim_release(sc.gfp_mask);
4125
4126 trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4127
4128 return sc.nr_reclaimed >= nr_pages;
4129}
4130
4131int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4132{
4133 int ret;
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145 if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4146 node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
4147 return NODE_RECLAIM_FULL;
4148
4149
4150
4151
4152 if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4153 return NODE_RECLAIM_NOSCAN;
4154
4155
4156
4157
4158
4159
4160
4161 if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4162 return NODE_RECLAIM_NOSCAN;
4163
4164 if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4165 return NODE_RECLAIM_NOSCAN;
4166
4167 ret = __node_reclaim(pgdat, gfp_mask, order);
4168 clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4169
4170 if (!ret)
4171 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4172
4173 return ret;
4174}
4175#endif
4176
4188
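/**
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 *
 * Test whether page is evictable--i.e., should be placed on active/inactive
 * lists vs unevictable list.
 *
 * Reasons page might not be evictable:
 * (1) page's mapping marked unevictable
 * (2) page is part of an mlocked VMA
 */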
4189int page_evictable(struct page *page)
4190{
4191 int ret;
4192
4193
4194 rcu_read_lock();
4195 ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
4196 rcu_read_unlock();
4197 return ret;
4198}
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
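/**
 * check_move_unevictable_pages - check pages for evictability and move to
 * appropriate zone lru list
 * @pvec: pagevec with lru pages to check
 *
 * Checks pages for evictability; if an evictable page is on the unevictable
 * lru list, it is moved to the appropriate evictable lru list.  This function
 * should be only used for lru pages.
 */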
4209void check_move_unevictable_pages(struct pagevec *pvec)
4210{
4211 struct lruvec *lruvec;
4212 struct pglist_data *pgdat = NULL;
4213 int pgscanned = 0;
4214 int pgrescued = 0;
4215 int i;
4216
4217 for (i = 0; i < pvec->nr; i++) {
4218 struct page *page = pvec->pages[i];
4219 struct pglist_data *pagepgdat = page_pgdat(page);
4220
4221 pgscanned++;
4222 if (pagepgdat != pgdat) {
4223 if (pgdat)
4224 spin_unlock_irq(&pgdat->lru_lock);
4225 pgdat = pagepgdat;
4226 spin_lock_irq(&pgdat->lru_lock);
4227 }
4228 lruvec = mem_cgroup_page_lruvec(page, pgdat);
4229
4230 if (!PageLRU(page) || !PageUnevictable(page))
4231 continue;
4232
4233 if (page_evictable(page)) {
4234 enum lru_list lru = page_lru_base_type(page);
4235
4236 VM_BUG_ON_PAGE(PageActive(page), page);
4237 ClearPageUnevictable(page);
4238 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
4239 add_page_to_lru_list(page, lruvec, lru);
4240 pgrescued++;
4241 }
4242 }
4243
4244 if (pgdat) {
4245 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4246 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4247 spin_unlock_irq(&pgdat->lru_lock);
4248 }
4249}
4250EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4251