linux/mm/page_idle.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>

#include "internal.h"

#define BITMAP_CHUNK_SIZE       sizeof(u64)
#define BITMAP_CHUNK_BITS       (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)

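/*
 * Each u64 chunk of the bitmap covers BITMAP_CHUNK_BITS (64) page frames:
 * pfn N lives in chunk N / BITMAP_CHUNK_BITS at bit N % BITMAP_CHUNK_BITS.
 * A read or write of <count> bytes at file offset <pos> therefore covers
 * pfns [pos * 8, pos * 8 + count * 8); e.g. reading 8 bytes at offset 8
 * yields the idle bits for pfns 64-127.
 */
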
/*
 * Idle page tracking only considers user memory pages; for other types of
 * pages the idle flag is always unset and an attempt to set it is silently
 * ignored.
 *
 * We treat a page as a user memory page if it is on an LRU list, because it is
 * always safe to pass such a page to rmap_walk(), which is essential for idle
 * page tracking. With such an indicator of user pages we can skip isolated
 * pages, but since there are usually not many of them, this hardly affects
 * the overall result.
 *
 * This function tries to get a user memory page by pfn as described above.
 */
static struct page *page_idle_get_page(unsigned long pfn)
{
        struct page *page = pfn_to_online_page(pfn);

        if (!page || !PageLRU(page) ||
            !get_page_unless_zero(page))
                return NULL;

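        /*
         * The lockless PageLRU test above can race with the page being
         * freed, reused, or isolated from the LRU before the reference was
         * taken, so recheck now that the reference pins the page and back
         * out if it is no longer on an LRU list.
         */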
        if (unlikely(!PageLRU(page))) {
                put_page(page);
                page = NULL;
        }
        return page;
}

static bool page_idle_clear_pte_refs_one(struct folio *folio,
                                        struct vm_area_struct *vma,
                                        unsigned long addr, void *arg)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
        bool referenced = false;

        while (page_vma_mapped_walk(&pvmw)) {
                addr = pvmw.address;
                if (pvmw.pte) {
                        /*
                         * For a PTE-mapped THP, if one sub page is
                         * referenced, the whole THP is considered referenced.
                         */
                        if (ptep_clear_young_notify(vma, addr, pvmw.pte))
                                referenced = true;
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
                                referenced = true;
                } else {
                        /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
                }
        }

        if (referenced) {
                folio_clear_idle(folio);
                /*
                 * We cleared the referenced bit in a mapping to this folio.
                 * To avoid interference with page reclaim, mark it young so
                 * that folio_referenced() will return > 0.
                 */
                folio_set_young(folio);
        }
        return true;
}

static void page_idle_clear_pte_refs(struct page *page)
{
        struct folio *folio = page_folio(page);

        /*
         * Since rwc.try_lock is unused, rwc is effectively immutable, so we
         * can make it static to save some cycles and stack.
         */
        static struct rmap_walk_control rwc = {
                .rmap_one = page_idle_clear_pte_refs_one,
                .anon_lock = folio_lock_anon_vma_read,
        };
        bool need_lock;

        if (!folio_mapped(folio) || !folio_raw_mapping(folio))
                return;

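        /*
         * rmap_walk() needs the folio lock for file-backed and KSM folios;
         * anonymous folios can be walked under the anon_vma read lock taken
         * via rwc.anon_lock. So only try the folio lock in the former cases,
         * and give up rather than block if it is contended.
         */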
        need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
        if (need_lock && !folio_trylock(folio))
                return;

        rmap_walk(folio, &rwc);

        if (need_lock)
                folio_unlock(folio);
}

static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
                                     struct bin_attribute *attr, char *buf,
                                     loff_t pos, size_t count)
{
        u64 *out = (u64 *)buf;
        struct page *page;
        unsigned long pfn, end_pfn;
        int bit;

        if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
                return -EINVAL;

        pfn = pos * BITS_PER_BYTE;
        if (pfn >= max_pfn)
                return 0;

        end_pfn = pfn + count * BITS_PER_BYTE;
        if (end_pfn > max_pfn)
                end_pfn = max_pfn;

        for (; pfn < end_pfn; pfn++) {
                bit = pfn % BITMAP_CHUNK_BITS;
                if (!bit)
                        *out = 0ULL;
                page = page_idle_get_page(pfn);
                if (page) {
                        if (page_is_idle(page)) {
                                /*
                                 * The page might have been referenced via a
                                 * pte, in which case it is not idle. Clear
                                 * refs and recheck.
                                 */
                                page_idle_clear_pte_refs(page);
                                if (page_is_idle(page))
                                        *out |= 1ULL << bit;
                        }
                        put_page(page);
                }
                if (bit == BITMAP_CHUNK_BITS - 1)
                        out++;
                cond_resched();
        }
        return (char *)out - buf;
}
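
/*
 * A userspace reader that has pulled a chunk-aligned range of this file into
 * a u64 array (say bitmap[], covering pfns starting at first_pfn, a multiple
 * of 64; both names are illustrative) can test an individual pfn with:
 *
 *      idle = (bitmap[(pfn - first_pfn) / 64] >> (pfn % 64)) & 1;
 */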

static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
                                      struct bin_attribute *attr, char *buf,
                                      loff_t pos, size_t count)
{
        const u64 *in = (u64 *)buf;
        struct page *page;
        unsigned long pfn, end_pfn;
        int bit;

        if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
                return -EINVAL;

        pfn = pos * BITS_PER_BYTE;
        if (pfn >= max_pfn)
                return -ENXIO;

        end_pfn = pfn + count * BITS_PER_BYTE;
        if (end_pfn > max_pfn)
                end_pfn = max_pfn;

        for (; pfn < end_pfn; pfn++) {
                bit = pfn % BITMAP_CHUNK_BITS;
                if ((*in >> bit) & 1) {
                        page = page_idle_get_page(pfn);
                        if (page) {
                                page_idle_clear_pte_refs(page);
                                set_page_idle(page);
                                put_page(page);
                        }
                }
                if (bit == BITMAP_CHUNK_BITS - 1)
                        in++;
                cond_resched();
        }
        return (char *)in - buf;
}

static struct bin_attribute page_idle_bitmap_attr =
                __BIN_ATTR(bitmap, 0600,
                           page_idle_bitmap_read, page_idle_bitmap_write, 0);

static struct bin_attribute *page_idle_bin_attrs[] = {
        &page_idle_bitmap_attr,
        NULL,
};

static const struct attribute_group page_idle_attr_group = {
        .bin_attrs = page_idle_bin_attrs,
        .name = "page_idle",
};
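
/*
 * The group is created under mm_kobj, so the bitmap ends up at
 * /sys/kernel/mm/page_idle/bitmap.
 */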

static int __init page_idle_init(void)
{
        int err;

        err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
        if (err) {
                pr_err("page_idle: register sysfs failed\n");
                return err;
        }
        return 0;
}
subsys_initcall(page_idle_init);
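
/*
 * A minimal userspace sketch (not part of this file) of the intended
 * protocol, assuming the bitmap is exposed at
 * /sys/kernel/mm/page_idle/bitmap: write all-ones chunks to mark a pfn
 * range idle, let the workload run, then read the same chunks back; bits
 * that are still set identify pages that were not referenced in between.
 * The helper name and error handling below are illustrative only.
 *
 *      #include <fcntl.h>
 *      #include <stdint.h>
 *      #include <unistd.h>
 *
 *      // Hypothetical helper: mark pfns [start_pfn, start_pfn + nchunks * 64)
 *      // idle, wait, then re-read their idle bits into bitmap[].
 *      int snapshot_idle(uint64_t *bitmap, unsigned long start_pfn,
 *                        size_t nchunks, unsigned int seconds)
 *      {
 *              int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
 *              off_t off = start_pfn / 64 * 8;  // chunk-aligned byte offset
 *              size_t len = nchunks * 8;
 *              size_t i;
 *
 *              if (fd < 0)
 *                      return -1;
 *              for (i = 0; i < nchunks; i++)
 *                      bitmap[i] = ~(uint64_t)0;  // set_page_idle() per pfn
 *              if (pwrite(fd, bitmap, len, off) != (ssize_t)len)
 *                      goto fail;
 *              sleep(seconds);                    // let the workload run
 *              if (pread(fd, bitmap, len, off) != (ssize_t)len)
 *                      goto fail;
 *              close(fd);
 *              return 0;                          // set bits == still idle
 *      fail:
 *              close(fd);
 *              return -1;
 *      }
 */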