linux/mm/page_idle.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/page_ext.h>
#include <linux/page_idle.h>

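/*
 * The bitmap is exposed to userspace as an array of u64 chunks, one bit
 * per page frame, so every read or write must cover whole 8-byte chunks.
 */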
#define BITMAP_CHUNK_SIZE       sizeof(u64)
#define BITMAP_CHUNK_BITS       (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)

/*
 * Idle page tracking only considers user memory pages; for other types of
 * pages the idle flag is always unset and an attempt to set it is silently
 * ignored.
 *
 * We treat a page as a user memory page if it is on an LRU list, because it is
 * always safe to pass such a page to rmap_walk(), which is essential for idle
 * page tracking. With such an indicator of user pages we can skip isolated
 * pages, but since there are usually not many of them, it will hardly affect
 * the overall result.
 *
 * This function tries to get a user memory page by pfn as described above.
 */
static struct page *page_idle_get_page(unsigned long pfn)
{
        struct page *page;
        pg_data_t *pgdat;

        if (!pfn_valid(pfn))
                return NULL;

        page = pfn_to_page(pfn);
        if (!page || !PageLRU(page) ||
            !get_page_unless_zero(page))
                return NULL;

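        /*
         * Recheck under the LRU lock: the page could have been freed and
         * reallocated for another purpose between the lockless PageLRU()
         * test above and the point where we took our reference.
         */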
        pgdat = page_pgdat(page);
        spin_lock_irq(&pgdat->lru_lock);
        if (unlikely(!PageLRU(page))) {
                put_page(page);
                page = NULL;
        }
        spin_unlock_irq(&pgdat->lru_lock);
        return page;
}

static bool page_idle_clear_pte_refs_one(struct page *page,
                                        struct vm_area_struct *vma,
                                        unsigned long addr, void *arg)
{
        struct page_vma_mapped_walk pvmw = {
                .page = page,
                .vma = vma,
                .address = addr,
        };
        bool referenced = false;

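        /* Visit every PTE and PMD in this VMA that maps the page. */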
        while (page_vma_mapped_walk(&pvmw)) {
                addr = pvmw.address;
                if (pvmw.pte) {
                        /*
                         * For PTE-mapped THP, if one sub page is referenced,
                         * the whole THP is considered referenced.
                         */
                        if (ptep_clear_young_notify(vma, addr, pvmw.pte))
                                referenced = true;
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
                                referenced = true;
                } else {
                        /* unexpected pmd-mapped page? */
                        WARN_ON_ONCE(1);
                }
        }

        if (referenced) {
                clear_page_idle(page);
                /*
                 * We cleared the referenced bit in a mapping to this page. To
                 * avoid interference with page reclaim, mark it young so that
                 * page_referenced() will return > 0.
                 */
                set_page_young(page);
        }
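        /* Always return true so rmap_walk() continues to the next mapping. */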
        return true;
}

static void page_idle_clear_pte_refs(struct page *page)
{
        /*
         * Since rwc.arg is unused, rwc is effectively immutable, so we
         * can make it static const to save some cycles and stack.
         */
        static const struct rmap_walk_control rwc = {
                .rmap_one = page_idle_clear_pte_refs_one,
                .anon_lock = page_lock_anon_vma_read,
        };
        bool need_lock;

        if (!page_mapped(page) ||
            !page_rmapping(page))
                return;

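        /*
         * File-backed and KSM pages need the page lock to stabilize
         * page->mapping for rmap_walk(); plain anonymous pages are covered
         * by the anon_vma lock taken via page_lock_anon_vma_read().
         */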
        need_lock = !PageAnon(page) || PageKsm(page);
        if (need_lock && !trylock_page(page))
                return;

        rmap_walk(page, (struct rmap_walk_control *)&rwc);

        if (need_lock)
                unlock_page(page);
}

static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
                                     struct bin_attribute *attr, char *buf,
                                     loff_t pos, size_t count)
{
        u64 *out = (u64 *)buf;
        struct page *page;
        unsigned long pfn, end_pfn;
        int bit;

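        /*
         * Bit i of the chunk at file offset pos corresponds to the frame
         * with pfn == pos * BITS_PER_BYTE + i, so both pos and count must
         * be multiples of the 8-byte chunk size.
         */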
        if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
                return -EINVAL;

        pfn = pos * BITS_PER_BYTE;
        if (pfn >= max_pfn)
                return 0;

        end_pfn = pfn + count * BITS_PER_BYTE;
        if (end_pfn > max_pfn)
                end_pfn = max_pfn;

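        /* Walk the requested pfn window, assembling one u64 chunk at a time. */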
        for (; pfn < end_pfn; pfn++) {
                bit = pfn % BITMAP_CHUNK_BITS;
                if (!bit)
                        *out = 0ULL;
                page = page_idle_get_page(pfn);
                if (page) {
                        if (page_is_idle(page)) {
                                /*
                                 * The page might have been referenced via a
                                 * pte, in which case it is not idle. Clear
                                 * refs and recheck.
                                 */
                                page_idle_clear_pte_refs(page);
                                if (page_is_idle(page))
                                        *out |= 1ULL << bit;
                        }
                        put_page(page);
                }
                if (bit == BITMAP_CHUNK_BITS - 1)
                        out++;
                cond_resched();
        }
        return (char *)out - buf;
}

static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
                                      struct bin_attribute *attr, char *buf,
                                      loff_t pos, size_t count)
{
        const u64 *in = (u64 *)buf;
        struct page *page;
        unsigned long pfn, end_pfn;
        int bit;

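        /*
         * For every bit set in the input, clear the accessed bits in the
         * page's mappings and mark the page idle; zero bits are ignored,
         * so writers can target individual frames.
         */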
        if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
                return -EINVAL;

        pfn = pos * BITS_PER_BYTE;
        if (pfn >= max_pfn)
                return -ENXIO;

        end_pfn = pfn + count * BITS_PER_BYTE;
        if (end_pfn > max_pfn)
                end_pfn = max_pfn;

        for (; pfn < end_pfn; pfn++) {
                bit = pfn % BITMAP_CHUNK_BITS;
                if ((*in >> bit) & 1) {
                        page = page_idle_get_page(pfn);
                        if (page) {
                                page_idle_clear_pte_refs(page);
                                set_page_idle(page);
                                put_page(page);
                        }
                }
                if (bit == BITMAP_CHUNK_BITS - 1)
                        in++;
                cond_resched();
        }
        return (char *)in - buf;
}

static struct bin_attribute page_idle_bitmap_attr =
                __BIN_ATTR(bitmap, 0600,
                           page_idle_bitmap_read, page_idle_bitmap_write, 0);

static struct bin_attribute *page_idle_bin_attrs[] = {
        &page_idle_bitmap_attr,
        NULL,
};

static const struct attribute_group page_idle_attr_group = {
        .bin_attrs = page_idle_bin_attrs,
        .name = "page_idle",
};

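/*
 * On 64-bit kernels the young and idle bits live directly in page->flags.
 * 32-bit kernels have no spare page flags for them, so the bits are kept
 * in page_ext instead, whose allocation must be requested at boot.
 */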
#ifndef CONFIG_64BIT
static bool need_page_idle(void)
{
        return true;
}
struct page_ext_operations page_idle_ops = {
        .need = need_page_idle,
};
#endif

static int __init page_idle_init(void)
{
        int err;

        err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
        if (err) {
                pr_err("page_idle: register sysfs failed\n");
                return err;
        }
        return 0;
}
subsys_initcall(page_idle_init);

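/*
 * Example (userspace, not part of the kernel build): a minimal sketch of
 * how a tool might drive this interface, assuming sysfs is mounted in the
 * usual place so the bitmap is at /sys/kernel/mm/page_idle/bitmap. The
 * helper name and the pfn range are illustrative only. It marks a range
 * of frames idle, waits while the workload runs, then reads back which
 * frames were left untouched.
 *
 *      #include <fcntl.h>
 *      #include <stdint.h>
 *      #include <stdio.h>
 *      #include <unistd.h>
 *
 *      static int scan_idle(unsigned long start_pfn, unsigned long nr_chunks)
 *      {
 *              uint64_t chunk = ~0ULL; // one u64 chunk covers 64 pfns
 *              off_t off = start_pfn / 64 * sizeof(chunk);
 *              int fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
 *              unsigned long i;
 *
 *              if (fd < 0)
 *                      return -1;
 *              // Setting a bit clears the accessed bits and marks the
 *              // page idle; bits for non-LRU pages are silently ignored.
 *              for (i = 0; i < nr_chunks; i++)
 *                      pwrite(fd, &chunk, sizeof(chunk),
 *                             off + i * sizeof(chunk));
 *              sleep(10);      // let the workload touch its working set
 *              for (i = 0; i < nr_chunks; i++) {
 *                      pread(fd, &chunk, sizeof(chunk),
 *                            off + i * sizeof(chunk));
 *                      // Bits still set are frames nobody referenced.
 *                      printf("chunk %lu: %016llx\n", i,
 *                             (unsigned long long)chunk);
 *              }
 *              close(fd);
 *              return 0;
 *      }
 */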