linux/mm/page_idle.c
<<
>>
Prefs
   1#include <linux/init.h>
   2#include <linux/bootmem.h>
   3#include <linux/fs.h>
   4#include <linux/sysfs.h>
   5#include <linux/kobject.h>
   6#include <linux/mm.h>
   7#include <linux/mmzone.h>
   8#include <linux/pagemap.h>
   9#include <linux/rmap.h>
  10#include <linux/mmu_notifier.h>
  11#include <linux/page_ext.h>
  12#include <linux/page_idle.h>
  13
  14#define BITMAP_CHUNK_SIZE       sizeof(u64)
  15#define BITMAP_CHUNK_BITS       (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
  16
  17/*
  18 * Idle page tracking only considers user memory pages, for other types of
  19 * pages the idle flag is always unset and an attempt to set it is silently
  20 * ignored.
  21 *
  22 * We treat a page as a user memory page if it is on an LRU list, because it is
  23 * always safe to pass such a page to rmap_walk(), which is essential for idle
  24 * page tracking. With such an indicator of user pages we can skip isolated
  25 * pages, but since there are not usually many of them, it will hardly affect
  26 * the overall result.
  27 *
  28 * This function tries to get a user memory page by pfn as described above.
  29 */
  30static struct page *page_idle_get_page(unsigned long pfn)
  31{
  32        struct page *page;
  33        struct zone *zone;
  34
  35        if (!pfn_valid(pfn))
  36                return NULL;
  37
  38        page = pfn_to_page(pfn);
  39        if (!page || !PageLRU(page) ||
  40            !get_page_unless_zero(page))
  41                return NULL;
  42
  43        zone = page_zone(page);
  44        spin_lock_irq(&zone->lru_lock);
  45        if (unlikely(!PageLRU(page))) {
  46                put_page(page);
  47                page = NULL;
  48        }
  49        spin_unlock_irq(&zone->lru_lock);
  50        return page;
  51}
  52
  53static int page_idle_clear_pte_refs_one(struct page *page,
  54                                        struct vm_area_struct *vma,
  55                                        unsigned long addr, void *arg)
  56{
  57        struct mm_struct *mm = vma->vm_mm;
  58        pmd_t *pmd;
  59        pte_t *pte;
  60        spinlock_t *ptl;
  61        bool referenced = false;
  62
  63        if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
  64                return SWAP_AGAIN;
  65
  66        if (pte) {
  67                referenced = ptep_clear_young_notify(vma, addr, pte);
  68                pte_unmap(pte);
  69        } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
  70                referenced = pmdp_clear_young_notify(vma, addr, pmd);
  71        } else {
  72                /* unexpected pmd-mapped page? */
  73                WARN_ON_ONCE(1);
  74        }
  75
  76        spin_unlock(ptl);
  77
  78        if (referenced) {
  79                clear_page_idle(page);
  80                /*
  81                 * We cleared the referenced bit in a mapping to this page. To
  82                 * avoid interference with page reclaim, mark it young so that
  83                 * page_referenced() will return > 0.
  84                 */
  85                set_page_young(page);
  86        }
  87        return SWAP_AGAIN;
  88}
  89
  90static void page_idle_clear_pte_refs(struct page *page)
  91{
  92        /*
  93         * Since rwc.arg is unused, rwc is effectively immutable, so we
  94         * can make it static const to save some cycles and stack.
  95         */
  96        static const struct rmap_walk_control rwc = {
  97                .rmap_one = page_idle_clear_pte_refs_one,
  98                .anon_lock = page_lock_anon_vma_read,
  99        };
 100        bool need_lock;
 101
 102        if (!page_mapped(page) ||
 103            !page_rmapping(page))
 104                return;
 105
 106        need_lock = !PageAnon(page) || PageKsm(page);
 107        if (need_lock && !trylock_page(page))
 108                return;
 109
 110        rmap_walk(page, (struct rmap_walk_control *)&rwc);
 111
 112        if (need_lock)
 113                unlock_page(page);
 114}
 115
 116static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
 117                                     struct bin_attribute *attr, char *buf,
 118                                     loff_t pos, size_t count)
 119{
 120        u64 *out = (u64 *)buf;
 121        struct page *page;
 122        unsigned long pfn, end_pfn;
 123        int bit;
 124
 125        if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
 126                return -EINVAL;
 127
 128        pfn = pos * BITS_PER_BYTE;
 129        if (pfn >= max_pfn)
 130                return 0;
 131
 132        end_pfn = pfn + count * BITS_PER_BYTE;
 133        if (end_pfn > max_pfn)
 134                end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
 135
 136        for (; pfn < end_pfn; pfn++) {
 137                bit = pfn % BITMAP_CHUNK_BITS;
 138                if (!bit)
 139                        *out = 0ULL;
 140                page = page_idle_get_page(pfn);
 141                if (page) {
 142                        if (page_is_idle(page)) {
 143                                /*
 144                                 * The page might have been referenced via a
 145                                 * pte, in which case it is not idle. Clear
 146                                 * refs and recheck.
 147                                 */
 148                                page_idle_clear_pte_refs(page);
 149                                if (page_is_idle(page))
 150                                        *out |= 1ULL << bit;
 151                        }
 152                        put_page(page);
 153                }
 154                if (bit == BITMAP_CHUNK_BITS - 1)
 155                        out++;
 156                cond_resched();
 157        }
 158        return (char *)out - buf;
 159}
 160
 161static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
 162                                      struct bin_attribute *attr, char *buf,
 163                                      loff_t pos, size_t count)
 164{
 165        const u64 *in = (u64 *)buf;
 166        struct page *page;
 167        unsigned long pfn, end_pfn;
 168        int bit;
 169
 170        if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
 171                return -EINVAL;
 172
 173        pfn = pos * BITS_PER_BYTE;
 174        if (pfn >= max_pfn)
 175                return -ENXIO;
 176
 177        end_pfn = pfn + count * BITS_PER_BYTE;
 178        if (end_pfn > max_pfn)
 179                end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
 180
 181        for (; pfn < end_pfn; pfn++) {
 182                bit = pfn % BITMAP_CHUNK_BITS;
 183                if ((*in >> bit) & 1) {
 184                        page = page_idle_get_page(pfn);
 185                        if (page) {
 186                                page_idle_clear_pte_refs(page);
 187                                set_page_idle(page);
 188                                put_page(page);
 189                        }
 190                }
 191                if (bit == BITMAP_CHUNK_BITS - 1)
 192                        in++;
 193                cond_resched();
 194        }
 195        return (char *)in - buf;
 196}
 197
 198static struct bin_attribute page_idle_bitmap_attr =
 199                __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
 200                           page_idle_bitmap_read, page_idle_bitmap_write, 0);
 201
 202static struct bin_attribute *page_idle_bin_attrs[] = {
 203        &page_idle_bitmap_attr,
 204        NULL,
 205};
 206
 207static struct attribute_group page_idle_attr_group = {
 208        .bin_attrs = page_idle_bin_attrs,
 209        .name = "page_idle",
 210};
 211
 212#ifndef CONFIG_64BIT
 213static bool need_page_idle(void)
 214{
 215        return true;
 216}
 217struct page_ext_operations page_idle_ops = {
 218        .need = need_page_idle,
 219};
 220#endif
 221
 222static int __init page_idle_init(void)
 223{
 224        int err;
 225
 226        err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
 227        if (err) {
 228                pr_err("page_idle: register sysfs failed\n");
 229                return err;
 230        }
 231        return 0;
 232}
 233subsys_initcall(page_idle_init);
 234