linux/mm/pagewalk.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>

static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pte_t *pte;
        int err = 0;
        const struct mm_walk_ops *ops = walk->ops;

        pte = pte_offset_map(pmd, addr);
        for (;;) {
                err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
                if (err)
                        break;
                addr += PAGE_SIZE;
                if (addr == end)
                        break;
                pte++;
        }

        pte_unmap(pte);
        return err;
}

static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pmd = pmd_offset(pud, addr);
        do {
again:
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd) || !walk->vma) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                /*
                 * This implies that each ->pmd_entry() handler
                 * needs to know about pmd_trans_huge() pmds
                 */
                if (ops->pmd_entry)
                        err = ops->pmd_entry(pmd, addr, next, walk);
                if (err)
                        break;

                /*
                 * Check this here so we only break down trans_huge
                 * pages when we _need_ to
                 */
                if (!ops->pte_entry)
                        continue;

                split_huge_pmd(walk->vma, pmd, addr);
                if (pmd_trans_unstable(pmd))
                        goto again;
                err = walk_pte_range(pmd, addr, next, walk);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}
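
/*
 * Example (not part of pagewalk.c): a minimal ->pmd_entry() sketch.
 * As the comment in walk_pmd_range() notes, pmd_entry() handlers are
 * called for huge pmds as well, so they must either deal with
 * pmd_trans_huge() entries themselves or return 0 and let the walker
 * split the pmd before ->pte_entry() runs.  The THP counter passed
 * via ->private is hypothetical.
 */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        unsigned long *thp_count = walk->private;
        spinlock_t *ptl;

        ptl = pmd_trans_huge_lock(pmd, walk->vma);
        if (ptl) {
                (*thp_count)++;         /* pmd maps a huge page */
                spin_unlock(ptl);
        }
        return 0;                       /* keep walking */
}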

static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pud_t *pud;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pud = pud_offset(p4d, addr);
        do {
 again:
                next = pud_addr_end(addr, end);
                if (pud_none(*pud) || !walk->vma) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }

                if (ops->pud_entry) {
                        spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);

                        if (ptl) {
                                err = ops->pud_entry(pud, addr, next, walk);
                                spin_unlock(ptl);
                                if (err)
                                        break;
                                continue;
                        }
                }

                split_huge_pud(walk->vma, pud, addr);
                if (pud_none(*pud))
                        goto again;

                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_pmd_range(pud, addr, next, walk);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);
                if (p4d_none_or_clear_bad(p4d)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_pud_range(p4d, addr, next, walk);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

static int walk_pgd_range(unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
{
        pgd_t *pgd;
        unsigned long next;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        pgd = pgd_offset(walk->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd)) {
                        if (ops->pte_hole)
                                err = ops->pte_hole(addr, next, walk);
                        if (err)
                                break;
                        continue;
                }
                if (ops->pmd_entry || ops->pte_entry)
                        err = walk_p4d_range(pgd, addr, next, walk);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        return err;
}

#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
                                       unsigned long end)
{
        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h);
        return boundary < end ? boundary : end;
}

static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        unsigned long next;
        unsigned long hmask = huge_page_mask(h);
        unsigned long sz = huge_page_size(h);
        pte_t *pte;
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;

        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask, sz);

                if (pte)
                        err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
                else if (ops->pte_hole)
                        err = ops->pte_hole(addr, next, walk);

                if (err)
                        break;
        } while (addr = next, addr != end);

        return err;
}

#else /* CONFIG_HUGETLB_PAGE */
static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        return 0;
}

#endif /* CONFIG_HUGETLB_PAGE */
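
/*
 * Example (not part of pagewalk.c): a minimal ->hugetlb_entry() sketch.
 * walk_hugetlb_range() passes the huge page mask, so the handler can
 * derive the page size as ~hmask + 1.  The byte counter passed via
 * ->private is hypothetical.
 */
#ifdef CONFIG_HUGETLB_PAGE
static int example_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                 unsigned long addr, unsigned long next,
                                 struct mm_walk *walk)
{
        unsigned long *bytes_mapped = walk->private;

        if (!huge_pte_none(huge_ptep_get(pte)))
                *bytes_mapped += ~hmask + 1;    /* one huge page worth */

        return 0;
}
#endif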

/*
 * Decide whether we really walk over the current vma on [@start, @end)
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. A negative value means
 * an error, in which case we abort the current walk.
 */
static int walk_page_test(unsigned long start, unsigned long end,
                        struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        const struct mm_walk_ops *ops = walk->ops;

        if (ops->test_walk)
                return ops->test_walk(start, end, walk);

        /*
         * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
         * range, so we don't walk over it as we do for normal vmas. However,
         * some callers are interested in handling hole ranges and they don't
         * want to just ignore any single address range. Such users certainly
         * define their ->pte_hole() callbacks, so let's delegate them to handle
         * vma(VM_PFNMAP).
         */
        if (vma->vm_flags & VM_PFNMAP) {
                int err = 1;

                if (ops->pte_hole)
                        err = ops->pte_hole(start, end, walk);
                return err ? err : 1;
        }
        return 0;
}
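
/*
 * Example (not part of pagewalk.c): a ->test_walk() sketch following the
 * convention above: return 0 to walk the vma, 1 to skip it, and a
 * negative value to abort the walk.  Skipping mlock()ed vmas is just an
 * arbitrary illustration.
 */
static int example_test_walk(unsigned long start, unsigned long end,
                             struct mm_walk *walk)
{
        if (walk->vma->vm_flags & VM_LOCKED)
                return 1;       /* skip this vma, keep walking the rest */
        return 0;               /* walk this vma */
}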

static int __walk_page_range(unsigned long start, unsigned long end,
                        struct mm_walk *walk)
{
        int err = 0;
        struct vm_area_struct *vma = walk->vma;

        if (vma && is_vm_hugetlb_page(vma)) {
                if (walk->ops->hugetlb_entry)
                        err = walk_hugetlb_range(start, end, walk);
        } else
                err = walk_pgd_range(start, end, walk);

        return err;
}

/**
 * walk_page_range - walk page table with caller specific callbacks
 * @mm:         mm_struct representing the target process of page table walk
 * @start:      start address of the virtual address range
 * @end:        end address of the virtual address range
 * @ops:        operation to call during the walk
 * @private:    private data for callbacks' usage
 *
 * Recursively walk the page table tree of the process represented by @mm
 * within the virtual address range [@start, @end). During walking, we can do
 * some caller-specific work for each entry, by setting up pmd_entry(),
 * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
 * callbacks, the associated entries/pages are just ignored.
 * The return values of these callbacks are commonly defined as follows:
 *
 *  - 0  : succeeded to handle the current entry, and if you don't reach the
 *         end address yet, continue to walk.
 *  - >0 : succeeded to handle the current entry, and return to the caller
 *         with caller specific value.
 *  - <0 : failed to handle the current entry, and return to the caller
 *         with error code.
 *
 * Before starting to walk the page table, some callers want to check whether
 * they really want to walk over the current vma, typically by checking
 * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
 * purpose.
 *
 * struct mm_walk keeps current values of some common data like vma and pmd,
 * which are useful for the access from callbacks. If you want to pass some
 * caller-specific data to callbacks, @private should be helpful.
 *
 * Locking:
 *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
 *   because these functions traverse the vma list and/or access vma data.
 */
int walk_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private)
{
        int err = 0;
        unsigned long next;
        struct vm_area_struct *vma;
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = mm,
                .private        = private,
        };

        if (start >= end)
                return -EINVAL;

        if (!walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        vma = find_vma(walk.mm, start);
        do {
                if (!vma) { /* after the last vma */
                        walk.vma = NULL;
                        next = end;
                } else if (start < vma->vm_start) { /* outside vma */
                        walk.vma = NULL;
                        next = min(end, vma->vm_start);
                } else { /* inside vma */
                        walk.vma = vma;
                        next = min(end, vma->vm_end);
                        vma = vma->vm_next;

                        err = walk_page_test(start, next, &walk);
                        if (err > 0) {
                                /*
                                 * positive return values are purely for
                                 * controlling the pagewalk, so should never
                                 * be passed to the callers.
                                 */
                                err = 0;
                                continue;
                        }
                        if (err < 0)
                                break;
                }
                if (walk.vma || walk.ops->pte_hole)
                        err = __walk_page_range(start, next, &walk);
                if (err)
                        break;
        } while (start = next, start < end);
        return err;
}
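
/*
 * Example (not part of pagewalk.c): how a caller might use
 * walk_page_range().  The pte-counting callback, count_ops and
 * count_present_ptes() are hypothetical; note that ->pte_entry() runs
 * without the pte lock in this walker, so a real user may need stronger
 * synchronization.  Per the locking comment above, mmap_sem is held
 * across the walk.
 */
static int count_pte_entry(pte_t *pte, unsigned long addr,
                           unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_present = walk->private;

        if (pte_present(*pte))
                (*nr_present)++;
        return 0;
}

static const struct mm_walk_ops count_ops = {
        .pte_entry      = count_pte_entry,
};

static unsigned long count_present_ptes(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
{
        unsigned long nr_present = 0;

        down_read(&mm->mmap_sem);
        walk_page_range(mm, start, end, &count_ops, &nr_present);
        up_read(&mm->mmap_sem);

        return nr_present;
}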

int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
                void *private)
{
        struct mm_walk walk = {
                .ops            = ops,
                .mm             = vma->vm_mm,
                .vma            = vma,
                .private        = private,
        };
        int err;

        if (!walk.mm)
                return -EINVAL;

        lockdep_assert_held(&walk.mm->mmap_sem);

        err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
        if (err > 0)
                return 0;
        if (err < 0)
                return err;
        return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}