linux/mm/mapping_dirty_helpers.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

/**
 * struct wp_walk - Private struct for pagetable walk callbacks
 * @range: Range for mmu notifiers
 * @tlbflush_start: Address of first modified pte
 * @tlbflush_end: Address of last modified pte + 1
 * @total: Total number of modified ptes
 */
struct wp_walk {
        struct mmu_notifier_range range;
        unsigned long tlbflush_start;
        unsigned long tlbflush_end;
        unsigned long total;
};

/**
 * wp_pte - Write-protect a pte
 * @pte: Pointer to the pte
 * @addr: The virtual page address
 * @end: The end of the virtual address range covered by the pte
 * @walk: pagetable walk callback argument
 *
 * The function write-protects a pte and records the range in
 * virtual address space of touched ptes for efficient range TLB flushes.
 */
static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
                  struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;
        pte_t ptent = *pte;

        if (pte_write(ptent)) {
                pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);

                ptent = pte_wrprotect(old_pte);
                ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
                wpwalk->total++;
                wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
                wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
                                           addr + PAGE_SIZE);
        }

        return 0;
}

/**
 * struct clean_walk - Private struct for the clean_record_pte function.
 * @base: struct wp_walk we derive from
 * @bitmap_pgoff: Address_space page offset of the first bit in @bitmap
 * @bitmap: Bitmap with one bit for each page offset in the address_space range
 * covered.
 * @start: Address_space page offset of first modified pte relative
 * to @bitmap_pgoff
 * @end: Address_space page offset of last modified pte + 1, relative
 * to @bitmap_pgoff
 */
struct clean_walk {
        struct wp_walk base;
        pgoff_t bitmap_pgoff;
        unsigned long *bitmap;
        pgoff_t start;
        pgoff_t end;
};

#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base)

/**
 * clean_record_pte - Clean a pte and record its address space offset in a
 * bitmap
 * @pte: Pointer to the pte
 * @addr: The virtual page address
 * @end: The end of the virtual address range covered by the pte
 * @walk: pagetable walk callback argument
 *
 * The function cleans a pte and records the range in
 * virtual address space of touched ptes for efficient TLB flushes.
 * It also records dirty ptes in a bitmap representing page offsets
 * in the address_space, as well as the first and last of the bits
 * touched.
 */
static int clean_record_pte(pte_t *pte, unsigned long addr,
                            unsigned long end, struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;
        struct clean_walk *cwalk = to_clean_walk(wpwalk);
        pte_t ptent = *pte;

        if (pte_dirty(ptent)) {
                pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
                        walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
                pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);

                ptent = pte_mkclean(old_pte);
                ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);

                wpwalk->total++;
                wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
                wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
                                           addr + PAGE_SIZE);

                __set_bit(pgoff, cwalk->bitmap);
                cwalk->start = min(cwalk->start, pgoff);
                cwalk->end = max(cwalk->end, pgoff + 1);
        }

        return 0;
}
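
/*
 * Worked example of the pgoff arithmetic in clean_record_pte() above, with
 * made-up numbers and 4 KiB pages: for a vma with vm_start = 0x7f0000200000
 * and vm_pgoff = 16, a dirty pte at addr = 0x7f0000203000 corresponds to
 * address_space page offset
 * ((0x7f0000203000 - 0x7f0000200000) >> PAGE_SHIFT) + 16 = 3 + 16 = 19,
 * and with bitmap_pgoff = 16 the bit set in the bitmap is 19 - 16 = 3.
 */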

/*
 * wp_clean_pmd_entry - The pagewalk pmd callback.
 *
 * Dirty-tracking should take place on the PTE level, so
 * WARN() if encountering a dirty huge pmd.
 * Furthermore, never split huge pmds, since that currently
 * causes dirty info loss. The pagefault handler should do
 * that if needed.
 */
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        pmd_t pmdval = pmd_read_atomic(pmd);

        if (!pmd_trans_unstable(&pmdval))
                return 0;

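        /*
         * A pmd that is unstable yet none was most likely zapped or is
         * being modified under us (e.g. a concurrent huge page collapse);
         * ACTION_AGAIN makes the pagewalk code re-read and retry this entry.
         */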
        if (pmd_none(pmdval)) {
                walk->action = ACTION_AGAIN;
                return 0;
        }

        /* Huge pmd, present or migrated */
        walk->action = ACTION_CONTINUE;
        if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
                WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));

        return 0;
}

/*
 * wp_clean_pud_entry - The pagewalk pud callback.
 *
 * Dirty-tracking should take place on the PTE level, so
 * WARN() if encountering a dirty huge pud.
 * Furthermore, never split huge puds, since that currently
 * causes dirty info loss. The pagefault handler should do
 * that if needed.
 */
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        pud_t pudval = READ_ONCE(*pud);

        if (!pud_trans_unstable(&pudval))
                return 0;

        if (pud_none(pudval)) {
                walk->action = ACTION_AGAIN;
                return 0;
        }

        /* Huge pud */
        walk->action = ACTION_CONTINUE;
        if (pud_trans_huge(pudval) || pud_devmap(pudval))
                WARN_ON(pud_write(pudval) || pud_dirty(pudval));

        return 0;
}

/*
 * wp_clean_pre_vma - The pagewalk pre_vma callback.
 *
 * The pre_vma callback performs the cache flush, stages the tlb flush
 * and calls the necessary mmu notifiers.
 */
static int wp_clean_pre_vma(unsigned long start, unsigned long end,
                            struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;

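        /*
         * Start with an empty (inverted) flush range; the min()/max()
         * updates in the pte callbacks grow it to cover exactly the ptes
         * that were actually modified.
         */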
        wpwalk->tlbflush_start = end;
        wpwalk->tlbflush_end = start;

        mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
                                walk->vma, walk->mm, start, end);
        mmu_notifier_invalidate_range_start(&wpwalk->range);
        flush_cache_range(walk->vma, start, end);

        /*
         * We're not using tlb_gather_mmu() since typically
         * only a small subrange of PTEs are affected, whereas
         * tlb_gather_mmu() records the full range.
         */
        inc_tlb_flush_pending(walk->mm);

        return 0;
}

/*
 * wp_clean_post_vma - The pagewalk post_vma callback.
 *
 * The post_vma callback performs the tlb flush and calls necessary mmu
 * notifiers.
 */
static void wp_clean_post_vma(struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;

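        /*
         * If other pte modifications with deferred TLB flushes are pending
         * on this mm, the subrange recorded by this walk is not sufficient;
         * conservatively flush the whole notifier range. Otherwise flushing
         * only the ptes we actually touched is enough.
         */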
        if (mm_tlb_flush_nested(walk->mm))
                flush_tlb_range(walk->vma, wpwalk->range.start,
                                wpwalk->range.end);
        else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start)
                flush_tlb_range(walk->vma, wpwalk->tlbflush_start,
                                wpwalk->tlbflush_end);

        mmu_notifier_invalidate_range_end(&wpwalk->range);
        dec_tlb_flush_pending(walk->mm);
}

/*
 * wp_clean_test_walk - The pagewalk test_walk callback.
 *
 * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas.
 */
static int wp_clean_test_walk(unsigned long start, unsigned long end,
                              struct mm_walk *walk)
{
        unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);

        /* Skip non-applicable VMAs */
        if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
            (VM_SHARED | VM_MAYWRITE))
                return 1;

        return 0;
}

static const struct mm_walk_ops clean_walk_ops = {
        .pte_entry = clean_record_pte,
        .pmd_entry = wp_clean_pmd_entry,
        .pud_entry = wp_clean_pud_entry,
        .test_walk = wp_clean_test_walk,
        .pre_vma = wp_clean_pre_vma,
        .post_vma = wp_clean_post_vma
};

static const struct mm_walk_ops wp_walk_ops = {
        .pte_entry = wp_pte,
        .pmd_entry = wp_clean_pmd_entry,
        .pud_entry = wp_clean_pud_entry,
        .test_walk = wp_clean_test_walk,
        .pre_vma = wp_clean_pre_vma,
        .post_vma = wp_clean_post_vma
};

/**
 * wp_shared_mapping_range - Write-protect all ptes in an address space range
 * @mapping: The address_space we want to write protect
 * @first_index: The first page offset in the range
 * @nr: Number of incremental page offsets to cover
 *
 * Note: This function currently skips transhuge page-table entries, since
 * it's intended for dirty-tracking on the PTE level. It will warn on
 * encountering transhuge write-enabled entries, though, and can easily be
 * extended to handle them as well.
 *
 * Return: The number of ptes actually write-protected. Note that
 * already write-protected ptes are not counted.
 */
unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr)
{
        struct wp_walk wpwalk = { .total = 0 };

        i_mmap_lock_read(mapping);
        WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops,
                                  &wpwalk));
        i_mmap_unlock_read(mapping);

        return wpwalk.total;
}
EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
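
/*
 * Illustrative sketch, compiled out: how a hypothetical driver could use
 * wp_shared_mapping_range() to (re)start dirty tracking on a shared,
 * writable mapping. The function name and pr_debug() message are example
 * assumptions, not an API of this file. After the call, the first write to
 * any of the write-protected pages faults into the owner's page_mkwrite()
 * or pfn_mkwrite() handler.
 */
#if 0
static void example_start_dirty_tracking(struct address_space *mapping,
                                         pgoff_t first_index, pgoff_t nr)
{
        unsigned long wrprotected;

        wrprotected = wp_shared_mapping_range(mapping, first_index, nr);
        pr_debug("write-protected %lu ptes\n", wrprotected);
}
#endif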

/**
 * clean_record_shared_mapping_range - Clean and record all ptes in an
 * address space range
 * @mapping: The address_space we want to clean
 * @first_index: The first page offset in the range
 * @nr: Number of incremental page offsets to cover
 * @bitmap_pgoff: The page offset of the first bit in @bitmap
 * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
 * cover the whole range @first_index..@first_index + @nr.
 * @start: Pointer to the number of the first set bit in @bitmap.
 * The value is modified as new bits are set by the function.
 * @end: Pointer to one past the number of the last set bit in @bitmap,
 * or zero if none set. The value is modified as new bits are set by
 * the function.
 *
 * Note: When this function returns there is no guarantee that a CPU has
 * not already dirtied new ptes. However it will not clean any ptes not
 * reported in the bitmap. The guarantees are as follows:
 * a) All ptes dirty when the function starts executing will end up recorded
 *    in the bitmap.
 * b) All ptes dirtied after that will either remain dirty, be recorded in the
 *    bitmap or both.
 *
 * If a caller needs to make sure all dirty ptes are picked up and none
 * additional are added, it first needs to write-protect the address-space
 * range and make sure new writers are blocked in page_mkwrite() or
 * pfn_mkwrite(). Then, after a TLB flush following the write-protection,
 * it can pick up all dirty bits.
 *
 * Note: This function currently skips transhuge page-table entries, since
 * it's intended for dirty-tracking on the PTE level. It will warn on
 * encountering transhuge dirty entries, though, and can easily be extended
 * to handle them as well.
 *
 * Return: The number of dirty ptes actually cleaned.
 */
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end)
{
        bool none_set = (*start >= *end);
        struct clean_walk cwalk = {
                .base = { .total = 0 },
                .bitmap_pgoff = bitmap_pgoff,
                .bitmap = bitmap,
                .start = none_set ? nr : *start,
                .end = none_set ? 0 : *end,
        };

        i_mmap_lock_read(mapping);
        WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops,
                                  &cwalk.base));
        i_mmap_unlock_read(mapping);

        *start = cwalk.start;
        *end = cwalk.end;

        return cwalk.base.total;
}
EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range);
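
/*
 * Illustrative sketch, compiled out: the pick-up-all-dirty-bits pattern
 * described in the kerneldoc above. The function name, the handle_dirty()
 * callback and the assumption that @bitmap is pre-zeroed and holds at least
 * @nr bits are examples, not part of this file's API. New writers are
 * expected to be blocked in the caller's page_mkwrite() / pfn_mkwrite()
 * while this runs.
 */
#if 0
static void example_pick_up_dirty(struct address_space *mapping,
                                  pgoff_t first_index, pgoff_t nr,
                                  unsigned long *bitmap,
                                  void (*handle_dirty)(pgoff_t pgoff))
{
        pgoff_t start = 0, end = 0;     /* start >= end: no bits set yet */
        unsigned long bit;

        /* Write-protect first so that new writes fault into page_mkwrite(). */
        wp_shared_mapping_range(mapping, first_index, nr);

        /* Clean dirty ptes and record their offsets in @bitmap. */
        clean_record_shared_mapping_range(mapping, first_index, nr,
                                          first_index, bitmap, &start, &end);

        /* Hand the recorded page offsets to the consumer. */
        for_each_set_bit(bit, bitmap, nr)
                handle_dirty(first_index + bit);
}
#endif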