linux/mm/mapping_dirty_helpers.c
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

/**
 * struct wp_walk - Private struct for pagetable walk callbacks
 * @range: Range for mmu notifiers
 * @tlbflush_start: Address of first modified pte
 * @tlbflush_end: Address of last modified pte + 1
 * @total: Total number of modified ptes
 */
struct wp_walk {
        struct mmu_notifier_range range;
        unsigned long tlbflush_start;
        unsigned long tlbflush_end;
        unsigned long total;
};

/**
 * wp_pte - Write-protect a pte
 * @pte: Pointer to the pte
 * @addr: The start of the virtual address range to write-protect
 * @end: The end of the virtual address range to write-protect
 * @walk: pagetable walk callback argument
 *
 * The function write-protects a pte and records the range in
 * virtual address space of touched ptes for efficient range TLB flushes.
 */
static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end,
                  struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;
        pte_t ptent = *pte;

        if (pte_write(ptent)) {
                pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);

                ptent = pte_wrprotect(old_pte);
                ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);
                wpwalk->total++;
                wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
                wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
                                           addr + PAGE_SIZE);
        }

        return 0;
}

/**
 * struct clean_walk - Private struct for the clean_record_pte function.
 * @base: struct wp_walk we derive from
 * @bitmap_pgoff: Address_space page offset of the first bit in @bitmap
 * @bitmap: Bitmap with one bit for each page offset in the address_space range
 * covered.
 * @start: Address_space page offset of the first modified pte relative
 * to @bitmap_pgoff
 * @end: Address_space page offset of the last modified pte + 1 relative
 * to @bitmap_pgoff
 */
struct clean_walk {
        struct wp_walk base;
        pgoff_t bitmap_pgoff;
        unsigned long *bitmap;
        pgoff_t start;
        pgoff_t end;
};

#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base)

/**
 * clean_record_pte - Clean a pte and record its address space offset in a
 * bitmap
 * @pte: Pointer to the pte
 * @addr: The start of the virtual address range to clean
 * @end: The end of the virtual address range to clean
 * @walk: pagetable walk callback argument
 *
 * The function cleans a pte and records the range in
 * virtual address space of touched ptes for efficient TLB flushes.
 * It also records dirty ptes in a bitmap representing page offsets
 * in the address_space, as well as the first and last of the bits
 * touched.
 */
static int clean_record_pte(pte_t *pte, unsigned long addr,
                            unsigned long end, struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;
        struct clean_walk *cwalk = to_clean_walk(wpwalk);
        pte_t ptent = *pte;

        if (pte_dirty(ptent)) {
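                /*
                 * Page offset of this pte within the address_space, made
                 * relative to the start of @bitmap: the pte's page index
                 * inside the vma plus the vma's file offset, minus the
                 * bitmap's base offset. For example (hypothetical numbers),
                 * with addr - vm_start == 3 pages, vm_pgoff == 16 and
                 * bitmap_pgoff == 16, the pte maps file page 19 and lands
                 * in bit 3 of @bitmap.
                 */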
                pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) +
                        walk->vma->vm_pgoff - cwalk->bitmap_pgoff;
                pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte);

                ptent = pte_mkclean(old_pte);
                ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent);

                wpwalk->total++;
                wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr);
                wpwalk->tlbflush_end = max(wpwalk->tlbflush_end,
                                           addr + PAGE_SIZE);

                __set_bit(pgoff, cwalk->bitmap);
                cwalk->start = min(cwalk->start, pgoff);
                cwalk->end = max(cwalk->end, pgoff + 1);
        }

        return 0;
}

/*
 * wp_clean_pmd_entry - The pagewalk pmd callback.
 *
 * Dirty-tracking should take place on the PTE level, so
 * WARN() if encountering a dirty huge pmd.
 * Furthermore, never split huge pmds, since that currently
 * causes dirty info loss. The pagefault handler should do
 * that if needed.
 */
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        pmd_t pmdval = pmd_read_atomic(pmd);

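        /*
         * A stable pmd here points to a page table, so just let the walk
         * descend to the pte level. Only huge or racing pmds need the
         * special handling below.
         */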
        if (!pmd_trans_unstable(&pmdval))
                return 0;

        if (pmd_none(pmdval)) {
                walk->action = ACTION_AGAIN;
                return 0;
        }

        /* Huge pmd, present or migrated */
        walk->action = ACTION_CONTINUE;
        if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
                WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval));

        return 0;
}

/*
 * wp_clean_pud_entry - The pagewalk pud callback.
 *
 * Dirty-tracking should take place on the PTE level, so
 * WARN() if encountering a dirty huge pud.
 * Furthermore, never split huge puds, since that currently
 * causes dirty info loss. The pagefault handler should do
 * that if needed.
 */
static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
                              struct mm_walk *walk)
{
        pud_t pudval = READ_ONCE(*pud);

        if (!pud_trans_unstable(&pudval))
                return 0;

        if (pud_none(pudval)) {
                walk->action = ACTION_AGAIN;
                return 0;
        }

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
        /* Huge pud */
        walk->action = ACTION_CONTINUE;
        if (pud_trans_huge(pudval) || pud_devmap(pudval))
                WARN_ON(pud_write(pudval) || pud_dirty(pudval));
#endif

        return 0;
}

/*
 * wp_clean_pre_vma - The pagewalk pre_vma callback.
 *
 * The pre_vma callback performs the cache flush, stages the tlb flush
 * and calls the necessary mmu notifiers.
 */
static int wp_clean_pre_vma(unsigned long start, unsigned long end,
                            struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;

        wpwalk->tlbflush_start = end;
        wpwalk->tlbflush_end = start;

        mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0,
                                walk->vma, walk->mm, start, end);
        mmu_notifier_invalidate_range_start(&wpwalk->range);
        flush_cache_range(walk->vma, start, end);

        /*
         * We're not using tlb_gather_mmu() since typically
         * only a small subrange of PTEs are affected, whereas
         * tlb_gather_mmu() records the full range.
         */
        inc_tlb_flush_pending(walk->mm);

        return 0;
}

/*
 * wp_clean_post_vma - The pagewalk post_vma callback.
 *
 * The post_vma callback performs the tlb flush and calls necessary mmu
 * notifiers.
 */
static void wp_clean_post_vma(struct mm_walk *walk)
{
        struct wp_walk *wpwalk = walk->private;

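        /*
         * If another thread has a TLB flush pending for this mm, the
         * recorded subrange of touched ptes may not be enough, so
         * conservatively flush the whole notifier range. Otherwise a
         * flush of just the touched subrange is sufficient.
         */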
        if (mm_tlb_flush_nested(walk->mm))
                flush_tlb_range(walk->vma, wpwalk->range.start,
                                wpwalk->range.end);
        else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start)
                flush_tlb_range(walk->vma, wpwalk->tlbflush_start,
                                wpwalk->tlbflush_end);

        mmu_notifier_invalidate_range_end(&wpwalk->range);
        dec_tlb_flush_pending(walk->mm);
}

/*
 * wp_clean_test_walk - The pagewalk test_walk callback.
 *
 * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas.
 */
static int wp_clean_test_walk(unsigned long start, unsigned long end,
                              struct mm_walk *walk)
{
        unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags);

        /* Skip non-applicable VMAs */
        if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) !=
            (VM_SHARED | VM_MAYWRITE))
                return 1;

        return 0;
}

static const struct mm_walk_ops clean_walk_ops = {
        .pte_entry = clean_record_pte,
        .pmd_entry = wp_clean_pmd_entry,
        .pud_entry = wp_clean_pud_entry,
        .test_walk = wp_clean_test_walk,
        .pre_vma = wp_clean_pre_vma,
        .post_vma = wp_clean_post_vma
};

static const struct mm_walk_ops wp_walk_ops = {
        .pte_entry = wp_pte,
        .pmd_entry = wp_clean_pmd_entry,
        .pud_entry = wp_clean_pud_entry,
        .test_walk = wp_clean_test_walk,
        .pre_vma = wp_clean_pre_vma,
        .post_vma = wp_clean_post_vma
};

/**
 * wp_shared_mapping_range - Write-protect all ptes in an address space range
 * @mapping: The address_space we want to write protect
 * @first_index: The first page offset in the range
 * @nr: Number of incremental page offsets to cover
 *
 * Note: This function currently skips transhuge page-table entries, since
 * it's intended for dirty-tracking on the PTE level. It will warn on
 * encountering transhuge write-enabled entries, though, and can easily be
 * extended to handle them as well.
 *
 * Return: The number of ptes actually write-protected. Note that
 * already write-protected ptes are not counted.
 */
unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr)
{
        struct wp_walk wpwalk = { .total = 0 };

        i_mmap_lock_read(mapping);
        WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops,
                                  &wpwalk));
        i_mmap_unlock_read(mapping);

        return wpwalk.total;
}
EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
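
/*
 * Example (not part of the original file): a minimal sketch of how a
 * driver might use wp_shared_mapping_range() to (re)start dirty tracking
 * on a range of a shared, writable mapping. The names
 * my_start_dirty_tracking(), obj_mapping, obj_first_page and obj_nr_pages
 * are hypothetical; only wp_shared_mapping_range() is taken from this
 * file. After this call, subsequent writes to the covered pages will
 * fault and go through page_mkwrite() or pfn_mkwrite(), as described in
 * the comments above.
 */
#if 0 /* illustration only */
static unsigned long my_start_dirty_tracking(struct address_space *obj_mapping,
                                             pgoff_t obj_first_page,
                                             pgoff_t obj_nr_pages)
{
        /* Write-protect all present ptes in the range. */
        return wp_shared_mapping_range(obj_mapping, obj_first_page,
                                       obj_nr_pages);
}
#endif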

/**
 * clean_record_shared_mapping_range - Clean and record all ptes in an
 * address space range
 * @mapping: The address_space we want to clean
 * @first_index: The first page offset in the range
 * @nr: Number of incremental page offsets to cover
 * @bitmap_pgoff: The page offset of the first bit in @bitmap
 * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to
 * cover the whole range @first_index..@first_index + @nr.
 * @start: Pointer to the number of the first set bit in @bitmap.
 * The value is modified as new bits are set by the function.
 * @end: Pointer to one past the number of the last set bit in @bitmap,
 * or zero if none are set. The value is modified as new bits are set by
 * the function.
 *
 * Note: When this function returns there is no guarantee that a CPU has
 * not already dirtied new ptes. However it will not clean any ptes not
 * reported in the bitmap. The guarantees are as follows:
 * a) All ptes dirty when the function starts executing will end up recorded
 *    in the bitmap.
 * b) All ptes dirtied after that will either remain dirty, be recorded in the
 *    bitmap or both.
 *
 * If a caller needs to make sure all dirty ptes are picked up and no
 * additional ones are added, it first needs to write-protect the
 * address-space range and make sure new writers are blocked in
 * page_mkwrite() or pfn_mkwrite(). Then, after a TLB flush following the
 * write-protection, it can pick up all dirty bits.
 *
 * This function currently skips transhuge page-table entries, since
 * it's intended for dirty-tracking on the PTE level. It will warn on
 * encountering transhuge dirty entries, though, and can easily be extended
 * to handle them as well.
 *
 * Return: The number of dirty ptes actually cleaned.
 */
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end)
{
        bool none_set = (*start >= *end);
        struct clean_walk cwalk = {
                .base = { .total = 0 },
                .bitmap_pgoff = bitmap_pgoff,
                .bitmap = bitmap,
                .start = none_set ? nr : *start,
                .end = none_set ? 0 : *end,
        };

        i_mmap_lock_read(mapping);
        WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops,
                                  &cwalk.base));
        i_mmap_unlock_read(mapping);

        *start = cwalk.start;
        *end = cwalk.end;

        return cwalk.base.total;
}
EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range);

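/*
 * Example (not part of the original file): a minimal sketch of how a
 * caller might collect and clean dirty pages for writeback. The names
 * my_flush_dirty_pages() and my_writeback_page() are hypothetical; the
 * bitmap helpers bitmap_zalloc(), for_each_set_bit() and bitmap_free()
 * and clean_record_shared_mapping_range() are real kernel APIs. Per the
 * comment above, a caller that must not miss any dirty pte would first
 * write-protect the range with wp_shared_mapping_range() and flush TLBs
 * before picking up the dirty bits.
 */
#if 0 /* illustration only */
static void my_flush_dirty_pages(struct address_space *mapping,
                                 pgoff_t first_index, pgoff_t nr)
{
        unsigned long *bitmap = bitmap_zalloc(nr, GFP_KERNEL);
        pgoff_t start = nr, end = 0;    /* start >= end: no bits set yet */
        pgoff_t bit;

        if (!bitmap)
                return;

        /* Clean dirty ptes and record their page offsets in @bitmap. */
        clean_record_shared_mapping_range(mapping, first_index, nr,
                                          first_index, bitmap,
                                          &start, &end);

        /* Bits are relative to @bitmap_pgoff, here equal to @first_index. */
        for_each_set_bit(bit, bitmap, nr)
                my_writeback_page(mapping, first_index + bit);

        bitmap_free(bitmap);
}
#endif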