linux/arch/x86/mm/dump_pagetables.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Debug helper to dump the current kernel pagetables of the system
   4 * so that we can see what the various memory ranges are set to.
   5 *
   6 * (C) Copyright 2008 Intel Corporation
   7 *
   8 * Author: Arjan van de Ven <arjan@linux.intel.com>
   9 */
  10
  11#include <linux/debugfs.h>
  12#include <linux/kasan.h>
  13#include <linux/mm.h>
  14#include <linux/init.h>
  15#include <linux/sched.h>
  16#include <linux/seq_file.h>
  17#include <linux/highmem.h>
  18#include <linux/pci.h>
  19#include <linux/ptdump.h>
  20
  21#include <asm/e820/types.h>
  22
  23/*
  24 * The dumper groups pagetable entries of the same type into one, and for
  25 * that it needs to keep some state when walking, and flush this state
  26 * when a "break" in the continuity is found.
  27 */
  28struct pg_state {
  29        struct ptdump_state ptdump;
  30        int level;
  31        pgprotval_t current_prot;
  32        pgprotval_t effective_prot;
  33        pgprotval_t prot_levels[5];
  34        unsigned long start_address;
  35        const struct addr_marker *marker;
  36        unsigned long lines;
  37        bool to_dmesg;
  38        bool check_wx;
  39        unsigned long wx_pages;
  40        struct seq_file *seq;
  41};
  42
  43struct addr_marker {
  44        unsigned long start_address;
  45        const char *name;
  46        unsigned long max_lines;
  47};
  48
  49/* Address space markers hints */
  50
  51#ifdef CONFIG_X86_64
  52
  53enum address_markers_idx {
  54        USER_SPACE_NR = 0,
  55        KERNEL_SPACE_NR,
  56#ifdef CONFIG_MODIFY_LDT_SYSCALL
  57        LDT_NR,
  58#endif
  59        LOW_KERNEL_NR,
  60        VMALLOC_START_NR,
  61        VMEMMAP_START_NR,
  62#ifdef CONFIG_KASAN
  63        KASAN_SHADOW_START_NR,
  64        KASAN_SHADOW_END_NR,
  65#endif
  66        CPU_ENTRY_AREA_NR,
  67#ifdef CONFIG_X86_ESPFIX64
  68        ESPFIX_START_NR,
  69#endif
  70#ifdef CONFIG_EFI
  71        EFI_END_NR,
  72#endif
  73        HIGH_KERNEL_NR,
  74        MODULES_VADDR_NR,
  75        MODULES_END_NR,
  76        FIXADDR_START_NR,
  77        END_OF_SPACE_NR,
  78};
  79
  80static struct addr_marker address_markers[] = {
  81        [USER_SPACE_NR]         = { 0,                  "User Space" },
  82        [KERNEL_SPACE_NR]       = { (1UL << 63),        "Kernel Space" },
  83        [LOW_KERNEL_NR]         = { 0UL,                "Low Kernel Mapping" },
  84        [VMALLOC_START_NR]      = { 0UL,                "vmalloc() Area" },
  85        [VMEMMAP_START_NR]      = { 0UL,                "Vmemmap" },
  86#ifdef CONFIG_KASAN
  87        /*
  88         * These fields get initialized with the (dynamic)
  89         * KASAN_SHADOW_{START,END} values in pt_dump_init().
  90         */
  91        [KASAN_SHADOW_START_NR] = { 0UL,                "KASAN shadow" },
  92        [KASAN_SHADOW_END_NR]   = { 0UL,                "KASAN shadow end" },
  93#endif
  94#ifdef CONFIG_MODIFY_LDT_SYSCALL
  95        [LDT_NR]                = { 0UL,                "LDT remap" },
  96#endif
  97        [CPU_ENTRY_AREA_NR]     = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
  98#ifdef CONFIG_X86_ESPFIX64
  99        [ESPFIX_START_NR]       = { ESPFIX_BASE_ADDR,   "ESPfix Area", 16 },
 100#endif
 101#ifdef CONFIG_EFI
 102        [EFI_END_NR]            = { EFI_VA_END,         "EFI Runtime Services" },
 103#endif
 104        [HIGH_KERNEL_NR]        = { __START_KERNEL_map, "High Kernel Mapping" },
 105        [MODULES_VADDR_NR]      = { MODULES_VADDR,      "Modules" },
 106        [MODULES_END_NR]        = { MODULES_END,        "End Modules" },
 107        [FIXADDR_START_NR]      = { FIXADDR_START,      "Fixmap Area" },
 108        [END_OF_SPACE_NR]       = { -1,                 NULL }
 109};
 110
 111#define INIT_PGD        ((pgd_t *) &init_top_pgt)
 112
 113#else /* CONFIG_X86_64 */
 114
 115enum address_markers_idx {
 116        USER_SPACE_NR = 0,
 117        KERNEL_SPACE_NR,
 118        VMALLOC_START_NR,
 119        VMALLOC_END_NR,
 120#ifdef CONFIG_HIGHMEM
 121        PKMAP_BASE_NR,
 122#endif
 123#ifdef CONFIG_MODIFY_LDT_SYSCALL
 124        LDT_NR,
 125#endif
 126        CPU_ENTRY_AREA_NR,
 127        FIXADDR_START_NR,
 128        END_OF_SPACE_NR,
 129};
 130
 131static struct addr_marker address_markers[] = {
 132        [USER_SPACE_NR]         = { 0,                  "User Space" },
 133        [KERNEL_SPACE_NR]       = { PAGE_OFFSET,        "Kernel Mapping" },
 134        [VMALLOC_START_NR]      = { 0UL,                "vmalloc() Area" },
 135        [VMALLOC_END_NR]        = { 0UL,                "vmalloc() End" },
 136#ifdef CONFIG_HIGHMEM
 137        [PKMAP_BASE_NR]         = { 0UL,                "Persistent kmap() Area" },
 138#endif
 139#ifdef CONFIG_MODIFY_LDT_SYSCALL
 140        [LDT_NR]                = { 0UL,                "LDT remap" },
 141#endif
 142        [CPU_ENTRY_AREA_NR]     = { 0UL,                "CPU entry area" },
 143        [FIXADDR_START_NR]      = { 0UL,                "Fixmap area" },
 144        [END_OF_SPACE_NR]       = { -1,                 NULL }
 145};
 146
 147#define INIT_PGD        (swapper_pg_dir)
 148
 149#endif /* !CONFIG_X86_64 */
 150
 151/* Multipliers for offsets within the PTEs */
 152#define PTE_LEVEL_MULT (PAGE_SIZE)
 153#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
 154#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
 155#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
 156#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
 157
 158#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)           \
 159({                                                              \
 160        if (to_dmesg)                                   \
 161                printk(KERN_INFO fmt, ##args);                  \
 162        else                                                    \
 163                if (m)                                          \
 164                        seq_printf(m, fmt, ##args);             \
 165})
 166
 167#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)          \
 168({                                                              \
 169        if (to_dmesg)                                   \
 170                printk(KERN_CONT fmt, ##args);                  \
 171        else                                                    \
 172                if (m)                                          \
 173                        seq_printf(m, fmt, ##args);             \
 174})
 175
 176/*
 177 * Print a readable form of a pgprot_t to the seq_file
 178 */
 179static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg)
 180{
 181        static const char * const level_name[] =
 182                { "pgd", "p4d", "pud", "pmd", "pte" };
 183
 184        if (!(pr & _PAGE_PRESENT)) {
 185                /* Not present */
 186                pt_dump_cont_printf(m, dmsg, "                              ");
 187        } else {
 188                if (pr & _PAGE_USER)
 189                        pt_dump_cont_printf(m, dmsg, "USR ");
 190                else
 191                        pt_dump_cont_printf(m, dmsg, "    ");
 192                if (pr & _PAGE_RW)
 193                        pt_dump_cont_printf(m, dmsg, "RW ");
 194                else
 195                        pt_dump_cont_printf(m, dmsg, "ro ");
 196                if (pr & _PAGE_PWT)
 197                        pt_dump_cont_printf(m, dmsg, "PWT ");
 198                else
 199                        pt_dump_cont_printf(m, dmsg, "    ");
 200                if (pr & _PAGE_PCD)
 201                        pt_dump_cont_printf(m, dmsg, "PCD ");
 202                else
 203                        pt_dump_cont_printf(m, dmsg, "    ");
 204
 205                /* Bit 7 has a different meaning on level 3 vs 4 */
 206                if (level <= 3 && pr & _PAGE_PSE)
 207                        pt_dump_cont_printf(m, dmsg, "PSE ");
 208                else
 209                        pt_dump_cont_printf(m, dmsg, "    ");
 210                if ((level == 4 && pr & _PAGE_PAT) ||
 211                    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
 212                        pt_dump_cont_printf(m, dmsg, "PAT ");
 213                else
 214                        pt_dump_cont_printf(m, dmsg, "    ");
 215                if (pr & _PAGE_GLOBAL)
 216                        pt_dump_cont_printf(m, dmsg, "GLB ");
 217                else
 218                        pt_dump_cont_printf(m, dmsg, "    ");
 219                if (pr & _PAGE_NX)
 220                        pt_dump_cont_printf(m, dmsg, "NX ");
 221                else
 222                        pt_dump_cont_printf(m, dmsg, "x  ");
 223        }
 224        pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
 225}
 226
 227static void note_wx(struct pg_state *st, unsigned long addr)
 228{
 229        unsigned long npages;
 230
 231        npages = (addr - st->start_address) / PAGE_SIZE;
 232
 233#ifdef CONFIG_PCI_BIOS
 234        /*
 235         * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
 236         * Inform about it, but avoid the warning.
 237         */
 238        if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
 239            addr <= PAGE_OFFSET + BIOS_END) {
 240                pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
 241                return;
 242        }
 243#endif
 244        /* Account the WX pages */
 245        st->wx_pages += npages;
 246        WARN_ONCE(__supported_pte_mask & _PAGE_NX,
 247                  "x86/mm: Found insecure W+X mapping at address %pS\n",
 248                  (void *)st->start_address);
 249}
 250
 251static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
 252{
 253        struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
 254        pgprotval_t prot = val & PTE_FLAGS_MASK;
 255        pgprotval_t effective;
 256
 257        if (level > 0) {
 258                pgprotval_t higher_prot = st->prot_levels[level - 1];
 259
 260                effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) |
 261                            ((higher_prot | prot) & _PAGE_NX);
 262        } else {
 263                effective = prot;
 264        }
 265
 266        st->prot_levels[level] = effective;
 267}
 268
 269/*
 270 * This function gets called on a break in a continuous series
 271 * of PTE entries; the next one is different so we need to
 272 * print what we collected so far.
 273 */
 274static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
 275                      u64 val)
 276{
 277        struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
 278        pgprotval_t new_prot, new_eff;
 279        pgprotval_t cur, eff;
 280        static const char units[] = "BKMGTPE";
 281        struct seq_file *m = st->seq;
 282
 283        new_prot = val & PTE_FLAGS_MASK;
 284        if (!val)
 285                new_eff = 0;
 286        else
 287                new_eff = st->prot_levels[level];
 288
 289        /*
 290         * If we have a "break" in the series, we need to flush the state that
 291         * we have now. "break" is either changing perms, levels or
 292         * address space marker.
 293         */
 294        cur = st->current_prot;
 295        eff = st->effective_prot;
 296
 297        if (st->level == -1) {
 298                /* First entry */
 299                st->current_prot = new_prot;
 300                st->effective_prot = new_eff;
 301                st->level = level;
 302                st->marker = address_markers;
 303                st->lines = 0;
 304                pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 305                                   st->marker->name);
 306        } else if (new_prot != cur || new_eff != eff || level != st->level ||
 307                   addr >= st->marker[1].start_address) {
 308                const char *unit = units;
 309                unsigned long delta;
 310                int width = sizeof(unsigned long) * 2;
 311
 312                if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
 313                        note_wx(st, addr);
 314
 315                /*
 316                 * Now print the actual finished series
 317                 */
 318                if (!st->marker->max_lines ||
 319                    st->lines < st->marker->max_lines) {
 320                        pt_dump_seq_printf(m, st->to_dmesg,
 321                                           "0x%0*lx-0x%0*lx   ",
 322                                           width, st->start_address,
 323                                           width, addr);
 324
 325                        delta = addr - st->start_address;
 326                        while (!(delta & 1023) && unit[1]) {
 327                                delta >>= 10;
 328                                unit++;
 329                        }
 330                        pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
 331                                            delta, *unit);
 332                        printk_prot(m, st->current_prot, st->level,
 333                                    st->to_dmesg);
 334                }
 335                st->lines++;
 336
 337                /*
 338                 * We print markers for special areas of address space,
 339                 * such as the start of vmalloc space etc.
 340                 * This helps in the interpretation.
 341                 */
 342                if (addr >= st->marker[1].start_address) {
 343                        if (st->marker->max_lines &&
 344                            st->lines > st->marker->max_lines) {
 345                                unsigned long nskip =
 346                                        st->lines - st->marker->max_lines;
 347                                pt_dump_seq_printf(m, st->to_dmesg,
 348                                                   "... %lu entr%s skipped ... \n",
 349                                                   nskip,
 350                                                   nskip == 1 ? "y" : "ies");
 351                        }
 352                        st->marker++;
 353                        st->lines = 0;
 354                        pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 355                                           st->marker->name);
 356                }
 357
 358                st->start_address = addr;
 359                st->current_prot = new_prot;
 360                st->effective_prot = new_eff;
 361                st->level = level;
 362        }
 363}
 364
 365static void ptdump_walk_pgd_level_core(struct seq_file *m,
 366                                       struct mm_struct *mm, pgd_t *pgd,
 367                                       bool checkwx, bool dmesg)
 368{
 369        const struct ptdump_range ptdump_ranges[] = {
 370#ifdef CONFIG_X86_64
 371        {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2},
 372        {GUARD_HOLE_END_ADDR, ~0UL},
 373#else
 374        {0, ~0UL},
 375#endif
 376        {0, 0}
 377};
 378
 379        struct pg_state st = {
 380                .ptdump = {
 381                        .note_page      = note_page,
 382                        .effective_prot = effective_prot,
 383                        .range          = ptdump_ranges
 384                },
 385                .level = -1,
 386                .to_dmesg       = dmesg,
 387                .check_wx       = checkwx,
 388                .seq            = m
 389        };
 390
 391        ptdump_walk_pgd(&st.ptdump, mm, pgd);
 392
 393        if (!checkwx)
 394                return;
 395        if (st.wx_pages)
 396                pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
 397                        st.wx_pages);
 398        else
 399                pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
 400}
 401
 402void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
 403{
 404        ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true);
 405}
 406
 407void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
 408                                   bool user)
 409{
 410        pgd_t *pgd = mm->pgd;
 411#ifdef CONFIG_PAGE_TABLE_ISOLATION
 412        if (user && boot_cpu_has(X86_FEATURE_PTI))
 413                pgd = kernel_to_user_pgdp(pgd);
 414#endif
 415        ptdump_walk_pgd_level_core(m, mm, pgd, false, false);
 416}
 417EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
 418
 419void ptdump_walk_user_pgd_level_checkwx(void)
 420{
 421#ifdef CONFIG_PAGE_TABLE_ISOLATION
 422        pgd_t *pgd = INIT_PGD;
 423
 424        if (!(__supported_pte_mask & _PAGE_NX) ||
 425            !boot_cpu_has(X86_FEATURE_PTI))
 426                return;
 427
 428        pr_info("x86/mm: Checking user space page tables\n");
 429        pgd = kernel_to_user_pgdp(pgd);
 430        ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false);
 431#endif
 432}
 433
 434void ptdump_walk_pgd_level_checkwx(void)
 435{
 436        ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
 437}
 438
 439static int __init pt_dump_init(void)
 440{
 441        /*
 442         * Various markers are not compile-time constants, so assign them
 443         * here.
 444         */
 445#ifdef CONFIG_X86_64
 446        address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
 447        address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
 448        address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
 449#ifdef CONFIG_MODIFY_LDT_SYSCALL
 450        address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
 451#endif
 452#ifdef CONFIG_KASAN
 453        address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
 454        address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
 455#endif
 456#endif
 457#ifdef CONFIG_X86_32
 458        address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
 459        address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
 460# ifdef CONFIG_HIGHMEM
 461        address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
 462# endif
 463        address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
 464        address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
 465# ifdef CONFIG_MODIFY_LDT_SYSCALL
 466        address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
 467# endif
 468#endif
 469        return 0;
 470}
 471__initcall(pt_dump_init);
 472