linux/arch/x86/mm/dump_pagetables.c
<<
>>
Prefs
   1/*
   2 * Debug helper to dump the current kernel pagetables of the system
   3 * so that we can see what the various memory ranges are set to.
   4 *
   5 * (C) Copyright 2008 Intel Corporation
   6 *
   7 * Author: Arjan van de Ven <arjan@linux.intel.com>
   8 *
   9 * This program is free software; you can redistribute it and/or
  10 * modify it under the terms of the GNU General Public License
  11 * as published by the Free Software Foundation; version 2
  12 * of the License.
  13 */
  14
  15#include <linux/debugfs.h>
  16#include <linux/kasan.h>
  17#include <linux/mm.h>
  18#include <linux/init.h>
  19#include <linux/sched.h>
  20#include <linux/seq_file.h>
  21
  22#include <asm/pgtable.h>
  23
  24/*
  25 * The dumper groups pagetable entries of the same type into one, and for
  26 * that it needs to keep some state when walking, and flush this state
  27 * when a "break" in the continuity is found.
  28 */
  29struct pg_state {
  30        int level;
  31        pgprot_t current_prot;
  32        unsigned long start_address;
  33        unsigned long current_address;
  34        const struct addr_marker *marker;
  35        unsigned long lines;
  36        bool to_dmesg;
  37        bool check_wx;
  38        unsigned long wx_pages;
  39};
  40
  /* One labelled boundary in the virtual address space. */
  struct addr_marker {
          /* First address that belongs to the labelled region. */
          unsigned long start_address;
          /* Text shown inside the "---[ ... ]---" region header. */
          const char *name;
          /* Upper bound on lines printed for the region; 0 = unlimited. */
          unsigned long max_lines;
  };
  46
  /*
   * Indices into address_markers[]. Each enumerator must have a matching
   * table entry at the same position and under the same #ifdef, because
   * pt_dump_init() uses these values to patch start addresses into the
   * table.
   */
  enum address_markers_idx {
          USER_SPACE_NR = 0,
  #ifdef CONFIG_X86_64
          KERNEL_SPACE_NR,
          LOW_KERNEL_NR,
          VMALLOC_START_NR,
          VMEMMAP_START_NR,
  # ifdef CONFIG_KASAN
          KASAN_SHADOW_START_NR,
          KASAN_SHADOW_END_NR,
  # endif
  # ifdef CONFIG_X86_ESPFIX64
          ESPFIX_START_NR,
  # endif
  # ifdef CONFIG_EFI
          /*
           * The table has an "EFI Runtime Services" slot here; without this
           * enumerator every index below would be off by one on CONFIG_EFI.
           */
          EFI_END_NR,
  # endif
          HIGH_KERNEL_NR,
          MODULES_VADDR_NR,
          MODULES_END_NR,
  #else
          KERNEL_SPACE_NR,
          VMALLOC_START_NR,
          VMALLOC_END_NR,
  # ifdef CONFIG_HIGHMEM
          PKMAP_BASE_NR,
  # endif
          FIXADDR_START_NR,
  #endif
  };
  75
  76/* Address space markers hints */
  77static struct addr_marker address_markers[] = {
  78        { 0, "User Space" },
  79#ifdef CONFIG_X86_64
  80        { 0x8000000000000000UL, "Kernel Space" },
  81        { 0/* PAGE_OFFSET */,   "Low Kernel Mapping" },
  82        { 0/* VMALLOC_START */, "vmalloc() Area" },
  83        { 0/* VMEMMAP_START */, "Vmemmap" },
  84#ifdef CONFIG_KASAN
  85        { KASAN_SHADOW_START,   "KASAN shadow" },
  86        { KASAN_SHADOW_END,     "KASAN shadow end" },
  87#endif
  88# ifdef CONFIG_X86_ESPFIX64
  89        { ESPFIX_BASE_ADDR,     "ESPfix Area", 16 },
  90# endif
  91# ifdef CONFIG_EFI
  92        { EFI_VA_END,           "EFI Runtime Services" },
  93# endif
  94        { __START_KERNEL_map,   "High Kernel Mapping" },
  95        { MODULES_VADDR,        "Modules" },
  96        { MODULES_END,          "End Modules" },
  97#else
  98        { PAGE_OFFSET,          "Kernel Mapping" },
  99        { 0/* VMALLOC_START */, "vmalloc() Area" },
 100        { 0/*VMALLOC_END*/,     "vmalloc() End" },
 101# ifdef CONFIG_HIGHMEM
 102        { 0/*PKMAP_BASE*/,      "Persistent kmap() Area" },
 103# endif
 104        { 0/*FIXADDR_START*/,   "Fixmap Area" },
 105#endif
 106        { -1, NULL }            /* End of list */
 107};
 108
 /*
  * Amount of virtual address space spanned by a single entry at each
  * pagetable level, listed from the top level down. Every level covers a
  * full table of the level beneath it.
  */
 #define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
 #define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
 #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
 #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
 #define PTE_LEVEL_MULT (PAGE_SIZE)
 115
 /*
  * Start an output line: at KERN_INFO in the kernel log when @to_dmesg is
  * set, otherwise in seq_file @m if one was supplied. With neither
  * destination the text is dropped.
  */
 #define pt_dump_seq_printf(m, to_dmesg, fmt, args...)           \
 ({                                                              \
         if (to_dmesg) {                                         \
                 printk(KERN_INFO fmt, ##args);                  \
         } else if (m) {                                         \
                 seq_printf(m, fmt, ##args);                     \
         }                                                       \
 })
 124
 /*
  * Continue the output line opened by pt_dump_seq_printf(): KERN_CONT in
  * the kernel log when @to_dmesg is set, otherwise appended to seq_file @m
  * if one was supplied.
  */
 #define pt_dump_cont_printf(m, to_dmesg, fmt, args...)          \
 ({                                                              \
         if (to_dmesg) {                                         \
                 printk(KERN_CONT fmt, ##args);                  \
         } else if (m) {                                         \
                 seq_printf(m, fmt, ##args);                     \
         }                                                       \
 })
 133
 134/*
 135 * Print a readable form of a pgprot_t to the seq_file
 136 */
 137static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
 138{
 139        pgprotval_t pr = pgprot_val(prot);
 140        static const char * const level_name[] =
 141                { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
 142
 143        if (!pgprot_val(prot)) {
 144                /* Not present */
 145                pt_dump_cont_printf(m, dmsg, "                              ");
 146        } else {
 147                if (pr & _PAGE_USER)
 148                        pt_dump_cont_printf(m, dmsg, "USR ");
 149                else
 150                        pt_dump_cont_printf(m, dmsg, "    ");
 151                if (pr & _PAGE_RW)
 152                        pt_dump_cont_printf(m, dmsg, "RW ");
 153                else
 154                        pt_dump_cont_printf(m, dmsg, "ro ");
 155                if (pr & _PAGE_PWT)
 156                        pt_dump_cont_printf(m, dmsg, "PWT ");
 157                else
 158                        pt_dump_cont_printf(m, dmsg, "    ");
 159                if (pr & _PAGE_PCD)
 160                        pt_dump_cont_printf(m, dmsg, "PCD ");
 161                else
 162                        pt_dump_cont_printf(m, dmsg, "    ");
 163
 164                /* Bit 7 has a different meaning on level 3 vs 4 */
 165                if (level <= 4 && pr & _PAGE_PSE)
 166                        pt_dump_cont_printf(m, dmsg, "PSE ");
 167                else
 168                        pt_dump_cont_printf(m, dmsg, "    ");
 169                if ((level == 5 && pr & _PAGE_PAT) ||
 170                    ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
 171                        pt_dump_cont_printf(m, dmsg, "PAT ");
 172                else
 173                        pt_dump_cont_printf(m, dmsg, "    ");
 174                if (pr & _PAGE_GLOBAL)
 175                        pt_dump_cont_printf(m, dmsg, "GLB ");
 176                else
 177                        pt_dump_cont_printf(m, dmsg, "    ");
 178                if (pr & _PAGE_NX)
 179                        pt_dump_cont_printf(m, dmsg, "NX ");
 180                else
 181                        pt_dump_cont_printf(m, dmsg, "x  ");
 182        }
 183        pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
 184}
 185
 186/*
 187 * On 64 bits, sign-extend the 48 bit address to 64 bit
 188 */
 189static unsigned long normalize_addr(unsigned long u)
 190{
 191        int shift;
 192        if (!IS_ENABLED(CONFIG_X86_64))
 193                return u;
 194
 195        shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
 196        return (signed long)(u << shift) >> shift;
 197}
 198
 199/*
 200 * This function gets called on a break in a continuous series
 201 * of PTE entries; the next one is different so we need to
 202 * print what we collected so far.
 203 */
 204static void note_page(struct seq_file *m, struct pg_state *st,
 205                      pgprot_t new_prot, int level)
 206{
 207        pgprotval_t prot, cur;
 208        static const char units[] = "BKMGTPE";
 209
 210        /*
 211         * If we have a "break" in the series, we need to flush the state that
 212         * we have now. "break" is either changing perms, levels or
 213         * address space marker.
 214         */
 215        prot = pgprot_val(new_prot);
 216        cur = pgprot_val(st->current_prot);
 217
 218        if (!st->level) {
 219                /* First entry */
 220                st->current_prot = new_prot;
 221                st->level = level;
 222                st->marker = address_markers;
 223                st->lines = 0;
 224                pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 225                                   st->marker->name);
 226        } else if (prot != cur || level != st->level ||
 227                   st->current_address >= st->marker[1].start_address) {
 228                const char *unit = units;
 229                unsigned long delta;
 230                int width = sizeof(unsigned long) * 2;
 231                pgprotval_t pr = pgprot_val(st->current_prot);
 232
 233                if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
 234                        WARN_ONCE(1,
 235                                  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
 236                                  (void *)st->start_address,
 237                                  (void *)st->start_address);
 238                        st->wx_pages += (st->current_address -
 239                                         st->start_address) / PAGE_SIZE;
 240                }
 241
 242                /*
 243                 * Now print the actual finished series
 244                 */
 245                if (!st->marker->max_lines ||
 246                    st->lines < st->marker->max_lines) {
 247                        pt_dump_seq_printf(m, st->to_dmesg,
 248                                           "0x%0*lx-0x%0*lx   ",
 249                                           width, st->start_address,
 250                                           width, st->current_address);
 251
 252                        delta = st->current_address - st->start_address;
 253                        while (!(delta & 1023) && unit[1]) {
 254                                delta >>= 10;
 255                                unit++;
 256                        }
 257                        pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
 258                                            delta, *unit);
 259                        printk_prot(m, st->current_prot, st->level,
 260                                    st->to_dmesg);
 261                }
 262                st->lines++;
 263
 264                /*
 265                 * We print markers for special areas of address space,
 266                 * such as the start of vmalloc space etc.
 267                 * This helps in the interpretation.
 268                 */
 269                if (st->current_address >= st->marker[1].start_address) {
 270                        if (st->marker->max_lines &&
 271                            st->lines > st->marker->max_lines) {
 272                                unsigned long nskip =
 273                                        st->lines - st->marker->max_lines;
 274                                pt_dump_seq_printf(m, st->to_dmesg,
 275                                                   "... %lu entr%s skipped ... \n",
 276                                                   nskip,
 277                                                   nskip == 1 ? "y" : "ies");
 278                        }
 279                        st->marker++;
 280                        st->lines = 0;
 281                        pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
 282                                           st->marker->name);
 283                }
 284
 285                st->start_address = st->current_address;
 286                st->current_prot = new_prot;
 287                st->level = level;
 288        }
 289}
 290
 291static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
 292{
 293        int i;
 294        pte_t *start;
 295        pgprotval_t prot;
 296
 297        start = (pte_t *)pmd_page_vaddr(addr);
 298        for (i = 0; i < PTRS_PER_PTE; i++) {
 299                prot = pte_flags(*start);
 300                st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
 301                note_page(m, st, __pgprot(prot), 5);
 302                start++;
 303        }
 304}
 /*
  * Shortcut for KASAN=y. All kasan page tables eventually point to the
  * kasan_zero_page, so when @pt is one of the shared kasan_zero_* tables
  * the entry is noted right away with the flags of kasan_zero_pte[0]
  * and the lower levels are not walked. This saves dozens of seconds
  * (minutes for a 5-level config) when checking for W+X mappings or reading
  * the kernel_page_tables debugfs file.
  *
  * Returns true if the entry was handled here, false if the caller still
  * has to walk the table. Always false without CONFIG_KASAN.
  */
 static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
                                 void *pt)
 {
 #ifdef CONFIG_KASAN
         if (__pa(pt) != __pa(kasan_zero_pmd) &&
 # ifdef CONFIG_X86_5LEVEL
             __pa(pt) != __pa(kasan_zero_p4d) &&
 # endif
             __pa(pt) != __pa(kasan_zero_pud))
                 return false;

         note_page(m, st, __pgprot(pte_flags(kasan_zero_pte[0])), 5);
         return true;
 #else
         return false;
 #endif
 }
 335
 #if PTRS_PER_PMD > 1

 /*
  * Walk the PMD table referenced by @addr; its first entry maps @P.
  * Empty, large and non-present entries are noted directly at level 4.
  * Entries that point to a PTE table are descended into, unless
  * kasan_page_table() already handled the table being walked.
  */
 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
 {
         pmd_t *const table = (pmd_t *)pud_page_vaddr(addr);
         pmd_t *pmd = table;
         int idx;

         for (idx = 0; idx < PTRS_PER_PMD; idx++, pmd++) {
                 const unsigned long va = P + idx * PMD_LEVEL_MULT;

                 st->current_address = normalize_addr(va);

                 if (pmd_none(*pmd)) {
                         note_page(m, st, __pgprot(0), 4);
                         continue;
                 }

                 if (pmd_large(*pmd) || !pmd_present(*pmd)) {
                         note_page(m, st, __pgprot(pmd_flags(*pmd)), 4);
                         continue;
                 }

                 if (!kasan_page_table(m, st, table))
                         walk_pte_level(m, st, *pmd, va);
         }
 }

 #else
 /* Only one PMD per PUD: treat the PUD entry as if it were a PMD entry. */
 #define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
 #define pud_large(a) pmd_large(__pmd(pud_val(a)))
 #define pud_none(a)  pmd_none(__pmd(pud_val(a)))
 #endif
 366
 #if PTRS_PER_PUD > 1

 /*
  * Walk the PUD table referenced by @addr; its first entry maps @P.
  * Empty, large and non-present entries are noted directly at level 3.
  * Entries that point to a PMD table are descended into, unless
  * kasan_page_table() already handled the table being walked.
  */
 static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
 {
         int i;
         pud_t *start, *pud_start;
         pgprotval_t prot;

         /* pud_start keeps the table base for the KASAN shortcut check. */
         pud_start = start = (pud_t *)p4d_page_vaddr(addr);

         for (i = 0; i < PTRS_PER_PUD; i++) {
                 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
                 if (!pud_none(*start)) {
                         if (pud_large(*start) || !pud_present(*start)) {
                                 prot = pud_flags(*start);
                                 note_page(m, st, __pgprot(prot), 3);
                         } else if (!kasan_page_table(m, st, pud_start)) {
                                 walk_pmd_level(m, st, *start,
                                                P + i * PUD_LEVEL_MULT);
                         }
                 } else
                         note_page(m, st, __pgprot(0), 3);

                 start++;
         }
 }

 #else
 /* Only one PUD per P4D: treat the P4D entry as if it were a PUD entry. */
 #define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
 #define p4d_large(a) pud_large(__pud(p4d_val(a)))
 #define p4d_none(a)  pud_none(__pud(p4d_val(a)))
 #endif
 401
 #if PTRS_PER_P4D > 1

 /*
  * Walk the P4D table referenced by @addr; its first entry maps @P.
  * Empty, large and non-present entries are noted directly at level 2.
  * Entries that point to a PUD table are descended into, unless
  * kasan_page_table() already handled the table being walked.
  */
 static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
 {
         p4d_t *const table = (p4d_t *)pgd_page_vaddr(addr);
         p4d_t *p4d = table;
         int idx;

         for (idx = 0; idx < PTRS_PER_P4D; idx++, p4d++) {
                 const unsigned long va = P + idx * P4D_LEVEL_MULT;

                 st->current_address = normalize_addr(va);

                 if (p4d_none(*p4d)) {
                         note_page(m, st, __pgprot(0), 2);
                         continue;
                 }

                 if (p4d_large(*p4d) || !p4d_present(*p4d)) {
                         note_page(m, st, __pgprot(p4d_flags(*p4d)), 2);
                         continue;
                 }

                 if (!kasan_page_table(m, st, table))
                         walk_pud_level(m, st, *p4d, va);
         }
 }

 #else
 /* Only one P4D per PGD: treat the PGD entry as if it were a P4D entry. */
 #define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
 #define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
 #define pgd_none(a)  p4d_none(__p4d(pgd_val(a)))
 #endif
 434
 /*
  * Tell whether top-level slot @idx is one of the 16 PGD slots directly
  * below the slot that holds __PAGE_OFFSET. Always false without
  * CONFIG_X86_64.
  */
 static inline bool is_hypervisor_range(int idx)
 {
 #ifdef CONFIG_X86_64
         /*
          * ffff800000000000 - ffff87ffffffffff is reserved for
          * the hypervisor.
          */
         if (idx >= pgd_index(__PAGE_OFFSET))
                 return false;

         return idx >= pgd_index(__PAGE_OFFSET) - 16;
 #else
         return false;
 #endif
 }
 448
 449static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 450                                       bool checkwx)
 451{
 452#ifdef CONFIG_X86_64
 453        pgd_t *start = (pgd_t *) &init_top_pgt;
 454#else
 455        pgd_t *start = swapper_pg_dir;
 456#endif
 457        pgprotval_t prot;
 458        int i;
 459        struct pg_state st = {};
 460
 461        if (pgd) {
 462                start = pgd;
 463                st.to_dmesg = true;
 464        }
 465
 466        st.check_wx = checkwx;
 467        if (checkwx)
 468                st.wx_pages = 0;
 469
 470        for (i = 0; i < PTRS_PER_PGD; i++) {
 471                st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
 472                if (!pgd_none(*start) && !is_hypervisor_range(i)) {
 473                        if (pgd_large(*start) || !pgd_present(*start)) {
 474                                prot = pgd_flags(*start);
 475                                note_page(m, &st, __pgprot(prot), 1);
 476                        } else {
 477                                walk_p4d_level(m, &st, *start,
 478                                               i * PGD_LEVEL_MULT);
 479                        }
 480                } else
 481                        note_page(m, &st, __pgprot(0), 1);
 482
 483                cond_resched();
 484                start++;
 485        }
 486
 487        /* Flush out the last page */
 488        st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
 489        note_page(m, &st, __pgprot(0), 0);
 490        if (!checkwx)
 491                return;
 492        if (st.wx_pages)
 493                pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
 494                        st.wx_pages);
 495        else
 496                pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
 497}
 498
 499void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 500{
 501        ptdump_walk_pgd_level_core(m, pgd, false);
 502}
 503EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
 504
 /*
  * Run the W+X check over the default top-level pagetable. No seq_file
  * and no explicit pgd are passed, so only the warning and the summary
  * reach the kernel log.
  */
 void ptdump_walk_pgd_level_checkwx(void)
 {
         const bool check_wx = true;

         ptdump_walk_pgd_level_core(NULL, NULL, check_wx);
 }
 509
 510static int __init pt_dump_init(void)
 511{
 512        /*
 513         * Various markers are not compile-time constants, so assign them
 514         * here.
 515         */
 516#ifdef CONFIG_X86_64
 517        address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
 518        address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
 519        address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
 520#endif
 521#ifdef CONFIG_X86_32
 522        address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
 523        address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
 524# ifdef CONFIG_HIGHMEM
 525        address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
 526# endif
 527        address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
 528#endif
 529
 530        return 0;
 531}
 532__initcall(pt_dump_init);
 533