linux/arch/x86/mm/pti.c
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * This code is based in part on work published here:
 *
 *      https://github.com/IAIK/KAISER
 *
 * The original work was written and signed off for the Linux
 * kernel by:
 *
 *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
 *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
 *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
 *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
 *
 * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
 * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
 *                     Andy Lutomirski <luto@amacapital.net>
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#include <asm/cpufeature.h>
#include <asm/hypervisor.h>
#include <asm/vsyscall.h>
#include <asm/cmdline.h>
#include <asm/pti.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>

#undef pr_fmt
#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt

/* Backporting helper */
#ifndef __GFP_NOTRACK
#define __GFP_NOTRACK   0
#endif

static void __init pti_print_if_insecure(const char *reason)
{
        if (boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
                pr_info("%s\n", reason);
}

static void __init pti_print_if_secure(const char *reason)
{
        if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
                pr_info("%s\n", reason);
}

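/*
 * Decide at boot time whether PTI is used.  "pti=off" and "nopti" disable
 * it, "pti=on" force-enables it, and "pti=auto" (or no option at all)
 * enables it only on CPUs affected by Meltdown.  PTI is always disabled
 * on XEN PV guests.
 */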
void __init pti_check_boottime_disable(void)
{
        char arg[5];
        int ret;

        if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
                pti_print_if_insecure("disabled on XEN PV.");
                return;
        }

        ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
        if (ret > 0)  {
                if (ret == 3 && !strncmp(arg, "off", 3)) {
                        pti_print_if_insecure("disabled on command line.");
                        return;
                }
                if (ret == 2 && !strncmp(arg, "on", 2)) {
                        pti_print_if_secure("force enabled on command line.");
                        goto enable;
                }
                if (ret == 4 && !strncmp(arg, "auto", 4))
                        goto autosel;
        }

        if (cmdline_find_option_bool(boot_command_line, "nopti")) {
                pti_print_if_insecure("disabled on command line.");
                return;
        }

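        /*
         * Either "pti=auto" was given or no option was specified at all:
         * enable PTI only if the CPU is affected by Meltdown.
         */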
autosel:
        if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
                return;
enable:
        setup_force_cpu_cap(X86_FEATURE_PTI);
}

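/*
 * Mirror a PGD entry into the user page tables.  Returns the value that
 * should be installed in the kernel page tables, possibly with NX added
 * as a hardening measure (see below).
 */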
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
        /*
         * Changes to the high (kernel) portion of the kernelmode page
         * tables are not automatically propagated to the usermode tables.
         *
         * Users should keep in mind that, unlike the kernelmode tables,
         * there is no vmalloc_fault equivalent for the usermode tables.
         * Top-level entries added to init_mm's usermode pgd after boot
         * will not be automatically propagated to other mms.
         */
        if (!pgdp_maps_userspace(pgdp))
                return pgd;

        /*
         * The user page tables get the full PGD, accessible from
         * userspace:
         */
        kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;

        /*
         * If this is normal user memory, make it NX in the kernel
         * pagetables so that, if we somehow screw up and return to
         * usermode with the kernel CR3 loaded, we'll get a page fault
         * instead of allowing user code to execute with the wrong CR3.
         *
         * As exceptions, we don't set NX if:
         *  - _PAGE_USER is not set.  This could be an executable
         *     EFI runtime mapping or something similar, and the kernel
         *     may execute from it
         *  - we don't have NX support
         *  - we're clearing the PGD (i.e. the new pgd is not present).
         */
        if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
            (__supported_pte_mask & _PAGE_NX))
                pgd.pgd |= _PAGE_NX;

        /* return the copy of the PGD we want the kernel to use: */
        return pgd;
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a P4D on success, or NULL on failure.
 */
static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
{
        pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
        gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

        if (address < PAGE_OFFSET) {
                WARN_ONCE(1, "attempt to walk user address\n");
                return NULL;
        }

        if (pgd_none(*pgd)) {
                unsigned long new_p4d_page = __get_free_page(gfp);
                if (!new_p4d_page)
                        return NULL;

                set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
        }
        BUILD_BUG_ON(pgd_large(*pgd) != 0);

        return p4d_offset(pgd, address);
}

/*
 * Walk the user copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
{
        gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
        p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
        pud_t *pud;

        BUILD_BUG_ON(p4d_large(*p4d) != 0);
        if (p4d_none(*p4d)) {
                unsigned long new_pud_page = __get_free_page(gfp);
                if (!new_pud_page)
                        return NULL;

                set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
        }

        pud = pud_offset(p4d, address);
        /* The user page tables do not use large mappings: */
        if (pud_large(*pud)) {
                WARN_ON(1);
                return NULL;
        }
        if (pud_none(*pud)) {
                unsigned long new_pmd_page = __get_free_page(gfp);
                if (!new_pmd_page)
                        return NULL;

                set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
        }

        return pmd_offset(pud, address);
}

#ifdef CONFIG_X86_VSYSCALL_EMULATION
/*
 * Walk the shadow copy of the page tables (optionally) trying to allocate
 * page table pages on the way down.  Does not support large pages.
 *
 * Note: this is only used when mapping *new* kernel data into the
 * user/shadow page tables.  It is never used for userspace data.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
{
        gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
        pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
        pte_t *pte;

        /* We can't do anything sensible if we hit a large mapping. */
        if (pmd_large(*pmd)) {
                WARN_ON(1);
                return NULL;
        }

        if (pmd_none(*pmd)) {
                unsigned long new_pte_page = __get_free_page(gfp);
                if (!new_pte_page)
                        return NULL;

                set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
        }

        pte = pte_offset_kernel(pmd, address);
        if (pte_flags(*pte) & _PAGE_USER) {
                WARN_ONCE(1, "attempt to walk to user pte\n");
                return NULL;
        }
        return pte;
}

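/*
 * Make the legacy vsyscall page reachable from the user page tables:
 * copy its PTE and let set_vsyscall_pgtable_user_bits() mark the page
 * table entries leading to it as user-accessible.
 */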
static void __init pti_setup_vsyscall(void)
{
        pte_t *pte, *target_pte;
        unsigned int level;

        pte = lookup_address(VSYSCALL_ADDR, &level);
        if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
                return;

        target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
        if (WARN_ON(!target_pte))
                return;

        *target_pte = *pte;
        set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
}
#else
static void __init pti_setup_vsyscall(void) { }
#endif

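/*
 * Clone the kernel mappings covering start..end into the user page
 * tables at PMD granularity, removing the page table bits given in
 * @clear from the user copies.
 */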
static void __init
pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
{
        unsigned long addr;

        /*
         * Clone the populated PMDs which cover start to end. These PMD areas
         * can have holes.
         */
        for (addr = start; addr < end; addr += PMD_SIZE) {
                pmd_t *pmd, *target_pmd;
                pgd_t *pgd;
                p4d_t *p4d;
                pud_t *pud;

                pgd = pgd_offset_k(addr);
                if (WARN_ON(pgd_none(*pgd)))
                        return;
                p4d = p4d_offset(pgd, addr);
                if (WARN_ON(p4d_none(*p4d)))
                        return;
                pud = pud_offset(p4d, addr);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, addr);
                if (pmd_none(*pmd))
                        continue;

                target_pmd = pti_user_pagetable_walk_pmd(addr);
                if (WARN_ON(!target_pmd))
                        return;

                /*
                 * Copy the PMD.  That is, the kernelmode and usermode
                 * tables will share the last-level page tables of this
                 * address range
                 */
                *target_pmd = pmd_clear_flags(*pmd, clear);
        }
}

/*
 * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
 * next-level entry on 5-level systems).
 */
static void __init pti_clone_p4d(unsigned long addr)
{
        p4d_t *kernel_p4d, *user_p4d;
        pgd_t *kernel_pgd;

        user_p4d = pti_user_pagetable_walk_p4d(addr);
        kernel_pgd = pgd_offset_k(addr);
        kernel_p4d = p4d_offset(kernel_pgd, addr);
        *user_p4d = *kernel_p4d;
}

/*
 * Clone the CPU_ENTRY_AREA into the user space visible page table.
 */
static void __init pti_clone_user_shared(void)
{
        pti_clone_p4d(CPU_ENTRY_AREA_BASE);
}

/*
 * Clone the ESPFIX P4D into the user space visible page table
 */
static void __init pti_setup_espfix64(void)
{
#ifdef CONFIG_X86_ESPFIX64
        pti_clone_p4d(ESPFIX_BASE_ADDR);
#endif
}

/*
 * Clone the populated PMDs of the entry and irqentry text and force it RO.
 */
static void __init pti_clone_entry_text(void)
{
        pti_clone_pmds((unsigned long) __entry_text_start,
                        (unsigned long) __irqentry_text_end,
                       _PAGE_RW | _PAGE_GLOBAL);
}

/*
 * Initialize kernel page table isolation
 */
void __init pti_init(void)
{
        if (!static_cpu_has(X86_FEATURE_PTI))
                return;

        pr_info("enabled\n");

        pti_clone_user_shared();
        pti_clone_entry_text();
        pti_setup_espfix64();
        pti_setup_vsyscall();
}