linux/arch/x86/kernel/head_64.S
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
 *  Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
 *  Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
 */


#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
#include <asm/segment.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
#include <asm/processor-flags.h>
#include <asm/percpu.h>
#include <asm/nops.h>
#include "../entry/calling.h"
#include <asm/export.h>
#include <asm/nospec-branch.h>

#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
#else
#define GET_CR2_INTO(reg) movq %cr2, reg
#define INTERRUPT_RETURN iretq
#endif

/*
 * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
 * because we need identity-mapped pages.
 */

#define pud_index(x)    (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))

#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
#endif
L3_START_KERNEL = pud_index(__START_KERNEL_map)
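/*
 * With __START_KERNEL_map == 0xffffffff80000000 (the usual value),
 * pud_index() yields 510 here; see the matching "= 510" note at
 * level3_kernel_pgt below.
 */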

        .text
        __HEAD
        .code64
        .globl startup_64
startup_64:
        UNWIND_HINT_EMPTY
        /*
         * At this point the CPU runs in 64bit mode, CS.L = 1, CS.D = 0,
         * and someone has loaded an identity mapped page table
         * for us.  These identity mapped page tables map all of the
         * kernel pages and possibly all of memory.
         *
         * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either directly from a 64bit bootloader, or from
         * arch/x86/boot/compressed/head_64.S.
         *
         * We only come here initially at boot; nothing else comes here.
         *
         * Since we may be loaded at an address different from what we were
         * compiled to run at, we first fix up the physical addresses in our
         * page tables and then reload them.
         */

        /* Set up the stack for verify_cpu(), similar to initial_stack below */
        leaq    (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp

        /* Sanitize CPU configuration */
        call verify_cpu

        /*
         * Perform pagetable fixups. Additionally, if SME is active, encrypt
         * the kernel and retrieve the modifier (the SME encryption mask) to
         * be added to the initial pgdir entry that will be programmed into
         * CR3.
         */
        leaq    _text(%rip), %rdi
        pushq   %rsi
        call    __startup_64
        popq    %rsi

        /* Form the CR3 value being sure to include the CR3 modifier */
        addq    $(early_top_pgt - __START_KERNEL_map), %rax
        jmp 1f
ENTRY(secondary_startup_64)
        UNWIND_HINT_EMPTY
        /*
         * At this point the CPU runs in 64bit mode, CS.L = 1, CS.D = 0,
         * and someone has loaded a mapped page table.
         *
         * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either from startup_64 (using physical addresses)
         * or from trampoline.S (using virtual addresses).
         *
         * Using virtual addresses from trampoline.S removes the need
         * to have any identity mapped pages in the kernel page table
         * after the boot processor executes this code.
         */

        /* Sanitize CPU configuration */
        call verify_cpu

        /*
         * Retrieve the modifier (SME encryption mask if SME is active) to be
         * added to the initial pgdir entry that will be programmed into CR3.
         */
        pushq   %rsi
        call    __startup_secondary_64
        popq    %rsi

        /* Form the CR3 value being sure to include the CR3 modifier */
        addq    $(init_top_pgt - __START_KERNEL_map), %rax
1:

        /* Enable PAE mode, PGE and LA57 */
        movl    $(X86_CR4_PAE | X86_CR4_PGE), %ecx
#ifdef CONFIG_X86_5LEVEL
        orl     $X86_CR4_LA57, %ecx
#endif
        movq    %rcx, %cr4

        /* Setup early boot stage 4-/5-level pagetables. */
        addq    phys_base(%rip), %rax
        movq    %rax, %cr3

        /* Ensure I am executing from virtual addresses */
        movq    $1f, %rax
        ANNOTATE_RETPOLINE_SAFE
        jmp     *%rax
1:
        UNWIND_HINT_EMPTY

        /* Check if nx is implemented */
        movl    $0x80000001, %eax
        cpuid
        movl    %edx,%edi

        /* Setup EFER (Extended Feature Enable Register) */
        movl    $MSR_EFER, %ecx
        rdmsr
        btsl    $_EFER_SCE, %eax        /* Enable System Call */
        btl     $20,%edi                /* No Execute supported? */
        jnc     1f
        btsl    $_EFER_NX, %eax
        btsq    $_PAGE_BIT_NX,early_pmd_flags(%rip)
1:      wrmsr                           /* Make changes effective */

        /* Setup cr0 */
        movl    $CR0_STATE, %eax
        /* Make changes effective */
        movq    %rax, %cr0

        /* Setup a boot time stack */
        movq initial_stack(%rip), %rsp

        /* zero EFLAGS after setting rsp */
        pushq $0
        popfq

        /*
         * We must switch to a new descriptor in kernel space for the GDT
         * because soon the kernel won't have access anymore to the userspace
         * addresses where we're currently running.  We have to do that here
         * because in 32bit we couldn't load a 64bit linear address.
         */
        lgdt    early_gdt_descr(%rip)

        /* set up data segments */
        xorl %eax,%eax
        movl %eax,%ds
        movl %eax,%ss
        movl %eax,%es

        /*
         * We don't really need to load %fs or %gs, but load them anyway
         * to kill any stale realmode selectors.  This allows execution
         * under VT hardware.
         */
        movl %eax,%fs
        movl %eax,%gs

        /*
         * Set up %gs.
         *
         * The base of %gs always points to the bottom of the irqstack
         * union.  If the stack protector canary is enabled, it is
         * located at %gs:40.  Note that, on SMP, the boot cpu uses the
         * init data section until the per-cpu areas are set up.
         */
        movl    $MSR_GS_BASE,%ecx
        movl    initial_gs(%rip),%eax
        movl    initial_gs+4(%rip),%edx
        wrmsr

        /*
         * %rsi is a pointer to the real mode structure with interesting
         * info; pass it to C.
         */
        movq    %rsi, %rdi

.Ljump_to_C_code:
        /*
         * Jump to run C code and to be on a real kernel address.
         * Since we are running on identity-mapped space we have to jump
         * to the full 64bit address; this is only possible with an indirect
         * jump.  In addition we need to ensure %cs is set so we make this
         * a far return.
         *
         * Note: do not change to far jump indirect with 64bit offset.
         *
         * AMD does not support far jump indirect with 64bit offset.
         * AMD64 Architecture Programmer's Manual, Volume 3: states only
         *      JMP FAR mem16:16 FF /5 Far jump indirect,
         *              with the target specified by a far pointer in memory.
         *      JMP FAR mem16:32 FF /5 Far jump indirect,
         *              with the target specified by a far pointer in memory.
         *
         * Intel64 does support 64bit offset.
         * Software Developer Manual Vol 2: states:
         *      FF /5 JMP m16:16 Jump far, absolute indirect,
         *              address given in m16:16
         *      FF /5 JMP m16:32 Jump far, absolute indirect,
         *              address given in m16:32.
         *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
         *              address given in m16:64.
         */
        pushq   $.Lafter_lret   # put return address on stack for unwinder
        xorq    %rbp, %rbp      # clear frame pointer
        movq    initial_code(%rip), %rax
        pushq   $__KERNEL_CS    # set correct cs
        pushq   %rax            # target address in negative space
        lretq
.Lafter_lret:
END(secondary_startup_64)

#include "verify_cpu.S"

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
 * up already except the stack. We just set up the stack here. Then call
 * start_secondary() via .Ljump_to_C_code.
 */
ENTRY(start_cpu0)
        movq    initial_stack(%rip), %rsp
        UNWIND_HINT_EMPTY
        jmp     .Ljump_to_C_code
ENDPROC(start_cpu0)
#endif

        /* Both SMP bootup and ACPI suspend change these variables */
        __REFDATA
        .balign 8
        GLOBAL(initial_code)
        .quad   x86_64_start_kernel
        GLOBAL(initial_gs)
        .quad   INIT_PER_CPU_VAR(irq_stack_union)
        GLOBAL(initial_stack)
        /*
         * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
         * unwinder reliably detect the end of the stack.
         */
        .quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
        __FINITDATA

        __INIT
ENTRY(early_idt_handler_array)
        i = 0
        .rept NUM_EXCEPTION_VECTORS
        .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
                UNWIND_HINT_IRET_REGS
                pushq $0        # Dummy error code, to make stack frame uniform
        .else
                UNWIND_HINT_IRET_REGS offset=8
        .endif
        pushq $i                # 72(%rsp) Vector number
        jmp early_idt_handler_common
        UNWIND_HINT_IRET_REGS
        i = i + 1
        .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
        .endr
        UNWIND_HINT_IRET_REGS offset=16
END(early_idt_handler_array)
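
/*
 * Each stub above is padded with 0xcc (int3) up to EARLY_IDT_HANDLER_SIZE
 * bytes, so the stub for vector N sits at
 * early_idt_handler_array + N * EARLY_IDT_HANDLER_SIZE.
 */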

early_idt_handler_common:
        /*
         * The stack is the hardware frame, an error code or zero, and the
         * vector number.
         */
        cld

        incl early_recursion_flag(%rip)

        /* The vector number is currently in the pt_regs->di slot. */
        pushq %rsi                              /* pt_regs->si */
        movq 8(%rsp), %rsi                      /* RSI = vector number */
        movq %rdi, 8(%rsp)                      /* pt_regs->di = RDI */
        pushq %rdx                              /* pt_regs->dx */
        pushq %rcx                              /* pt_regs->cx */
        pushq %rax                              /* pt_regs->ax */
        pushq %r8                               /* pt_regs->r8 */
        pushq %r9                               /* pt_regs->r9 */
        pushq %r10                              /* pt_regs->r10 */
        pushq %r11                              /* pt_regs->r11 */
        pushq %rbx                              /* pt_regs->bx */
        pushq %rbp                              /* pt_regs->bp */
        pushq %r12                              /* pt_regs->r12 */
        pushq %r13                              /* pt_regs->r13 */
        pushq %r14                              /* pt_regs->r14 */
        pushq %r15                              /* pt_regs->r15 */
        UNWIND_HINT_REGS

        cmpq $14,%rsi           /* Page fault? */
        jnz 10f
        GET_CR2_INTO(%rdi)      /* Can clobber any volatile register if pv */
        call early_make_pgtable
        andl %eax,%eax
        jz 20f                  /* All good */

10:
        movq %rsp,%rdi          /* RDI = pt_regs; RSI is already trapnr */
        call early_fixup_exception

20:
        decl early_recursion_flag(%rip)
        jmp restore_regs_and_return_to_kernel
END(early_idt_handler_common)

        __INITDATA

        .balign 4
GLOBAL(early_recursion_flag)
        .long 0

#define NEXT_PAGE(name) \
        .balign PAGE_SIZE; \
GLOBAL(name)

#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
 * Each PGD needs to be 8k long and 8k aligned.  We do not
 * ever go out to userspace with these, so we do not
 * strictly *need* the second page, but this allows us to
 * have a single set_pgd() implementation that does not
 * need to worry about whether it has 4k or 8k to work
 * with.
 *
 * This ensures PGDs are 8k long:
 */
#define PTI_USER_PGD_FILL       512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
        .balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define PTI_USER_PGD_FILL       0
#endif
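
/*
 * PTI_USER_PGD_FILL is consumed below as ".fill PTI_USER_PGD_FILL,8,0":
 * 512 entries of 8 bytes each is 4096 bytes, i.e. the second 4k page that
 * holds the user half of the PGD when page table isolation is enabled.
 */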

/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT)                        \
        i = 0 ;                                         \
        .rept (COUNT) ;                                 \
        .quad   (START) + (i << PMD_SHIFT) + (PERM) ;   \
        i = i + 1 ;                                     \
        .endr
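/*
 * For example, PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, 4) expands to four
 * .quad entries mapping the first four 2MB regions (0x000000, 0x200000,
 * 0x400000, 0x600000), each OR'ed with the given permission bits
 * (PMD_SHIFT is 21 for 2MB pages).
 */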

        __INITDATA
NEXT_PGD_PAGE(early_top_pgt)
        .fill   511,8,0
#ifdef CONFIG_X86_5LEVEL
        .quad   level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif
        .fill   PTI_USER_PGD_FILL,8,0

NEXT_PAGE(early_dynamic_pgts)
        .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0

        .data

#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PGD_PAGE(init_top_pgt)
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .org    init_top_pgt + PGD_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
        .fill   PTI_USER_PGD_FILL,8,0

NEXT_PAGE(level3_ident_pgt)
        .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .fill   511, 8, 0
NEXT_PAGE(level2_ident_pgt)
        /* Since I easily can, map the first 1G.
         * Don't set NX because code runs from these pages.
         */
        PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#else
NEXT_PGD_PAGE(init_top_pgt)
        .fill   512,8,0
        .fill   PTI_USER_PGD_FILL,8,0
#endif

#ifdef CONFIG_X86_5LEVEL
NEXT_PAGE(level4_kernel_pgt)
        .fill   511,8,0
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif

NEXT_PAGE(level3_kernel_pgt)
        .fill   L3_START_KERNEL,8,0
        /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
        .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
        .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC

NEXT_PAGE(level2_kernel_pgt)
        /*
         * 512 MB kernel mapping. We spend a full page on this pagetable
         * anyway.
         *
         * The kernel code+data+bss must not be bigger than that.
         *
         * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
         *  If you want to increase this then increase MODULES_VADDR
         *  too.)
         */
        PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
                KERNEL_IMAGE_SIZE/PMD_SIZE)
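        /*
         * E.g. with the (non-KASLR) default KERNEL_IMAGE_SIZE of 512 MB and
         * 2 MB PMD_SIZE, this expands to 512 MB / 2 MB = 256 PMD entries.
         */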

NEXT_PAGE(level2_fixmap_pgt)
        .fill   506,8,0
        .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
        /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
        .fill   5,8,0

NEXT_PAGE(level1_fixmap_pgt)
        .fill   512,8,0

#undef PMDS

        .data
        .align 16
        .globl early_gdt_descr
early_gdt_descr:
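        /* The GDT pseudo-descriptor limit is the table size in bytes minus one */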
        .word   GDT_ENTRIES*8-1
early_gdt_descr_base:
        .quad   INIT_PER_CPU_VAR(gdt_page)

ENTRY(phys_base)
        /* This must match the first entry in level2_kernel_pgt */
        .quad   0x0000000000000000
EXPORT_SYMBOL(phys_base)

#include "../../x86/xen/xen-head.S"

        __PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
        .skip PAGE_SIZE
EXPORT_SYMBOL(empty_zero_page)
