linux/arch/x86/kernel/machine_kexec_64.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */

#define pr_fmt(fmt)	"kexec: " fmt

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
#include <linux/vmalloc.h>
#include <linux/efi.h>

#include <asm/init.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io_apic.h>
#include <asm/debugreg.h>
#include <asm/kexec-bzimage64.h>
#include <asm/setup.h>
#include <asm/set_memory.h>

#ifdef CONFIG_ACPI
/*
 * Used while adding mappings for ACPI tables.
 * Can be reused when other iomem regions need to be mapped.
 */
struct init_pgtable_data {
	struct x86_mapping_info *info;
	pgd_t *level4p;
};

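/*
 * Identity-map one iomem resource into the kexec page table; used as a
 * walk_iomem_res_desc() callback.
 */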
static int mem_region_callback(struct resource *res, void *arg)
{
	struct init_pgtable_data *data = arg;
	unsigned long mstart, mend;

	mstart = res->start;
	mend = mstart + resource_size(res) - 1;

	return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
}

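/*
 * Map the ACPI table regions so the kexec'd kernel can still reach them.
 * walk_iomem_res_desc() returns -EINVAL when no matching resource exists,
 * which is not an error here.
 */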
static int
map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
{
	struct init_pgtable_data data;
	unsigned long flags;
	int ret;

	data.info = info;
	data.level4p = level4p;
	flags = IORESOURCE_MEM | IORESOURCE_BUSY;

	ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
				  &data, mem_region_callback);
	if (ret && ret != -EINVAL)
		return ret;

	/* ACPI tables could be located in the ACPI Non-volatile Storage region */
	ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
				  &data, mem_region_callback);
	if (ret && ret != -EINVAL)
		return ret;

	return 0;
}
#else
static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
#endif

#ifdef CONFIG_KEXEC_FILE
const struct kexec_file_ops * const kexec_file_loaders[] = {
		&kexec_bzImage64_ops,
		NULL
};
#endif

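/*
 * Map the EFI system table, found via boot_params, so the kexec'd kernel
 * can access it early in boot.
 */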
static int
map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
	unsigned long mstart, mend;

	if (!efi_enabled(EFI_BOOT))
		return 0;

	mstart = (boot_params.efi_info.efi_systab |
			((u64)boot_params.efi_info.efi_systab_hi << 32));

	if (!mstart)
		return 0;

	if (efi_enabled(EFI_64BIT))
		mend = mstart + sizeof(efi_system_table_64_t);
	else
		mend = mstart + sizeof(efi_system_table_32_t);

	return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
	return 0;
}

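/* Release the page-table pages allocated by init_transition_pgtable(). */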
static void free_transition_pgtable(struct kimage *image)
{
	free_page((unsigned long)image->arch.p4d);
	image->arch.p4d = NULL;
	free_page((unsigned long)image->arch.pud);
	image->arch.pud = NULL;
	free_page((unsigned long)image->arch.pmd);
	image->arch.pmd = NULL;
	free_page((unsigned long)image->arch.pte);
	image->arch.pte = NULL;
}

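/*
 * Add an entry to the identity page table that maps relocate_kernel's
 * kernel virtual address to the physical address of its copy in the
 * control page, so execution can continue at the same virtual address
 * immediately after the page-table switch.
 */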
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
	unsigned long vaddr, paddr;
	int result = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	vaddr = (unsigned long)relocate_kernel;
	paddr = __pa(page_address(image->control_code_page) + PAGE_SIZE);
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
		if (!p4d)
			goto err;
		image->arch.p4d = p4d;
		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
	}
	p4d = p4d_offset(pgd, vaddr);
	if (!p4d_present(*p4d)) {
		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
		if (!pud)
			goto err;
		image->arch.pud = pud;
		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
	}
	pud = pud_offset(p4d, vaddr);
	if (!pud_present(*pud)) {
		pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
		if (!pmd)
			goto err;
		image->arch.pmd = pmd;
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}
	pmd = pmd_offset(pud, vaddr);
	if (!pmd_present(*pmd)) {
		pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!pte)
			goto err;
		image->arch.pte = pte;
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	}
	pte = pte_offset_kernel(pmd, vaddr);

	if (sev_active())
		prot = PAGE_KERNEL_EXEC;

	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
	return 0;
err:
	return result;
}

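/*
 * Allocate a zeroed page-table page from the image's control pages, so it
 * cannot collide with the destination ranges of the new kernel.
 */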
static void *alloc_pgt_page(void *data)
{
	struct kimage *image = (struct kimage *)data;
	struct page *page;
	void *p = NULL;

	page = kimage_alloc_control_pages(image, 0);
	if (page) {
		p = page_address(page);
		clear_page(p);
	}

	return p;
}

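/*
 * Build the identity-mapped page table used during relocation: cover every
 * pfn_mapped range, the destination of each kexec segment, the EFI system
 * table and the ACPI tables, then hook in the transition mapping.
 */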
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,
		.context	= image,
		.page_flag	= __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag	= _KERNPG_TABLE_NOENC,
	};
	unsigned long mstart, mend;
	pgd_t *level4p;
	int result;
	int i;

	level4p = (pgd_t *)__va(start_pgtable);
	clear_page(level4p);

	if (sev_active()) {
		info.page_flag   |= _PAGE_ENC;
		info.kernpg_flag |= _PAGE_ENC;
	}

	if (direct_gbpages)
		info.direct_gbpages = true;

	for (i = 0; i < nr_pfn_mapped; i++) {
		mstart = pfn_mapped[i].start << PAGE_SHIFT;
		mend   = pfn_mapped[i].end << PAGE_SHIFT;

		result = kernel_ident_mapping_init(&info,
						   level4p, mstart, mend);
		if (result)
			return result;
	}

	/*
	 * The segments' memory ranges could lie outside 0 ~ max_pfn, for
	 * example when jumping back to the original kernel from a kexec'd
	 * kernel, or when the first kernel was booted with a user-supplied
	 * memory map and the second kernel is loaded outside that range.
	 */
	for (i = 0; i < image->nr_segments; i++) {
		mstart = image->segment[i].mem;
		mend   = mstart + image->segment[i].memsz;

		result = kernel_ident_mapping_init(&info,
						   level4p, mstart, mend);

		if (result)
			return result;
	}

	/*
	 * Prepare the EFI system table and ACPI tables for the kexec kernel
	 * since they are not covered by pfn_mapped.
	 */
	result = map_efi_systab(&info, level4p);
	if (result)
		return result;

	result = map_acpi_tables(&info, level4p);
	if (result)
		return result;

	return init_transition_pgtable(image, level4p);
}

static void set_idt(void *newidt, u16 limit)
{
	struct desc_ptr curidt;

	/* x86-64 supports unaligned loads & stores */
	curidt.size    = limit;
	curidt.address = (unsigned long)newidt;

	__asm__ __volatile__ (
		"lidtq %0\n"
		: : "m" (curidt)
		);
}

static void set_gdt(void *newgdt, u16 limit)
{
	struct desc_ptr curgdt;

	/* x86-64 supports unaligned loads & stores */
	curgdt.size    = limit;
	curgdt.address = (unsigned long)newgdt;

	__asm__ __volatile__ (
		"lgdtq %0\n"
		: : "m" (curgdt)
		);
}

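/*
 * Reload the data segment registers with __KERNEL_DS so that their hidden
 * descriptor caches no longer refer to the soon-to-be-zapped GDT.
 */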
static void load_segments(void)
{
	__asm__ __volatile__ (
		"\tmovl %0,%%ds\n"
		"\tmovl %0,%%es\n"
		"\tmovl %0,%%ss\n"
		"\tmovl %0,%%fs\n"
		"\tmovl %0,%%gs\n"
		: : "a" (__KERNEL_DS) : "memory"
		);
}

int machine_kexec_prepare(struct kimage *image)
{
	unsigned long start_pgtable;
	int result;

	/* Calculate the offsets */
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;

	/* Set up the identity-mapped 64-bit page table */
	result = init_pgtable(image, start_pgtable);
	if (result)
		return result;

	return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
	unsigned long page_list[PAGES_NR];
	void *control_page;
	int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
		save_processor_state();
#endif

	save_ftrace_enabled = __ftrace_enabled_save();

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();
	hw_breakpoint_disable();

	if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
		/*
		 * We need to put APICs in legacy mode so that we can
		 * get timer interrupts in the second kernel. The kexec
		 * and kdump paths already call restore_boot_irq_mode()
		 * in one form or another; the kexec jump path needs one
		 * too.
		 */
		clear_IO_APIC();
		restore_boot_irq_mode();
#endif
	}

	control_page = page_address(image->control_code_page) + PAGE_SIZE;
	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
	page_list[PA_TABLE_PAGE] =
	  (unsigned long)__pa(page_address(image->control_code_page));

	if (image->type == KEXEC_TYPE_DEFAULT)
		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
						<< PAGE_SHIFT);

	/*
	 * The segment registers are funny things: they have both a
	 * visible and an invisible part.  Whenever the visible part is
	 * set to a specific selector, the invisible part is loaded
	 * from a table in memory.  At no other time is the descriptor
	 * table in memory accessed.
	 *
	 * I take advantage of this here by force loading the segments
	 * before I zap the gdt with an invalid value.
	 */
	load_segments();
	/*
	 * The GDT & IDT are now invalid.
	 * If you want to load them you must set up your own IDT & GDT.
	 */
	set_gdt(phys_to_virt(0), 0);
	set_idt(phys_to_virt(0), 0);

	/* now call it */
	image->start = relocate_kernel((unsigned long)image->head,
				       (unsigned long)page_list,
				       image->start,
				       image->preserve_context,
				       sme_active());

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
		restore_processor_state();
#endif

	__ftrace_enabled_restore(save_ftrace_enabled);
}

/* arch-dependent functionality related to the kexec file-based syscall */

#ifdef CONFIG_KEXEC_FILE
void *arch_kexec_kernel_image_load(struct kimage *image)
{
	vfree(image->arch.elf_headers);
	image->arch.elf_headers = NULL;

	if (!image->fops || !image->fops->load)
		return ERR_PTR(-ENOEXEC);

	return image->fops->load(image, image->kernel_buf,
				 image->kernel_buf_len, image->initrd_buf,
				 image->initrd_buf_len, image->cmdline_buf,
				 image->cmdline_buf_len);
}

/*
 * Apply purgatory relocations.
 *
 * @pi:         Purgatory to be relocated.
 * @section:    Section to which the relocations apply.
 * @relsec:     Section containing RELAs.
 * @symtabsec:  Corresponding symtab.
 *
 * TODO: Some of this code belongs in generic code. Move that to kexec.c.
 */
int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
				     Elf_Shdr *section, const Elf_Shdr *relsec,
				     const Elf_Shdr *symtabsec)
{
	unsigned int i;
	Elf64_Rela *rel;
	Elf64_Sym *sym;
	void *location;
	unsigned long address, sec_base, value;
	const char *strtab, *name, *shstrtab;
	const Elf_Shdr *sechdrs;

	/* String & section header string tables */
	sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
	strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
	shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;

	rel = (void *)pi->ehdr + relsec->sh_offset;

	pr_debug("Applying relocate section %s to %u\n",
		 shstrtab + relsec->sh_name, relsec->sh_info);

	for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {

		/*
		 * rel[i].r_offset contains the byte offset from the
		 * beginning of the section to the storage unit affected.
		 *
		 * This is the location to update, inside the temporary
		 * buffer where the section is currently loaded. The section
		 * will finally be loaded to a different address, pointed to
		 * by ->sh_addr; kexec takes care of moving it
		 * (kexec_load_segment()).
		 */
		location = pi->purgatory_buf;
		location += section->sh_offset;
		location += rel[i].r_offset;

		/* Final address of the location */
		address = section->sh_addr + rel[i].r_offset;

		/*
		 * rel[i].r_info encodes both the index of the symbol the
		 * relocation refers to and the type of relocation to apply.
		 * The ELF64_R_SYM() and ELF64_R_TYPE() macros extract them,
		 * respectively.
		 */
		sym = (void *)pi->ehdr + symtabsec->sh_offset;
		sym += ELF64_R_SYM(rel[i].r_info);

		if (sym->st_name)
			name = strtab + sym->st_name;
		else
			name = shstrtab + sechdrs[sym->st_shndx].sh_name;

		pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
			 name, sym->st_info, sym->st_shndx, sym->st_value,
			 sym->st_size);

		if (sym->st_shndx == SHN_UNDEF) {
			pr_err("Undefined symbol: %s\n", name);
			return -ENOEXEC;
		}

		if (sym->st_shndx == SHN_COMMON) {
			pr_err("symbol '%s' in common section\n", name);
			return -ENOEXEC;
		}

		if (sym->st_shndx == SHN_ABS)
			sec_base = 0;
		else if (sym->st_shndx >= pi->ehdr->e_shnum) {
			pr_err("Invalid section %d for symbol %s\n",
			       sym->st_shndx, name);
			return -ENOEXEC;
		} else
			sec_base = pi->sechdrs[sym->st_shndx].sh_addr;

		value = sym->st_value;
		value += sec_base;
		value += rel[i].r_addend;

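		/*
		 * value now holds S + A (symbol value plus addend); the
		 * PC-relative cases below subtract the place address P,
		 * yielding S + A - P.
		 */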
		switch (ELF64_R_TYPE(rel[i].r_info)) {
		case R_X86_64_NONE:
			break;
		case R_X86_64_64:
			*(u64 *)location = value;
			break;
		case R_X86_64_32:
			*(u32 *)location = value;
			if (value != *(u32 *)location)
				goto overflow;
			break;
		case R_X86_64_32S:
			*(s32 *)location = value;
			if ((s64)value != *(s32 *)location)
				goto overflow;
			break;
		case R_X86_64_PC32:
		case R_X86_64_PLT32:
			value -= (u64)address;
			*(u32 *)location = value;
			break;
		default:
			pr_err("Unknown rela relocation: %llu\n",
			       ELF64_R_TYPE(rel[i].r_info));
			return -ENOEXEC;
		}
	}
	return 0;

overflow:
	pr_err("Overflow in relocation type %d value 0x%lx\n",
	       (int)ELF64_R_TYPE(rel[i].r_info), value);
	return -ENOEXEC;
}
#endif /* CONFIG_KEXEC_FILE */

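/*
 * Toggle the protection of the physical range [start, end] between
 * read-only and read-write.
 */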
static int
kexec_mark_range(unsigned long start, unsigned long end, bool protect)
{
	struct page *page;
	unsigned int nr_pages;

	/*
	 * For a physical range [start, end]: we must skip an unassigned
	 * crashk resource, whose "end" member is zero.
	 */
	if (!end || start > end)
		return 0;

	page = pfn_to_page(start >> PAGE_SHIFT);
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	if (protect)
		return set_pages_ro(page, nr_pages);
	else
		return set_pages_rw(page, nr_pages);
}

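/*
 * Protect (or unprotect) the crash kernel reservation, leaving the control
 * code page writable so crash_kexec() can still copy into it.
 */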
static void kexec_mark_crashkres(bool protect)
{
	unsigned long control;

	kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);

	/* Don't touch the control code page used in crash_kexec(). */
	control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
	/* The control code page is located in the 2nd page. */
	kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
	control += KEXEC_CONTROL_PAGE_SIZE;
	kexec_mark_range(control, crashk_res.end, protect);
}

void arch_kexec_protect_crashkres(void)
{
	kexec_mark_crashkres(true);
}

void arch_kexec_unprotect_crashkres(void)
{
	kexec_mark_crashkres(false);
}

/*
 * During a traditional boot under SME, SME will encrypt the kernel,
 * so the SME kexec kernel also needs to be un-encrypted in order to
 * replicate a normal SME boot.
 *
 * During a traditional boot under SEV, the kernel has already been
 * loaded encrypted, so the SEV kexec kernel needs to be encrypted in
 * order to replicate a normal SEV boot.
 */
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
	if (sev_active())
		return 0;

	/*
	 * If SME is active we need to be sure that kexec pages are
	 * not encrypted because when we boot to the new kernel the
	 * pages won't be accessed encrypted (initially).
	 */
	return set_memory_decrypted((unsigned long)vaddr, pages);
}

void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
	if (sev_active())
		return;

	/*
	 * If SME is active we need to reset the pages back to being
	 * an encrypted mapping before freeing them.
	 */
	set_memory_encrypted((unsigned long)vaddr, pages);
}