linux/arch/x86/kernel/machine_kexec_64.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 */

#define pr_fmt(fmt)     "kexec: " fmt

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
#include <linux/vmalloc.h>
#include <linux/efi.h>

#include <asm/init.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io_apic.h>
#include <asm/debugreg.h>
#include <asm/kexec-bzimage64.h>
#include <asm/setup.h>
#include <asm/set_memory.h>

#ifdef CONFIG_ACPI
/*
 * Used while adding mapping for ACPI tables.
 * Can be reused when other iomem regions need to be mapped
 */
struct init_pgtable_data {
        struct x86_mapping_info *info;
        pgd_t *level4p;
};

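/* Identity-map a single iomem resource into the kexec page tables. */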
static int mem_region_callback(struct resource *res, void *arg)
{
        struct init_pgtable_data *data = arg;
        unsigned long mstart, mend;

        mstart = res->start;
        mend = mstart + resource_size(res) - 1;

        return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
}

static int
map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p)
{
        struct init_pgtable_data data;
        unsigned long flags;
        int ret;

        data.info = info;
        data.level4p = level4p;
        flags = IORESOURCE_MEM | IORESOURCE_BUSY;

        ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1,
                                  &data, mem_region_callback);
        if (ret && ret != -EINVAL)
                return ret;

        /* ACPI tables could be located in the ACPI Non-volatile Storage region */
        ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1,
                                  &data, mem_region_callback);
        if (ret && ret != -EINVAL)
                return ret;

        return 0;
}
#else
static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; }
#endif

#ifdef CONFIG_KEXEC_FILE
const struct kexec_file_ops * const kexec_file_loaders[] = {
                &kexec_bzImage64_ops,
                NULL
};
#endif

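/*
 * Identity-map the EFI system table so the kexeced kernel can reach it;
 * like the ACPI tables, it is not covered by pfn_mapped.
 */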
static int
map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
        unsigned long mstart, mend;

        if (!efi_enabled(EFI_BOOT))
                return 0;

        mstart = (boot_params.efi_info.efi_systab |
                        ((u64)boot_params.efi_info.efi_systab_hi<<32));

        if (efi_enabled(EFI_64BIT))
                mend = mstart + sizeof(efi_system_table_64_t);
        else
                mend = mstart + sizeof(efi_system_table_32_t);

        if (!mstart)
                return 0;

        return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
        return 0;
}

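/*
 * Free the page-table pages allocated by init_transition_pgtable()
 * for mapping the control code page.
 */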
static void free_transition_pgtable(struct kimage *image)
{
        free_page((unsigned long)image->arch.p4d);
        image->arch.p4d = NULL;
        free_page((unsigned long)image->arch.pud);
        image->arch.pud = NULL;
        free_page((unsigned long)image->arch.pmd);
        image->arch.pmd = NULL;
        free_page((unsigned long)image->arch.pte);
        image->arch.pte = NULL;
}

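/*
 * Map the control code page at the virtual address of relocate_kernel()
 * so execution can continue at the same address once the identity-mapped
 * page table is switched in. Missing intermediate levels are allocated
 * here and recorded in image->arch so free_transition_pgtable() can
 * release them later.
 */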
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
        pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
        unsigned long vaddr, paddr;
        int result = -ENOMEM;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        vaddr = (unsigned long)relocate_kernel;
        paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
        pgd += pgd_index(vaddr);
        if (!pgd_present(*pgd)) {
                p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
                if (!p4d)
                        goto err;
                image->arch.p4d = p4d;
                set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
        }
        p4d = p4d_offset(pgd, vaddr);
        if (!p4d_present(*p4d)) {
                pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
                if (!pud)
                        goto err;
                image->arch.pud = pud;
                set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
        }
        pud = pud_offset(p4d, vaddr);
        if (!pud_present(*pud)) {
                pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
                if (!pmd)
                        goto err;
                image->arch.pmd = pmd;
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
        }
        pmd = pmd_offset(pud, vaddr);
        if (!pmd_present(*pmd)) {
                pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
                if (!pte)
                        goto err;
                image->arch.pte = pte;
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
        }
        pte = pte_offset_kernel(pmd, vaddr);

        if (sev_active())
                prot = PAGE_KERNEL_EXEC;

        set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
        return 0;
err:
        return result;
}

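/*
 * Allocator callback for x86_mapping_info: hand out zeroed control
 * pages, which are allocated so as not to collide with the destination
 * ranges of the image segments.
 */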
static void *alloc_pgt_page(void *data)
{
        struct kimage *image = (struct kimage *)data;
        struct page *page;
        void *p = NULL;

        page = kimage_alloc_control_pages(image, 0);
        if (page) {
                p = page_address(page);
                clear_page(p);
        }

        return p;
}

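/*
 * Build the identity-mapped page table used during relocation. It must
 * cover everything the relocation code may touch: all currently mapped
 * pfn ranges, the destination ranges of the image segments, and the
 * EFI/ACPI tables.
 */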
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
        struct x86_mapping_info info = {
                .alloc_pgt_page = alloc_pgt_page,
                .context        = image,
                .page_flag      = __PAGE_KERNEL_LARGE_EXEC,
                .kernpg_flag    = _KERNPG_TABLE_NOENC,
        };
        unsigned long mstart, mend;
        pgd_t *level4p;
        int result;
        int i;

        level4p = (pgd_t *)__va(start_pgtable);
        clear_page(level4p);

        if (sev_active()) {
                info.page_flag   |= _PAGE_ENC;
                info.kernpg_flag |= _PAGE_ENC;
        }

        if (direct_gbpages)
                info.direct_gbpages = true;

        for (i = 0; i < nr_pfn_mapped; i++) {
                mstart = pfn_mapped[i].start << PAGE_SHIFT;
                mend   = pfn_mapped[i].end << PAGE_SHIFT;

                result = kernel_ident_mapping_init(&info,
                                                 level4p, mstart, mend);
                if (result)
                        return result;
        }

        /*
         * The segments' mem ranges could be outside 0 ~ max_pfn,
         * for example when jumping back to the original kernel from the
         * kexeced kernel, or when the first kernel was booted with a
         * user-supplied mem map and the second kernel is loaded outside
         * that range.
         */
        for (i = 0; i < image->nr_segments; i++) {
                mstart = image->segment[i].mem;
                mend   = mstart + image->segment[i].memsz;

                result = kernel_ident_mapping_init(&info,
                                                 level4p, mstart, mend);

                if (result)
                        return result;
        }

        /*
         * Prepare the EFI systab and ACPI tables for the kexec kernel
         * since they are not covered by pfn_mapped.
         */
        result = map_efi_systab(&info, level4p);
        if (result)
                return result;

        result = map_acpi_tables(&info, level4p);
        if (result)
                return result;

        return init_transition_pgtable(image, level4p);
}

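/*
 * Force-load the data segment registers with __KERNEL_DS so their
 * hidden descriptor state is valid before the GDT is invalidated.
 */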
static void load_segments(void)
{
        __asm__ __volatile__ (
                "\tmovl %0,%%ds\n"
                "\tmovl %0,%%es\n"
                "\tmovl %0,%%ss\n"
                "\tmovl %0,%%fs\n"
                "\tmovl %0,%%gs\n"
                : : "a" (__KERNEL_DS) : "memory"
                );
}

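/*
 * Set up the identity-mapped page table ahead of time; machine_kexec()
 * itself must not allocate memory or fail.
 */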
int machine_kexec_prepare(struct kimage *image)
{
        unsigned long start_pgtable;
        int result;

        /* Calculate the offsets */
        start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;

        /* Setup the identity mapped 64bit page table */
        result = init_pgtable(image, start_pgtable);
        if (result)
                return result;

        return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
        free_transition_pgtable(image);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
        unsigned long page_list[PAGES_NR];
        void *control_page;
        int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                save_processor_state();
#endif

        save_ftrace_enabled = __ftrace_enabled_save();

        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
        hw_breakpoint_disable();

        if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
                /*
                 * We need to put APICs in legacy mode so that we can
                 * get timer interrupts in the second kernel. The
                 * kexec/kdump paths already have calls to
                 * restore_boot_irq_mode() in one form or another; the
                 * kexec jump path needs one as well.
                 */
                clear_IO_APIC();
                restore_boot_irq_mode();
#endif
        }

        control_page = page_address(image->control_code_page) + PAGE_SIZE;
        memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

        page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
        page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
        page_list[PA_TABLE_PAGE] =
          (unsigned long)__pa(page_address(image->control_code_page));

        if (image->type == KEXEC_TYPE_DEFAULT)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);

        /*
         * The segment registers are funny things, they have both a
         * visible and an invisible part.  Whenever the visible part is
         * set to a specific selector, the invisible part is loaded
         * from a table in memory.  At no other time is the
         * descriptor table in memory accessed.
         *
         * I take advantage of this here by force loading the
         * segments, before I zap the gdt with an invalid value.
         */
        load_segments();
        /*
         * The gdt & idt are now invalid.
         * If you want to load them you must set up your own idt & gdt.
         */
        native_idt_invalidate();
        native_gdt_invalidate();

        /* now call it */
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
                                       image->start,
                                       image->preserve_context,
                                       sme_active());

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                restore_processor_state();
#endif

        __ftrace_enabled_restore(save_ftrace_enabled);
}

/* arch-dependent functionality related to kexec file-based syscall */

#ifdef CONFIG_KEXEC_FILE
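/*
 * Free any crash ELF headers left over from a previous load, then hand
 * the image off to the matching file loader via image->fops->load().
 */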
void *arch_kexec_kernel_image_load(struct kimage *image)
{
        vfree(image->elf_headers);
        image->elf_headers = NULL;

        if (!image->fops || !image->fops->load)
                return ERR_PTR(-ENOEXEC);

        return image->fops->load(image, image->kernel_buf,
                                 image->kernel_buf_len, image->initrd_buf,
                                 image->initrd_buf_len, image->cmdline_buf,
                                 image->cmdline_buf_len);
}

/*
 * Apply purgatory relocations.
 *
 * @pi:         Purgatory to be relocated.
 * @section:    Section to which the relocations apply.
 * @relsec:     Section containing RELAs.
 * @symtabsec:  Corresponding symtab.
 *
 * TODO: Some of the code belongs to generic code. Move that into kexec.c.
 */
int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
                                     Elf_Shdr *section, const Elf_Shdr *relsec,
                                     const Elf_Shdr *symtabsec)
{
        unsigned int i;
        Elf64_Rela *rel;
        Elf64_Sym *sym;
        void *location;
        unsigned long address, sec_base, value;
        const char *strtab, *name, *shstrtab;
        const Elf_Shdr *sechdrs;

        /* String & section header string table */
        sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
        strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset;
        shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;

        rel = (void *)pi->ehdr + relsec->sh_offset;

        pr_debug("Applying relocate section %s to %u\n",
                 shstrtab + relsec->sh_name, relsec->sh_info);

        for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) {

                /*
                 * rel[i].r_offset contains the byte offset from the
                 * beginning of the section to the storage unit affected.
                 *
                 * This is the location to update, inside the temporary
                 * buffer where the section is currently loaded. The
                 * section will eventually be loaded at a different
                 * address, pointed to by ->sh_addr; kexec takes care of
                 * moving it there (kexec_load_segment()).
                 */
                location = pi->purgatory_buf;
                location += section->sh_offset;
                location += rel[i].r_offset;

                /* Final address of the location */
                address = section->sh_addr + rel[i].r_offset;

                /*
                 * rel[i].r_info encodes both the symbol table index
                 * against which the relocation must be made and the
                 * type of relocation to apply. The ELF64_R_SYM() and
                 * ELF64_R_TYPE() macros extract these, respectively.
                 */
                sym = (void *)pi->ehdr + symtabsec->sh_offset;
                sym += ELF64_R_SYM(rel[i].r_info);

                if (sym->st_name)
                        name = strtab + sym->st_name;
                else
                        name = shstrtab + sechdrs[sym->st_shndx].sh_name;

                pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
                         name, sym->st_info, sym->st_shndx, sym->st_value,
                         sym->st_size);

                if (sym->st_shndx == SHN_UNDEF) {
                        pr_err("Undefined symbol: %s\n", name);
                        return -ENOEXEC;
                }

                if (sym->st_shndx == SHN_COMMON) {
                        pr_err("symbol '%s' in common section\n", name);
                        return -ENOEXEC;
                }

                if (sym->st_shndx == SHN_ABS)
                        sec_base = 0;
                else if (sym->st_shndx >= pi->ehdr->e_shnum) {
                        pr_err("Invalid section %d for symbol %s\n",
                               sym->st_shndx, name);
                        return -ENOEXEC;
                } else
                        sec_base = pi->sechdrs[sym->st_shndx].sh_addr;

                value = sym->st_value;
                value += sec_base;
                value += rel[i].r_addend;

                switch (ELF64_R_TYPE(rel[i].r_info)) {
                case R_X86_64_NONE:
                        break;
                case R_X86_64_64:
                        *(u64 *)location = value;
                        break;
                case R_X86_64_32:
                        *(u32 *)location = value;
                        if (value != *(u32 *)location)
                                goto overflow;
                        break;
                case R_X86_64_32S:
                        *(s32 *)location = value;
                        if ((s64)value != *(s32 *)location)
                                goto overflow;
                        break;
                case R_X86_64_PC32:
                case R_X86_64_PLT32:
                        value -= (u64)address;
                        *(u32 *)location = value;
                        break;
                default:
                        pr_err("Unknown rela relocation: %llu\n",
                               ELF64_R_TYPE(rel[i].r_info));
                        return -ENOEXEC;
                }
        }
        return 0;

overflow:
        pr_err("Overflow in relocation type %d value 0x%lx\n",
               (int)ELF64_R_TYPE(rel[i].r_info), value);
        return -ENOEXEC;
}
#endif /* CONFIG_KEXEC_FILE */

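/* Toggle the given physical range between read-only and read-write. */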
static int
kexec_mark_range(unsigned long start, unsigned long end, bool protect)
{
        struct page *page;
        unsigned int nr_pages;

        /*
         * The range is physical: [start, end]. We must skip the
         * unassigned crashk resource, whose "end" member is zero.
         */
        if (!end || start > end)
                return 0;

        page = pfn_to_page(start >> PAGE_SHIFT);
        nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
        if (protect)
                return set_pages_ro(page, nr_pages);
        else
                return set_pages_rw(page, nr_pages);
}

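/*
 * Protect (or unprotect) the crash kernel regions, skipping the control
 * code pages so that crash_kexec() can still use them.
 */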
static void kexec_mark_crashkres(bool protect)
{
        unsigned long control;

        kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect);

        /* Don't touch the control code page used in crash_kexec(). */
        control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
        /* Control code page is located in the 2nd page. */
        kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
        control += KEXEC_CONTROL_PAGE_SIZE;
        kexec_mark_range(control, crashk_res.end, protect);
}

void arch_kexec_protect_crashkres(void)
{
        kexec_mark_crashkres(true);
}

void arch_kexec_unprotect_crashkres(void)
{
        kexec_mark_crashkres(false);
}

/*
 * During a traditional boot under SME, SME will encrypt the kernel,
 * so the SME kexec kernel also needs to be un-encrypted in order to
 * replicate a normal SME boot.
 *
 * During a traditional boot under SEV, the kernel has already been
 * loaded encrypted, so the SEV kexec kernel needs to be encrypted in
 * order to replicate a normal SEV boot.
 */
int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
{
        if (sev_active())
                return 0;

        /*
         * If SME is active we need to be sure that kexec pages are
         * not encrypted because when we boot to the new kernel the
         * pages won't be accessed encrypted (initially).
         */
        return set_memory_decrypted((unsigned long)vaddr, pages);
}

void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
{
        if (sev_active())
                return;

        /*
         * If SME is active we need to reset the pages back to being
         * an encrypted mapping before freeing them.
         */
        set_memory_encrypted((unsigned long)vaddr, pages);
}