linux/arch/x86/kernel/machine_kexec_64.c
/*
 * handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include <linux/reboot.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>

#include <asm/init.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>

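/*
 * Free the pages backing the transition page table. free_page() treats
 * a zero address as a no-op, so this is safe to call even when
 * init_transition_pgtable() failed partway through and some of the
 * image->arch pointers are still NULL.
 */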
static void free_transition_pgtable(struct kimage *image)
{
        free_page((unsigned long)image->arch.pud);
        free_page((unsigned long)image->arch.pmd);
        free_page((unsigned long)image->arch.pte);
}

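/*
 * Map the kernel virtual address of relocate_kernel() to the physical
 * copy of the relocation code in the control page. After relocate_kernel
 * switches CR3 to the identity-mapped table, instruction fetch continues
 * at the same virtual address, but now hits the control-page copy, so
 * execution survives the page-table switch.
 */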
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        unsigned long vaddr, paddr;
        int result = -ENOMEM;

        vaddr = (unsigned long)relocate_kernel;
        paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
        pgd += pgd_index(vaddr);
        if (!pgd_present(*pgd)) {
                pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
                if (!pud)
                        goto err;
                image->arch.pud = pud;
                set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
        }
        pud = pud_offset(pgd, vaddr);
        if (!pud_present(*pud)) {
                pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
                if (!pmd)
                        goto err;
                image->arch.pmd = pmd;
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
        }
        pmd = pmd_offset(pud, vaddr);
        if (!pmd_present(*pmd)) {
                pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
                if (!pte)
                        goto err;
                image->arch.pte = pte;
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
        }
        pte = pte_offset_kernel(pmd, vaddr);
        set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
        return 0;
err:
        free_transition_pgtable(image);
        return result;
}

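/*
 * Allocator callback for kernel_ident_mapping_init(). Page-table pages
 * are taken from the image's control pages, which kexec allocates so
 * that they do not overlap the destination ranges of the new kernel's
 * segments and thus survive the relocation.
 */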
static void *alloc_pgt_page(void *data)
{
        struct kimage *image = (struct kimage *)data;
        struct page *page;
        void *p = NULL;

        page = kimage_alloc_control_pages(image, 0);
        if (page) {
                p = page_address(page);
                clear_page(p);
        }

        return p;
}

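/*
 * Build the identity-mapped page table used while relocating the new
 * kernel: cover every range the current kernel has mapped (pfn_mapped)
 * plus every segment of the loaded image, then hook in the transition
 * mapping for relocate_kernel().
 */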
static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
        struct x86_mapping_info info = {
                .alloc_pgt_page = alloc_pgt_page,
                .context        = image,
                .pmd_flag       = __PAGE_KERNEL_LARGE_EXEC,
        };
        unsigned long mstart, mend;
        pgd_t *level4p;
        int result;
        int i;

        level4p = (pgd_t *)__va(start_pgtable);
        clear_page(level4p);
        for (i = 0; i < nr_pfn_mapped; i++) {
                mstart = pfn_mapped[i].start << PAGE_SHIFT;
                mend   = pfn_mapped[i].end << PAGE_SHIFT;

                result = kernel_ident_mapping_init(&info,
                                                 level4p, mstart, mend);
                if (result)
                        return result;
        }

        /*
         * The segments' memory ranges can lie outside 0 ~ max_pfn, for
         * example when jumping back to the original kernel from a
         * kexeced kernel, or when the first kernel was booted with a
         * user-supplied memory map and the second kernel is loaded
         * outside that range.
         */
        for (i = 0; i < image->nr_segments; i++) {
                mstart = image->segment[i].mem;
                mend   = mstart + image->segment[i].memsz;

                result = kernel_ident_mapping_init(&info,
                                                 level4p, mstart, mend);

                if (result)
                        return result;
        }

        return init_transition_pgtable(image, level4p);
}

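/*
 * set_idt()/set_gdt() load a caller-provided descriptor table via
 * lidt/lgdt. The operand is the packed descriptor pointer (a sketch of
 * the asm/desc_defs.h definition):
 *
 *      struct desc_ptr {
 *              unsigned short size;
 *              unsigned long address;
 *      } __attribute__((packed));
 */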
static void set_idt(void *newidt, u16 limit)
{
        struct desc_ptr curidt;

        /* x86-64 supports unaligned loads & stores */
        curidt.size    = limit;
        curidt.address = (unsigned long)newidt;

        __asm__ __volatile__ (
                "lidtq %0\n"
                : : "m" (curidt)
                );
}

static void set_gdt(void *newgdt, u16 limit)
{
        struct desc_ptr curgdt;

        /* x86-64 supports unaligned loads & stores */
        curgdt.size    = limit;
        curgdt.address = (unsigned long)newgdt;

        __asm__ __volatile__ (
                "lgdtq %0\n"
                : : "m" (curgdt)
                );
}

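/*
 * Reload the data segment registers with __KERNEL_DS. Once the visible
 * selectors are set, the CPU's cached (invisible) descriptors stay
 * valid even after the GDT itself is torn down below.
 */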
static void load_segments(void)
{
        __asm__ __volatile__ (
                "\tmovl %0,%%ds\n"
                "\tmovl %0,%%es\n"
                "\tmovl %0,%%ss\n"
                "\tmovl %0,%%fs\n"
                "\tmovl %0,%%gs\n"
                : : "a" (__KERNEL_DS) : "memory"
                );
}

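/*
 * Rough call path (a sketch; exact names vary by kernel version):
 *
 *      sys_kexec_load()
 *        -> machine_kexec_prepare()      (build identity page table)
 *      reboot(LINUX_REBOOT_CMD_KEXEC)
 *        -> kernel_kexec()
 *             -> machine_kexec()         (point of no return)
 */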
int machine_kexec_prepare(struct kimage *image)
{
        unsigned long start_pgtable;
        int result;

        /* Calculate the offsets */
        start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;

        /* Set up the identity-mapped 64-bit page table */
        result = init_pgtable(image, start_pgtable);
        if (result)
                return result;

        return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
        free_transition_pgtable(image);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
        unsigned long page_list[PAGES_NR];
        void *control_page;
        int save_ftrace_enabled;

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                save_processor_state();
#endif

        save_ftrace_enabled = __ftrace_enabled_save();

        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();
        hw_breakpoint_disable();

        if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
                /*
                 * We need to put the APICs into legacy mode so that we
                 * can get timer interrupts in the second kernel. The
                 * kexec/kdump paths already call disable_IO_APIC() in
                 * one form or another; the kexec jump path needs one
                 * as well.
                 */
                disable_IO_APIC();
#endif
        }

        control_page = page_address(image->control_code_page) + PAGE_SIZE;
        memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

        page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
        page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
        page_list[PA_TABLE_PAGE] =
          (unsigned long)__pa(page_address(image->control_code_page));

        if (image->type == KEXEC_TYPE_DEFAULT)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);

        /*
         * The segment registers are funny things: they have both a
         * visible and an invisible part. Whenever the visible part is
         * set to a specific selector, the invisible part is loaded
         * from a descriptor table in memory. At no other time is the
         * descriptor table in memory accessed.
         *
         * I take advantage of this here by force-loading the
         * segments before zapping the GDT with an invalid value.
         */
        load_segments();
        /*
         * The GDT & IDT are now invalid.
         * If you want to load them you must set up your own IDT & GDT.
         */
        set_gdt(phys_to_virt(0), 0);
        set_idt(phys_to_virt(0), 0);

        /* now call it */
        image->start = relocate_kernel((unsigned long)image->head,
                                       (unsigned long)page_list,
                                       image->start,
                                       image->preserve_context);

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                restore_processor_state();
#endif

        __ftrace_enabled_restore(save_ftrace_enabled);
}

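/*
 * Export arch-specific symbols into the vmcoreinfo note so that dump
 * analysis tools (e.g. makedumpfile, crash) can interpret the crash
 * dump: phys_base gives the kernel's physical load offset and
 * init_level4_pgt the top-level page table.
 */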
void arch_crash_save_vmcoreinfo(void)
{
        VMCOREINFO_SYMBOL(phys_base);
        VMCOREINFO_SYMBOL(init_level4_pgt);

#ifdef CONFIG_NUMA
        VMCOREINFO_SYMBOL(node_data);
        VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
}