linux/arch/x86/kernel/machine_kexec_32.c
/*
 * handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/suspend.h>
#include <linux/gfp.h>
#include <linux/io.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
#include <asm/system.h>
#include <asm/cacheflush.h>

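/* Switch the CPU to a new interrupt descriptor table. */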
static void set_idt(void *newidt, __u16 limit)
{
        struct desc_ptr curidt;

        /* ia32 supports unaligned loads & stores */
        curidt.size    = limit;
        curidt.address = (unsigned long)newidt;

        load_idt(&curidt);
}
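/* Switch the CPU to a new global descriptor table. */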
static void set_gdt(void *newgdt, __u16 limit)
{
        struct desc_ptr curgdt;

        /* ia32 supports unaligned loads & stores */
        curgdt.size    = limit;
        curgdt.address = (unsigned long)newgdt;

        load_gdt(&curgdt);
}

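/*
 * Reload every segment register from the current GDT.  The far jump
 * refreshes %cs; the moves refresh the data and stack segments.  This
 * forces each register's hidden descriptor cache to be reloaded, after
 * which the GDT itself is no longer referenced and can be thrown away.
 */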
static void load_segments(void)
{
#define __STR(X) #X
#define STR(X) __STR(X)

        __asm__ __volatile__ (
                "\tljmp $"STR(__KERNEL_CS)",$1f\n"
                "\t1:\n"
                "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
                "\tmovl %%eax,%%ds\n"
                "\tmovl %%eax,%%es\n"
                "\tmovl %%eax,%%fs\n"
                "\tmovl %%eax,%%gs\n"
                "\tmovl %%eax,%%ss\n"
                : : : "eax", "memory");
#undef STR
#undef __STR
}

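/*
 * Free the page-table pages allocated for the kexec mappings.
 * free_page() ignores a zero address, so this is safe to call on a
 * partially allocated set.
 */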
static void machine_kexec_free_page_tables(struct kimage *image)
{
        free_page((unsigned long)image->arch.pgd);
#ifdef CONFIG_X86_PAE
        free_page((unsigned long)image->arch.pmd0);
        free_page((unsigned long)image->arch.pmd1);
#endif
        free_page((unsigned long)image->arch.pte0);
        free_page((unsigned long)image->arch.pte1);
}

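/*
 * Allocate one zeroed page per page-table level needed to map the
 * control page twice (at its virtual address and 1:1).  On any
 * failure, release whatever was allocated and return -ENOMEM.
 */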
static int machine_kexec_alloc_page_tables(struct kimage *image)
{
        image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
#ifdef CONFIG_X86_PAE
        image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
        image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
#endif
        image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
        image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
        if (!image->arch.pgd ||
#ifdef CONFIG_X86_PAE
            !image->arch.pmd0 || !image->arch.pmd1 ||
#endif
            !image->arch.pte0 || !image->arch.pte1) {
                machine_kexec_free_page_tables(image);
                return -ENOMEM;
        }
        return 0;
}

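/*
 * Install a single executable 4K mapping of @paddr at @vaddr, wiring
 * in the preallocated @pmd (PAE only) and @pte pages wherever an
 * intermediate entry is not already present.
 */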
static void machine_kexec_page_table_set_one(
        pgd_t *pgd, pmd_t *pmd, pte_t *pte,
        unsigned long vaddr, unsigned long paddr)
{
        pud_t *pud;

        pgd += pgd_index(vaddr);
#ifdef CONFIG_X86_PAE
        if (!(pgd_val(*pgd) & _PAGE_PRESENT))
                set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
#endif
        pud = pud_offset(pgd, vaddr);
        pmd = pmd_offset(pud, vaddr);
        if (!(pmd_val(*pmd) & _PAGE_PRESENT))
                set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
        pte = pte_offset_kernel(pmd, vaddr);
        set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
}

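/*
 * Map the control page at both its kernel virtual address and its
 * physical (identity) address, so the relocation code keeps running
 * across the switch to these page tables.
 */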
static void machine_kexec_prepare_page_tables(struct kimage *image)
{
        void *control_page;
        pmd_t *pmd = NULL;

        control_page = page_address(image->control_code_page);
#ifdef CONFIG_X86_PAE
        pmd = image->arch.pmd0;
#endif
        machine_kexec_page_table_set_one(
                image->arch.pgd, pmd, image->arch.pte0,
                (unsigned long)control_page, __pa(control_page));
#ifdef CONFIG_X86_PAE
        pmd = image->arch.pmd1;
#endif
        machine_kexec_page_table_set_one(
                image->arch.pgd, pmd, image->arch.pte1,
                __pa(control_page), __pa(control_page));
}

/*
 * An architecture hook called to validate the
 * proposed image and prepare the control pages
 * as needed.  The pages for KEXEC_CONTROL_PAGE_SIZE
 * have been allocated, but the segments have not yet
 * been copied into the kernel.
 *
 * Do whatever setup is needed on the image and the
 * reboot code buffer to allow us to avoid allocations
 * later.
 *
 * - Make the control page executable.
 * - Allocate page tables.
 * - Set up page tables.
 */
int machine_kexec_prepare(struct kimage *image)
{
        int error;

        if (nx_enabled)
                set_pages_x(image->control_code_page, 1);
        error = machine_kexec_alloc_page_tables(image);
        if (error)
                return error;
        machine_kexec_prepare_page_tables(image);
        return 0;
}

/*
 * Undo anything left over from machine_kexec_prepare
 * when an image is freed.
 */
void machine_kexec_cleanup(struct kimage *image)
{
        if (nx_enabled)
                set_pages_nx(image->control_code_page, 1);
        machine_kexec_free_page_tables(image);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
void machine_kexec(struct kimage *image)
{
        unsigned long page_list[PAGES_NR];
        void *control_page;
        int save_ftrace_enabled;
        asmlinkage unsigned long
                (*relocate_kernel_ptr)(unsigned long indirection_page,
                                       unsigned long control_page,
                                       unsigned long start_address,
                                       unsigned int has_pae,
                                       unsigned int preserve_context);

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                save_processor_state();
#endif

        save_ftrace_enabled = __ftrace_enabled_save();

        /* Interrupts aren't acceptable while we reboot */
        local_irq_disable();

        if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
                /*
                 * We need to put APICs in legacy mode so that we can
                 * get timer interrupts in the second kernel.  The
                 * kexec/kdump paths already have calls to
                 * disable_IO_APIC() in one form or another; the kexec
                 * jump path also needs one.
                 */
                disable_IO_APIC();
#endif
        }

        control_page = page_address(image->control_code_page);
        memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);

        relocate_kernel_ptr = control_page;
        page_list[PA_CONTROL_PAGE] = __pa(control_page);
        page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
        page_list[PA_PGD] = __pa(image->arch.pgd);

        if (image->type == KEXEC_TYPE_DEFAULT)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
                                                << PAGE_SHIFT);

        /*
         * The segment registers are funny things: they have both a
         * visible and an invisible part.  Whenever the visible part is
         * set to a specific selector, the invisible part is loaded
         * from a table in memory.  At no other time is the descriptor
         * table in memory accessed.
         *
         * I take advantage of this here by force-loading the
         * segments, before I zap the gdt with an invalid value.
         */
        load_segments();
        /*
         * The gdt & idt are now invalid.
         * If you want to load them you must set up your own idt & gdt.
         */
        set_gdt(phys_to_virt(0), 0);
        set_idt(phys_to_virt(0), 0);

        /* now call it */
        image->start = relocate_kernel_ptr((unsigned long)image->head,
                                           (unsigned long)page_list,
                                           image->start, cpu_has_pae,
                                           image->preserve_context);

#ifdef CONFIG_KEXEC_JUMP
        if (image->preserve_context)
                restore_processor_state();
#endif

        __ftrace_enabled_restore(save_ftrace_enabled);
}

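/*
 * Export arch-specific crash dump details (the NUMA node_data array
 * and whether PAE is in use) into the vmcoreinfo note, so that dump
 * analysis tools can interpret the captured /proc/vmcore image.
 */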
void arch_crash_save_vmcoreinfo(void)
{
#ifdef CONFIG_NUMA
        VMCOREINFO_SYMBOL(node_data);
        VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
#ifdef CONFIG_X86_PAE
        VMCOREINFO_CONFIG(X86_PAE);
#endif
}