linux/arch/x86/kernel/alternative.c
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/kprobes.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/vsyscall.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/fixmap.h>

#define MAX_PATCH_LEN (255-1)

#ifdef CONFIG_HOTPLUG_CPU
static int smp_alt_once;

static int __init bootonly(char *str)
{
        smp_alt_once = 1;
        return 1;
}
__setup("smp-alt-boot", bootonly);
#else
#define smp_alt_once 1
#endif

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
        debug_alternative = 1;
        return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
        noreplace_smp = 1;
        return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#ifdef CONFIG_PARAVIRT
static int __initdata_or_module noreplace_paravirt;

static int __init setup_noreplace_paravirt(char *str)
{
        noreplace_paravirt = 1;
        return 1;
}
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif

#define DPRINTK(fmt, args...)                                   \
do {                                                            \
        if (debug_alternative)                                  \
                printk(KERN_DEBUG fmt, args);                   \
} while (0)

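/*
 * The NOP tables below are laid out so that <table>[n] points at an
 * n-byte NOP sequence (1 <= n <= ASM_NOP_MAX).  Each flavor's one- to
 * eight-byte NOPs are emitted back to back by the inline asm into an
 * init-or-module rodata section, and the table indexes into that blob
 * by cumulative offset.
 */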
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
/* Use inline assembly to define this because the nops are defined
   as inline assembly strings in the include files and we cannot
   get them easily into strings. */
asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: "
        GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
        GENERIC_NOP7 GENERIC_NOP8
    "\t.previous");
extern const unsigned char intelnops[];
static const unsigned char *const __initconst_or_module
intel_nops[ASM_NOP_MAX+1] = {
        NULL,
        intelnops,
        intelnops + 1,
        intelnops + 1 + 2,
        intelnops + 1 + 2 + 3,
        intelnops + 1 + 2 + 3 + 4,
        intelnops + 1 + 2 + 3 + 4 + 5,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif

#ifdef K8_NOP1
asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: "
        K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
        K8_NOP7 K8_NOP8
    "\t.previous");
extern const unsigned char k8nops[];
static const unsigned char *const __initconst_or_module
k8_nops[ASM_NOP_MAX+1] = {
        NULL,
        k8nops,
        k8nops + 1,
        k8nops + 1 + 2,
        k8nops + 1 + 2 + 3,
        k8nops + 1 + 2 + 3 + 4,
        k8nops + 1 + 2 + 3 + 4 + 5,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif

#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: "
        K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
        K7_NOP7 K7_NOP8
    "\t.previous");
extern const unsigned char k7nops[];
static const unsigned char *const __initconst_or_module
k7_nops[ASM_NOP_MAX+1] = {
        NULL,
        k7nops,
        k7nops + 1,
        k7nops + 1 + 2,
        k7nops + 1 + 2 + 3,
        k7nops + 1 + 2 + 3 + 4,
        k7nops + 1 + 2 + 3 + 4 + 5,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif

#ifdef P6_NOP1
asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: "
        P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
        P6_NOP7 P6_NOP8
    "\t.previous");
extern const unsigned char p6nops[];
static const unsigned char *const __initconst_or_module
p6_nops[ASM_NOP_MAX+1] = {
        NULL,
        p6nops,
        p6nops + 1,
        p6nops + 1 + 2,
        p6nops + 1 + 2 + 3,
        p6nops + 1 + 2 + 3 + 4,
        p6nops + 1 + 2 + 3 + 4 + 5,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
#endif

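/*
 * Pick the NOP table that best suits the boot CPU: P6-style multi-byte
 * NOPs when NOPL is usable, otherwise the K8/K7 vendor sequences,
 * falling back to the generic single-byte forms on 32-bit.
 */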
#ifdef CONFIG_X86_64

extern char __vsyscall_0;
static const unsigned char *const *__init_or_module find_nop_table(void)
{
        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
            boot_cpu_has(X86_FEATURE_NOPL))
                return p6_nops;
        else
                return k8_nops;
}

#else /* CONFIG_X86_64 */

static const unsigned char *const *__init_or_module find_nop_table(void)
{
        if (boot_cpu_has(X86_FEATURE_K8))
                return k8_nops;
        else if (boot_cpu_has(X86_FEATURE_K7))
                return k7_nops;
        else if (boot_cpu_has(X86_FEATURE_NOPL))
                return p6_nops;
        else
                return intel_nops;
}

#endif /* CONFIG_X86_64 */

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
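/*
 * E.g. add_nops(buf, 10) emits an 8-byte NOP followed by a 2-byte NOP:
 * each loop iteration copies the largest sequence that still fits,
 * capped at ASM_NOP_MAX bytes.
 */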
static void __init_or_module add_nops(void *insns, unsigned int len)
{
        const unsigned char *const *noptable = find_nop_table();

        while (len > 0) {
                unsigned int noplen = len;
                if (noplen > ASM_NOP_MAX)
                        noplen = ASM_NOP_MAX;
                memcpy(insns, noptable[noplen], noplen);
                insns += noplen;
                len -= noplen;
        }
}

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
static void *text_poke_early(void *addr, const void *opcode, size_t len);

/* Replace instructions with better alternatives for this CPU type.
   This runs before SMP is initialized to avoid SMP problems with
   self-modifying code. This implies that asymmetric systems where
   APs have fewer capabilities than the boot processor are not handled.
   Tough. Make sure you disable such features by hand. */

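/*
 * Each struct alt_instr records where the original instruction lives
 * (instr/instrlen), the replacement bytes (replacement/replacementlen)
 * and the X86_FEATURE_* bit that must be set for the replacement to be
 * used.  The entries are emitted into the .altinstructions section by
 * the alternative() family of macros in <asm/alternative.h>.
 */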
void __init_or_module apply_alternatives(struct alt_instr *start,
                                         struct alt_instr *end)
{
        struct alt_instr *a;
        char insnbuf[MAX_PATCH_LEN];

        DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
        for (a = start; a < end; a++) {
                u8 *instr = a->instr;
                BUG_ON(a->replacementlen > a->instrlen);
                BUG_ON(a->instrlen > sizeof(insnbuf));
                if (!boot_cpu_has(a->cpuid))
                        continue;
#ifdef CONFIG_X86_64
                /* vsyscall code is not mapped yet. resolve it manually. */
                if (instr >= (u8 *)VSYSCALL_START && instr < (u8 *)VSYSCALL_END) {
                        instr = __va(instr - (u8 *)VSYSCALL_START + (u8 *)__pa_symbol(&__vsyscall_0));
                        DPRINTK("%s: vsyscall fixup: %p => %p\n",
                                __func__, a->instr, instr);
                }
#endif
                memcpy(insnbuf, a->replacement, a->replacementlen);
                add_nops(insnbuf + a->replacementlen,
                         a->instrlen - a->replacementlen);
                text_poke_early(instr, insnbuf, a->instrlen);
        }
}

#ifdef CONFIG_SMP

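/*
 * UP/SMP lock-prefix switching: the .smp_locks section holds pointers
 * to every LOCK prefix emitted via LOCK_PREFIX.  On a uniprocessor
 * kernel the 0xf0 lock prefix is patched into a 0x3e DS segment
 * override, which is architecturally a no-op on these instructions, so
 * e.g. "lock; addl ..." effectively becomes a plain "addl ...".
 */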
static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
        u8 **ptr;

        mutex_lock(&text_mutex);
        for (ptr = start; ptr < end; ptr++) {
                if (*ptr < text)
                        continue;
                if (*ptr > text_end)
                        continue;
                /* turn DS segment override prefix into lock prefix */
                text_poke(*ptr, ((unsigned char []){0xf0}), 1);
        }
        mutex_unlock(&text_mutex);
}

static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
        u8 **ptr;

        if (noreplace_smp)
                return;

        mutex_lock(&text_mutex);
        for (ptr = start; ptr < end; ptr++) {
                if (*ptr < text)
                        continue;
                if (*ptr > text_end)
                        continue;
                /* turn lock prefix into DS segment override prefix */
                text_poke(*ptr, ((unsigned char []){0x3E}), 1);
        }
        mutex_unlock(&text_mutex);
}

struct smp_alt_module {
        /* the module owning these lock prefixes (NULL for core kernel) */
        struct module   *mod;
        char            *name;

        /* ptrs to lock prefixes */
        u8              **locks;
        u8              **locks_end;

        /* .text segment, needed to avoid patching init code */
        u8              *text;
        u8              *text_end;

        struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
static int smp_mode = 1;        /* protected by smp_alt */

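/*
 * Register a lock-prefix list so it can be repatched when the system
 * switches between UP and SMP mode.  With smp_alt_once the kernel
 * patches exactly once at boot, so nothing needs to be tracked; we
 * only unlock immediately if the CPU is already flagged UP-only.
 */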
void __init_or_module alternatives_smp_module_add(struct module *mod,
                                                  char *name,
                                                  void *locks, void *locks_end,
                                                  void *text,  void *text_end)
{
        struct smp_alt_module *smp;

        if (noreplace_smp)
                return;

        if (smp_alt_once) {
                if (boot_cpu_has(X86_FEATURE_UP))
                        alternatives_smp_unlock(locks, locks_end,
                                                text, text_end);
                return;
        }

        smp = kzalloc(sizeof(*smp), GFP_KERNEL);
        if (!smp)
                return; /* we'll run the (safe but slow) SMP code then ... */

        smp->mod        = mod;
        smp->name       = name;
        smp->locks      = locks;
        smp->locks_end  = locks_end;
        smp->text       = text;
        smp->text_end   = text_end;
        DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
                __func__, smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);

        mutex_lock(&smp_alt);
        list_add_tail(&smp->next, &smp_alt_modules);
        if (boot_cpu_has(X86_FEATURE_UP))
                alternatives_smp_unlock(smp->locks, smp->locks_end,
                                        smp->text, smp->text_end);
        mutex_unlock(&smp_alt);
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
        struct smp_alt_module *item;

        if (smp_alt_once || noreplace_smp)
                return;

        mutex_lock(&smp_alt);
        list_for_each_entry(item, &smp_alt_modules, next) {
                if (mod != item->mod)
                        continue;
                list_del(&item->next);
                mutex_unlock(&smp_alt);
                DPRINTK("%s: %s\n", __func__, item->name);
                kfree(item);
                return;
        }
        mutex_unlock(&smp_alt);
}

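/*
 * Repatch every registered lock-prefix list when the system goes from
 * one online CPU to several, or back again (smp == 1 restores LOCK
 * prefixes, smp == 0 replaces them with DS overrides).
 */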
void alternatives_smp_switch(int smp)
{
        struct smp_alt_module *mod;

#ifdef CONFIG_LOCKDEP
        /*
         * Older binutils section handling bug prevented
         * alternatives-replacement from working reliably.
         *
         * If this still occurs then you should see a hang
         * or crash shortly after this line:
         */
        printk(KERN_INFO "lockdep: fixing up alternatives.\n");
#endif

        if (noreplace_smp || smp_alt_once)
                return;
        BUG_ON(!smp && (num_online_cpus() > 1));

        mutex_lock(&smp_alt);

        /*
         * Avoid unnecessary switches because it forces JIT based VMs to
         * throw away all cached translations, which can be quite costly.
         */
        if (smp == smp_mode) {
                /* nothing */
        } else if (smp) {
                printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_lock(mod->locks, mod->locks_end,
                                              mod->text, mod->text_end);
        } else {
                printk(KERN_INFO "SMP alternatives: switching to UP code\n");
                set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_unlock(mod->locks, mod->locks_end,
                                                mod->text, mod->text_end);
        }
        smp_mode = smp;
        mutex_unlock(&smp_alt);
}

#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
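/*
 * Each paravirt_patch_site describes one indirect pv-op call site.
 * pv_init_ops.patch() may rewrite it in place (e.g. into a direct call
 * or an inlined body for the native case) and returns how many bytes
 * it used; the remainder of the site is padded with NOPs.
 */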
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
                                     struct paravirt_patch_site *end)
{
        struct paravirt_patch_site *p;
        char insnbuf[MAX_PATCH_LEN];

        if (noreplace_paravirt)
                return;

        for (p = start; p < end; p++) {
                unsigned int used;

                BUG_ON(p->len > MAX_PATCH_LEN);
                /* prep the buffer with the original instructions */
                memcpy(insnbuf, p->instr, p->len);
                used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
                                         (unsigned long)p->instr, p->len);

                BUG_ON(used > p->len);

                /* Pad the rest with nops */
                add_nops(insnbuf + used, p->len - used);
                text_poke_early(p->instr, insnbuf, p->len);
        }
}
extern struct paravirt_patch_site __start_parainstructions[],
        __stop_parainstructions[];
#endif  /* CONFIG_PARAVIRT */

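/*
 * Boot-time entry point: apply all feature alternatives, sort out the
 * UP/SMP lock-prefix mode and apply the paravirt patches, all before
 * any other CPU is running.
 */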
void __init alternative_instructions(void)
{
        /* The patching is not fully atomic, so try to avoid local
           interruptions that might execute the code being patched.
           Other CPUs are not running. */
        stop_nmi();

        /*
         * Don't stop machine check exceptions while patching.
         * MCEs only happen when something got corrupted and in this
         * case we must do something about the corruption.
         * Ignoring it is worse than an unlikely patching race.
         * Also machine checks tend to be broadcast and if one CPU
         * goes into machine check the others follow quickly, so we don't
         * expect a machine check to cause undue problems during
         * code patching.
         */

        apply_alternatives(__alt_instructions, __alt_instructions_end);

        /* switch to patch-once-at-boottime-only mode and free the
         * tables in case we know the number of CPUs will never ever
         * change */
#ifdef CONFIG_HOTPLUG_CPU
        if (num_possible_cpus() < 2)
                smp_alt_once = 1;
#endif

#ifdef CONFIG_SMP
        if (smp_alt_once) {
                if (num_possible_cpus() == 1) {
                        printk(KERN_INFO "SMP alternatives: switching to UP code\n");
                        set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                        set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);

                        alternatives_smp_unlock(__smp_locks, __smp_locks_end,
                                                _text, _etext);
                }
        } else {
                alternatives_smp_module_add(NULL, "core kernel",
                                            __smp_locks, __smp_locks_end,
                                            _text, _etext);

                /* Only switch to UP mode if we don't immediately boot others */
                if (num_present_cpus() == 1 || setup_max_cpus <= 1)
                        alternatives_smp_switch(0);
        }
#endif
        apply_paravirt(__parainstructions, __parainstructions_end);

        if (smp_alt_once)
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);

        restart_nmi();
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in
 * parallel.  Also no thread must be currently preempted in the middle
 * of these instructions.  And on the local CPU you need to be protected
 * against NMI or MCE handlers seeing an inconsistent instruction while
 * you patch.
 */
static void *__init_or_module text_poke_early(void *addr, const void *opcode,
                                              size_t len)
{
        unsigned long flags;
        local_irq_save(flags);
        memcpy(addr, opcode, len);
        sync_core();
        local_irq_restore(flags);
        /* Could also do a CLFLUSH here to speed up CPU recovery; but
           that causes hangs on some VIA CPUs. */
        return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early
 * patching.  That means the length must be writable atomically and the
 * address must be aligned in a way that permits an atomic write.  It
 * also makes sure the poke fits within a single page.
 *
 * Note: Must be called under text_mutex.
 */
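/*
 * The write is done through a temporary fixmap mapping (FIX_TEXT_POKE0
 * and, for a poke that straddles a page boundary, FIX_TEXT_POKE1), so
 * read-only kernel text can be patched without touching its permanent
 * mapping.  The trailing BUG_ON loop verifies the bytes actually
 * landed.
 */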
void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
{
        unsigned long flags;
        char *vaddr;
        struct page *pages[2];
        int i;

        if (!core_kernel_text((unsigned long)addr)) {
                pages[0] = vmalloc_to_page(addr);
                pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
        } else {
                pages[0] = virt_to_page(addr);
                WARN_ON(!PageReserved(pages[0]));
                pages[1] = virt_to_page(addr + PAGE_SIZE);
        }
        BUG_ON(!pages[0]);
        local_irq_save(flags);
        set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
        if (pages[1])
                set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
        vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
        memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
        clear_fixmap(FIX_TEXT_POKE0);
        if (pages[1])
                clear_fixmap(FIX_TEXT_POKE1);
        local_flush_tlb();
        sync_core();
        /* Could also do a CLFLUSH here to speed up CPU recovery; but
           that causes hangs on some VIA CPUs. */
        for (i = 0; i < len; i++)
                BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
        local_irq_restore(flags);
        return addr;
}