linux/arch/x86/kernel/alternative.c
   1#define pr_fmt(fmt) "SMP alternatives: " fmt
   2
   3#include <linux/module.h>
   4#include <linux/sched.h>
   5#include <linux/mutex.h>
   6#include <linux/list.h>
   7#include <linux/stringify.h>
   8#include <linux/mm.h>
   9#include <linux/vmalloc.h>
  10#include <linux/memory.h>
  11#include <linux/stop_machine.h>
  12#include <linux/slab.h>
  13#include <linux/kdebug.h>
  14#include <linux/kprobes.h>
  15#include <linux/bsearch.h>
  16#include <linux/sync_core.h>
  17#include <asm/text-patching.h>
  18#include <asm/alternative.h>
  19#include <asm/sections.h>
  20#include <asm/pgtable.h>
  21#include <asm/mce.h>
  22#include <asm/nmi.h>
  23#include <asm/cacheflush.h>
  24#include <asm/tlbflush.h>
  25#include <asm/insn.h>
  26#include <asm/io.h>
  27#include <asm/fixmap.h>
  28
  29int __read_mostly alternatives_patched;
  30
  31EXPORT_SYMBOL_GPL(alternatives_patched);
  32
  33#define MAX_PATCH_LEN (255-1)
  34
  35static int __initdata_or_module debug_alternative;
  36
  37static int __init debug_alt(char *str)
  38{
  39        debug_alternative = 1;
  40        return 1;
  41}
  42__setup("debug-alternative", debug_alt);
  43
  44static int noreplace_smp;
  45
  46static int __init setup_noreplace_smp(char *str)
  47{
  48        noreplace_smp = 1;
  49        return 1;
  50}
  51__setup("noreplace-smp", setup_noreplace_smp);
  52
  53#define DPRINTK(fmt, args...)                                           \
  54do {                                                                    \
  55        if (debug_alternative)                                          \
  56                printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);   \
  57} while (0)
  58
  59#define DUMP_BYTES(buf, len, fmt, args...)                              \
  60do {                                                                    \
  61        if (unlikely(debug_alternative)) {                              \
  62                int j;                                                  \
  63                                                                        \
  64                if (!(len))                                             \
  65                        break;                                          \
  66                                                                        \
  67                printk(KERN_DEBUG fmt, ##args);                         \
  68                for (j = 0; j < (len) - 1; j++)                         \
  69                        printk(KERN_CONT "%02hhx ", buf[j]);            \
  70                printk(KERN_CONT "%02hhx\n", buf[j]);                   \
  71        }                                                               \
  72} while (0)
  73
  74/*
  75 * Each GENERIC_NOPX is X bytes long and is defined as an array of
  76 * bytes that make up that nop. To get from one nop to the next, we
  77 * index into the array by an offset equal to the sum of the sizes of
  78 * all nops preceding the one we are after.
  79 *
  80 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
  81 * nice symmetry of sizes of the previous nops.
  82 */
  83#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
  84static const unsigned char intelnops[] =
  85{
  86        GENERIC_NOP1,
  87        GENERIC_NOP2,
  88        GENERIC_NOP3,
  89        GENERIC_NOP4,
  90        GENERIC_NOP5,
  91        GENERIC_NOP6,
  92        GENERIC_NOP7,
  93        GENERIC_NOP8,
  94        GENERIC_NOP5_ATOMIC
  95};
  96static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
  97{
  98        NULL,
  99        intelnops,
 100        intelnops + 1,
 101        intelnops + 1 + 2,
 102        intelnops + 1 + 2 + 3,
 103        intelnops + 1 + 2 + 3 + 4,
 104        intelnops + 1 + 2 + 3 + 4 + 5,
 105        intelnops + 1 + 2 + 3 + 4 + 5 + 6,
 106        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 107        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 108};
 109#endif
 110
 111#ifdef K8_NOP1
 112static const unsigned char k8nops[] =
 113{
 114        K8_NOP1,
 115        K8_NOP2,
 116        K8_NOP3,
 117        K8_NOP4,
 118        K8_NOP5,
 119        K8_NOP6,
 120        K8_NOP7,
 121        K8_NOP8,
 122        K8_NOP5_ATOMIC
 123};
 124static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
 125{
 126        NULL,
 127        k8nops,
 128        k8nops + 1,
 129        k8nops + 1 + 2,
 130        k8nops + 1 + 2 + 3,
 131        k8nops + 1 + 2 + 3 + 4,
 132        k8nops + 1 + 2 + 3 + 4 + 5,
 133        k8nops + 1 + 2 + 3 + 4 + 5 + 6,
 134        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 135        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 136};
 137#endif
 138
 139#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
 140static const unsigned char k7nops[] =
 141{
 142        K7_NOP1,
 143        K7_NOP2,
 144        K7_NOP3,
 145        K7_NOP4,
 146        K7_NOP5,
 147        K7_NOP6,
 148        K7_NOP7,
 149        K7_NOP8,
 150        K7_NOP5_ATOMIC
 151};
 152static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
 153{
 154        NULL,
 155        k7nops,
 156        k7nops + 1,
 157        k7nops + 1 + 2,
 158        k7nops + 1 + 2 + 3,
 159        k7nops + 1 + 2 + 3 + 4,
 160        k7nops + 1 + 2 + 3 + 4 + 5,
 161        k7nops + 1 + 2 + 3 + 4 + 5 + 6,
 162        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 163        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 164};
 165#endif
 166
 167#ifdef P6_NOP1
 168static const unsigned char p6nops[] =
 169{
 170        P6_NOP1,
 171        P6_NOP2,
 172        P6_NOP3,
 173        P6_NOP4,
 174        P6_NOP5,
 175        P6_NOP6,
 176        P6_NOP7,
 177        P6_NOP8,
 178        P6_NOP5_ATOMIC
 179};
 180static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
 181{
 182        NULL,
 183        p6nops,
 184        p6nops + 1,
 185        p6nops + 1 + 2,
 186        p6nops + 1 + 2 + 3,
 187        p6nops + 1 + 2 + 3 + 4,
 188        p6nops + 1 + 2 + 3 + 4 + 5,
 189        p6nops + 1 + 2 + 3 + 4 + 5 + 6,
 190        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 191        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 192};
 193#endif
 194
 195/* Initialize these to a safe default */
 196#ifdef CONFIG_X86_64
 197const unsigned char * const *ideal_nops = p6_nops;
 198#else
 199const unsigned char * const *ideal_nops = intel_nops;
 200#endif
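/*
 * Illustrative note (not part of the original file): ideal_nops is indexed
 * by NOP length, so e.g. with p6_nops selected, ideal_nops[5] points at the
 * 5-byte "0f 1f 44 00 00" NOP (nopl 0x0(%rax,%rax,1)).
 */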
 201
 202void __init arch_init_ideal_nops(void)
 203{
 204        switch (boot_cpu_data.x86_vendor) {
 205        case X86_VENDOR_INTEL:
 206                /*
 207                 * Due to a decoder implementation quirk, some
 208                 * specific Intel CPUs actually perform better with
 209                 * the "k8_nops" than with the SDM-recommended NOPs.
 210                 */
 211                if (boot_cpu_data.x86 == 6 &&
 212                    boot_cpu_data.x86_model >= 0x0f &&
 213                    boot_cpu_data.x86_model != 0x1c &&
 214                    boot_cpu_data.x86_model != 0x26 &&
 215                    boot_cpu_data.x86_model != 0x27 &&
 216                    boot_cpu_data.x86_model < 0x30) {
 217                        ideal_nops = k8_nops;
 218                } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
 219                           ideal_nops = p6_nops;
 220                } else {
 221#ifdef CONFIG_X86_64
 222                        ideal_nops = k8_nops;
 223#else
 224                        ideal_nops = intel_nops;
 225#endif
 226                }
 227                break;
 228
 229        case X86_VENDOR_AMD:
 230                if (boot_cpu_data.x86 > 0xf) {
 231                        ideal_nops = p6_nops;
 232                        return;
 233                }
 234
 235                /* fall through */
 236
 237        default:
 238#ifdef CONFIG_X86_64
 239                ideal_nops = k8_nops;
 240#else
 241                if (boot_cpu_has(X86_FEATURE_K8))
 242                        ideal_nops = k8_nops;
 243                else if (boot_cpu_has(X86_FEATURE_K7))
 244                        ideal_nops = k7_nops;
 245                else
 246                        ideal_nops = intel_nops;
 247#endif
 248        }
 249}
 250
 251/* Use this to add nops to a buffer, then text_poke the whole buffer. */
 252static void __init_or_module add_nops(void *insns, unsigned int len)
 253{
 254        while (len > 0) {
 255                unsigned int noplen = len;
 256                if (noplen > ASM_NOP_MAX)
 257                        noplen = ASM_NOP_MAX;
 258                memcpy(insns, ideal_nops[noplen], noplen);
 259                insns += noplen;
 260                len -= noplen;
 261        }
 262}
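/*
 * Example (illustrative, not part of the original file): with ASM_NOP_MAX
 * == 8, add_nops(buf, 11) copies ideal_nops[8] followed by ideal_nops[3],
 * i.e. one 8-byte NOP and one 3-byte NOP, to cover the 11-byte range.
 */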
 263
 264extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 265extern s32 __smp_locks[], __smp_locks_end[];
 266void text_poke_early(void *addr, const void *opcode, size_t len);
 267
 268/*
 269 * Are we looking at a near JMP with a 1- or 4-byte displacement?
 270 */
 271static inline bool is_jmp(const u8 opcode)
 272{
 273        return opcode == 0xeb || opcode == 0xe9;
 274}
 275
 276static void __init_or_module
 277recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
 278{
 279        u8 *next_rip, *tgt_rip;
 280        s32 n_dspl, o_dspl;
 281        int repl_len;
 282
 283        if (a->replacementlen != 5)
 284                return;
 285
 286        o_dspl = *(s32 *)(insnbuf + 1);
 287
 288        /* next_rip of the replacement JMP */
 289        next_rip = repl_insn + a->replacementlen;
 290        /* target rip of the replacement JMP */
 291        tgt_rip  = next_rip + o_dspl;
 292        n_dspl = tgt_rip - orig_insn;
 293
 294        DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
 295
 296        if (tgt_rip - orig_insn >= 0) {
 297                if (n_dspl - 2 <= 127)
 298                        goto two_byte_jmp;
 299                else
 300                        goto five_byte_jmp;
 301        /* negative offset */
 302        } else {
 303                if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
 304                        goto two_byte_jmp;
 305                else
 306                        goto five_byte_jmp;
 307        }
 308
 309two_byte_jmp:
 310        n_dspl -= 2;
 311
 312        insnbuf[0] = 0xeb;
 313        insnbuf[1] = (s8)n_dspl;
 314        add_nops(insnbuf + 2, 3);
 315
 316        repl_len = 2;
 317        goto done;
 318
 319five_byte_jmp:
 320        n_dspl -= 5;
 321
 322        insnbuf[0] = 0xe9;
 323        *(s32 *)&insnbuf[1] = n_dspl;
 324
 325        repl_len = 5;
 326
 327done:
 328
 329        DPRINTK("final displ: 0x%08x, JMP 0x%lx",
 330                n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
 331}
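/*
 * Worked example (illustrative, not part of the original file): if the
 * replacement JMP ends up targeting a byte 10 bytes past orig_insn, then
 * n_dspl == 10, which fits in a signed byte, so the buffer becomes the
 * two-byte form "eb 08" (displacement measured from the end of the 2-byte
 * JMP) followed by a 3-byte NOP pad.
 */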
 332
 333/*
 334 * "noinline" to cause control flow change and thus invalidate I$ and
 335 * cause refetch after modification.
 336 */
 337static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
 338{
 339        unsigned long flags;
 340        int i;
 341
 342        for (i = 0; i < a->padlen; i++) {
 343                if (instr[i] != 0x90)
 344                        return;
 345        }
 346
 347        local_irq_save(flags);
 348        add_nops(instr + (a->instrlen - a->padlen), a->padlen);
 349        local_irq_restore(flags);
 350
 351        DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
 352                   instr, a->instrlen - a->padlen, a->instrlen);
 353}
 354
 355/*
 356 * Replace instructions with better alternatives for this CPU type. This runs
 357 * before SMP is initialized to avoid SMP problems with self modifying code.
 358 * This implies that asymmetric systems where APs have fewer capabilities than
 359 * the boot processor are not handled. Tough. Make sure you disable such
 360 * features by hand.
 361 *
 362 * Marked "noinline" to cause a control flow change and thus make the
 363 * instruction cache refetch the changed I$ lines.
 364 */
 365void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 366                                                  struct alt_instr *end)
 367{
 368        struct alt_instr *a;
 369        u8 *instr, *replacement;
 370        u8 insnbuf[MAX_PATCH_LEN];
 371
 372        DPRINTK("alt table %px, -> %px", start, end);
 373        /*
 374         * The scan order should be from start to end. An alternative scanned
 375         * later can overwrite one that was scanned earlier.
 376         * Some kernel functions (e.g. memcpy, memset, etc) use this order to
 377         * patch code.
 378         *
 379         * So be careful if you want to change the scan order to any other
 380         * order.
 381         */
 382        for (a = start; a < end; a++) {
 383                int insnbuf_sz = 0;
 384
 385                instr = (u8 *)&a->instr_offset + a->instr_offset;
 386                replacement = (u8 *)&a->repl_offset + a->repl_offset;
 387                BUG_ON(a->instrlen > sizeof(insnbuf));
 388                BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
 389                if (!boot_cpu_has(a->cpuid)) {
 390                        if (a->padlen > 1)
 391                                optimize_nops(a, instr);
 392
 393                        continue;
 394                }
 395
 396                DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
 397                        a->cpuid >> 5,
 398                        a->cpuid & 0x1f,
 399                        instr, a->instrlen,
 400                        replacement, a->replacementlen, a->padlen);
 401
 402                DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
 403                DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
 404
 405                memcpy(insnbuf, replacement, a->replacementlen);
 406                insnbuf_sz = a->replacementlen;
 407
 408                /*
 409                 * 0xe8 is a relative CALL; fix the offset.
 410                 *
 411                 * Instruction length is checked before the opcode to avoid
 412                 * accessing uninitialized bytes for zero-length replacements.
 413                 */
 414                if (a->replacementlen == 5 && *insnbuf == 0xe8) {
 415                        *(s32 *)(insnbuf + 1) += replacement - instr;
 416                        DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
 417                                *(s32 *)(insnbuf + 1),
 418                                (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
 419                }
 420
 421                if (a->replacementlen && is_jmp(replacement[0]))
 422                        recompute_jump(a, instr, replacement, insnbuf);
 423
 424                if (a->instrlen > a->replacementlen) {
 425                        add_nops(insnbuf + a->replacementlen,
 426                                 a->instrlen - a->replacementlen);
 427                        insnbuf_sz += a->instrlen - a->replacementlen;
 428                }
 429                DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
 430
 431                text_poke_early(instr, insnbuf, insnbuf_sz);
 432        }
 433}
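/*
 * Illustrative note (not part of the original file): the alt_instr entries
 * walked above are normally generated by the ALTERNATIVE() macro from
 * <asm/alternative.h>, used roughly like this (old_fn, new_fn and
 * X86_FEATURE_FOO are placeholders):
 *
 *	asm volatile(ALTERNATIVE("call old_fn", "call new_fn",
 *				 X86_FEATURE_FOO));
 *
 * The original instruction is emitted inline, while the replacement bytes
 * and a struct alt_instr describing them land in sections consumed by
 * apply_alternatives() at boot.
 */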
 434
 435#ifdef CONFIG_SMP
 436static void alternatives_smp_lock(const s32 *start, const s32 *end,
 437                                  u8 *text, u8 *text_end)
 438{
 439        const s32 *poff;
 440
 441        for (poff = start; poff < end; poff++) {
 442                u8 *ptr = (u8 *)poff + *poff;
 443
 444                if (!*poff || ptr < text || ptr >= text_end)
 445                        continue;
 446                /* turn DS segment override prefix into lock prefix */
 447                if (*ptr == 0x3e)
 448                        text_poke(ptr, ((unsigned char []){0xf0}), 1);
 449        }
 450}
 451
 452static void alternatives_smp_unlock(const s32 *start, const s32 *end,
 453                                    u8 *text, u8 *text_end)
 454{
 455        const s32 *poff;
 456
 457        for (poff = start; poff < end; poff++) {
 458                u8 *ptr = (u8 *)poff + *poff;
 459
 460                if (!*poff || ptr < text || ptr >= text_end)
 461                        continue;
 462                /* turn lock prefix into DS segment override prefix */
 463                if (*ptr == 0xf0)
 464                        text_poke(ptr, ((unsigned char []){0x3E}), 1);
 465        }
 466}
 467
 468struct smp_alt_module {
 469        /* what is this ??? */
 470        struct module   *mod;
 471        char            *name;
 472
 473        /* ptrs to lock prefixes */
 474        const s32       *locks;
 475        const s32       *locks_end;
 476
 477        /* .text segment, needed to avoid patching init code ;) */
 478        u8              *text;
 479        u8              *text_end;
 480
 481        struct list_head next;
 482};
 483static LIST_HEAD(smp_alt_modules);
 484static bool uniproc_patched = false;    /* protected by text_mutex */
 485
 486void __init_or_module alternatives_smp_module_add(struct module *mod,
 487                                                  char *name,
 488                                                  void *locks, void *locks_end,
 489                                                  void *text,  void *text_end)
 490{
 491        struct smp_alt_module *smp;
 492
 493        mutex_lock(&text_mutex);
 494        if (!uniproc_patched)
 495                goto unlock;
 496
 497        if (num_possible_cpus() == 1)
 498                /* Don't bother remembering, we'll never have to undo it. */
 499                goto smp_unlock;
 500
 501        smp = kzalloc(sizeof(*smp), GFP_KERNEL);
 502        if (!smp)
 503                /* we'll run the (safe but slow) SMP code then ... */
 504                goto unlock;
 505
 506        smp->mod        = mod;
 507        smp->name       = name;
 508        smp->locks      = locks;
 509        smp->locks_end  = locks_end;
 510        smp->text       = text;
 511        smp->text_end   = text_end;
 512        DPRINTK("locks %p -> %p, text %p -> %p, name %s",
 513                smp->locks, smp->locks_end,
 514                smp->text, smp->text_end, smp->name);
 515
 516        list_add_tail(&smp->next, &smp_alt_modules);
 517smp_unlock:
 518        alternatives_smp_unlock(locks, locks_end, text, text_end);
 519unlock:
 520        mutex_unlock(&text_mutex);
 521}
 522
 523void __init_or_module alternatives_smp_module_del(struct module *mod)
 524{
 525        struct smp_alt_module *item;
 526
 527        mutex_lock(&text_mutex);
 528        list_for_each_entry(item, &smp_alt_modules, next) {
 529                if (mod != item->mod)
 530                        continue;
 531                list_del(&item->next);
 532                kfree(item);
 533                break;
 534        }
 535        mutex_unlock(&text_mutex);
 536}
 537
 538void alternatives_enable_smp(void)
 539{
 540        struct smp_alt_module *mod;
 541
 542        /* Why bother if there are no other CPUs? */
 543        BUG_ON(num_possible_cpus() == 1);
 544
 545        mutex_lock(&text_mutex);
 546
 547        if (uniproc_patched) {
 548                pr_info("switching to SMP code\n");
 549                BUG_ON(num_online_cpus() != 1);
 550                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
 551                clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
 552                list_for_each_entry(mod, &smp_alt_modules, next)
 553                        alternatives_smp_lock(mod->locks, mod->locks_end,
 554                                              mod->text, mod->text_end);
 555                uniproc_patched = false;
 556        }
 557        mutex_unlock(&text_mutex);
 558}
 559
 560/*
 561 * Return 1 if the address range is reserved for SMP-alternatives.
 562 * Must hold text_mutex.
 563 */
 564int alternatives_text_reserved(void *start, void *end)
 565{
 566        struct smp_alt_module *mod;
 567        const s32 *poff;
 568        u8 *text_start = start;
 569        u8 *text_end = end;
 570
 571        lockdep_assert_held(&text_mutex);
 572
 573        list_for_each_entry(mod, &smp_alt_modules, next) {
 574                if (mod->text > text_end || mod->text_end < text_start)
 575                        continue;
 576                for (poff = mod->locks; poff < mod->locks_end; poff++) {
 577                        const u8 *ptr = (const u8 *)poff + *poff;
 578
 579                        if (text_start <= ptr && text_end > ptr)
 580                                return 1;
 581                }
 582        }
 583
 584        return 0;
 585}
 586#endif /* CONFIG_SMP */
 587
 588#ifdef CONFIG_PARAVIRT
 589void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
 590                                     struct paravirt_patch_site *end)
 591{
 592        struct paravirt_patch_site *p;
 593        char insnbuf[MAX_PATCH_LEN];
 594
 595        for (p = start; p < end; p++) {
 596                unsigned int used;
 597
 598                BUG_ON(p->len > MAX_PATCH_LEN);
 599                /* prep the buffer with the original instructions */
 600                memcpy(insnbuf, p->instr, p->len);
 601                used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
 602                                         (unsigned long)p->instr, p->len);
 603
 604                BUG_ON(used > p->len);
 605
 606                /* Pad the rest with nops */
 607                add_nops(insnbuf + used, p->len - used);
 608                text_poke_early(p->instr, insnbuf, p->len);
 609        }
 610}
 611extern struct paravirt_patch_site __start_parainstructions[],
 612        __stop_parainstructions[];
 613#endif  /* CONFIG_PARAVIRT */
 614
 615void __init alternative_instructions(void)
 616{
 617        /* The patching is not fully atomic, so try to avoid local interruptions
 618           that might execute the code that is being patched.
 619           Other CPUs are not running. */
 620        stop_nmi();
 621
 622        /*
 623         * Don't stop machine check exceptions while patching.
 624         * MCEs only happen when something got corrupted and in this
 625         * case we must do something about the corruption.
 626         * Ignoring it is worse than an unlikely patching race.
 627         * Also machine checks tend to be broadcast and if one CPU
 628         * goes into machine check the others follow quickly, so we don't
 629         * expect a machine check to cause undue problems during code
 630         * patching.
 631         */
 632
 633        apply_alternatives(__alt_instructions, __alt_instructions_end);
 634
 635#ifdef CONFIG_SMP
 636        /* Patch to UP if other CPUs are not imminent. */
 637        if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
 638                uniproc_patched = true;
 639                alternatives_smp_module_add(NULL, "core kernel",
 640                                            __smp_locks, __smp_locks_end,
 641                                            _text, _etext);
 642        }
 643
 644        if (!uniproc_patched || num_possible_cpus() == 1)
 645                free_init_pages("SMP alternatives",
 646                                (unsigned long)__smp_locks,
 647                                (unsigned long)__smp_locks_end);
 648#endif
 649
 650        apply_paravirt(__parainstructions, __parainstructions_end);
 651
 652        restart_nmi();
 653        alternatives_patched = 1;
 654}
 655
 656/**
 657 * text_poke_early - Update instructions on a live kernel at boot time
 658 * @addr: address to modify
 659 * @opcode: source of the copy
 660 * @len: length to copy
 661 *
 662 * When you use this code to patch more than one byte of an instruction
 663 * you need to make sure that other CPUs cannot execute this code in parallel.
 664 * No thread may be preempted in the middle of these instructions either.
 665 * On the local CPU you also need to be protected against NMI or MCE
 666 * handlers seeing an inconsistent instruction while you patch.
 667 */
 668void __init_or_module text_poke_early(void *addr, const void *opcode,
 669                                      size_t len)
 670{
 671        unsigned long flags;
 672
 673        if (boot_cpu_has(X86_FEATURE_NX) &&
 674            is_module_text_address((unsigned long)addr)) {
 675                /*
 676                 * Module text is initially marked as non-executable, so the
 677                 * code cannot be running and speculative code-fetches are
 678                 * prevented. Just change the code.
 679                 */
 680                memcpy(addr, opcode, len);
 681        } else {
 682                local_irq_save(flags);
 683                memcpy(addr, opcode, len);
 684                local_irq_restore(flags);
 685                sync_core();
 686
 687                /*
 688                 * Could also do a CLFLUSH here to speed up CPU recovery; but
 689                 * that causes hangs on some VIA CPUs.
 690                 */
 691        }
 692}
 693
 694__ro_after_init struct mm_struct *poking_mm;
 695__ro_after_init unsigned long poking_addr;
 696
 697static void *__text_poke(void *addr, const void *opcode, size_t len)
 698{
 699        unsigned long flags;
 700        char *vaddr;
 701        struct page *pages[2];
 702        int i;
 703
 704        /*
 705         * While the boot memory allocator is running we cannot use struct
 706         * pages as they are not yet initialized.
 707         */
 708        BUG_ON(!after_bootmem);
 709
 710        if (!core_kernel_text((unsigned long)addr)) {
 711                pages[0] = vmalloc_to_page(addr);
 712                pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
 713        } else {
 714                pages[0] = virt_to_page(addr);
 715                WARN_ON(!PageReserved(pages[0]));
 716                pages[1] = virt_to_page(addr + PAGE_SIZE);
 717        }
 718        BUG_ON(!pages[0]);
 719        local_irq_save(flags);
 720        set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
 721        if (pages[1])
 722                set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
 723        vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
 724        memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
 725        clear_fixmap(FIX_TEXT_POKE0);
 726        if (pages[1])
 727                clear_fixmap(FIX_TEXT_POKE1);
 728        local_flush_tlb();
 729        sync_core();
 730        /* Could also do a CLFLUSH here to speed up CPU recovery; but
 731           that causes hangs on some VIA CPUs. */
 732        for (i = 0; i < len; i++)
 733                BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
 734        local_irq_restore(flags);
 735        return addr;
 736}
 737
 738/**
 739 * text_poke - Update instructions on a live kernel
 740 * @addr: address to modify
 741 * @opcode: source of the copy
 742 * @len: length to copy
 743 *
 744 * Only atomic text poke/set should be allowed when not doing early patching.
 745 * It means the size must be writable atomically and the address must be aligned
 746 * in a way that permits an atomic write. It also makes sure we fit on a single
 747 * page.
 748 */
 749void *text_poke(void *addr, const void *opcode, size_t len)
 750{
 751        lockdep_assert_held(&text_mutex);
 752
 753        return __text_poke(addr, opcode, len);
 754}
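/*
 * Usage sketch (illustrative, not part of the original file): callers are
 * expected to serialize against other patchers via text_mutex, e.g.
 *
 *	mutex_lock(&text_mutex);
 *	text_poke(addr, new_insn, len);
 *	mutex_unlock(&text_mutex);
 *
 * where addr, new_insn and len are the caller's patch site and bytes.
 */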
 755
 756/**
 757 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 758 * @addr: address to modify
 759 * @opcode: source of the copy
 760 * @len: length to copy
 761 *
 762 * Only atomic text poke/set should be allowed when not doing early patching.
 763 * It means the size must be writable atomically and the address must be aligned
 764 * in a way that permits an atomic write. It also makes sure we fit on a single
 765 * page.
 766 *
 767 * Context: should only be used by kgdb, which ensures no other core is running,
 768 *          despite the fact it does not hold the text_mutex.
 769 */
 770void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
 771{
 772        return __text_poke(addr, opcode, len);
 773}
 774
 775static void do_sync_core(void *info)
 776{
 777        sync_core();
 778}
 779
 780static struct bp_patching_desc {
 781        struct text_poke_loc *vec;
 782        int nr_entries;
 783} bp_patching;
 784
 785static int patch_cmp(const void *key, const void *elt)
 786{
 787        struct text_poke_loc *tp = (struct text_poke_loc *) elt;
 788
 789        if (key < tp->addr)
 790                return -1;
 791        if (key > tp->addr)
 792                return 1;
 793        return 0;
 794}
 795NOKPROBE_SYMBOL(patch_cmp);
 796
 797int poke_int3_handler(struct pt_regs *regs)
 798{
 799        struct text_poke_loc *tp;
 800        void *ip;
 801
 802        /*
 803         * Having observed our INT3 instruction, we now must observe
 804         * bp_patching.nr_entries.
 805         *
 806         *      nr_entries != 0                 INT3
 807         *      WMB                             RMB
 808         *      write INT3                      if (nr_entries)
 809         *
 810         * Idem for other elements in bp_patching.
 811         */
 812        smp_rmb();
 813
 814        if (likely(!bp_patching.nr_entries))
 815                return 0;
 816
 817        if (user_mode(regs))
 818                return 0;
 819
 820        /*
 821         * Discount the INT3. See text_poke_bp_batch().
 822         */
 823        ip = (void *) regs->ip - INT3_INSN_SIZE;
 824
 825        /*
 826         * Skip the binary search if there is a single member in the vector.
 827         */
 828        if (unlikely(bp_patching.nr_entries > 1)) {
 829                tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
 830                             sizeof(struct text_poke_loc),
 831                             patch_cmp);
 832                if (!tp)
 833                        return 0;
 834        } else {
 835                tp = bp_patching.vec;
 836                if (tp->addr != ip)
 837                        return 0;
 838        }
 839
 840        ip += tp->len;
 841
 842        switch (tp->opcode) {
 843        case INT3_INSN_OPCODE:
 844                /*
 845                 * Someone poked an explicit INT3, they'll want to handle it,
 846                 * do not consume.
 847                 */
 848                return 0;
 849
 850        case CALL_INSN_OPCODE:
 851                int3_emulate_call(regs, (long)ip + tp->rel32);
 852                break;
 853
 854        case JMP32_INSN_OPCODE:
 855        case JMP8_INSN_OPCODE:
 856                int3_emulate_jmp(regs, (long)ip + tp->rel32);
 857                break;
 858
 859        default:
 860                BUG();
 861        }
 862
 863        return 1;
 864}
 865NOKPROBE_SYMBOL(poke_int3_handler);
 866
 867/**
 868 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 869 * @tp:                 vector of instructions to patch
 870 * @nr_entries:         number of entries in the vector
 871 *
 872 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
 873 * We completely avoid stop_machine() here, and achieve the
 874 * synchronization using int3 breakpoint.
 875 *
 876 * The way it is done:
 877 *      - For each entry in the vector:
 878 *              - add an int3 trap to the address that will be patched
 879 *      - sync cores
 880 *      - For each entry in the vector:
 881 *              - update all but the first byte of the patched range
 882 *      - sync cores
 883 *      - For each entry in the vector:
 884 *              - replace the first byte (int3) with the first byte of the
 885 *                replacing opcode
 886 *      - sync cores
 887 */
 888void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 889{
 890        unsigned char int3 = INT3_INSN_OPCODE;
 891        unsigned int i;
 892        int do_sync;
 893
 894        lockdep_assert_held(&text_mutex);
 895
 896        bp_patching.vec = tp;
 897        bp_patching.nr_entries = nr_entries;
 898
 899        /*
 900         * Corresponding read barrier in int3 notifier for making sure the
 901         * nr_entries and handler are correctly ordered wrt. patching.
 902         */
 903        smp_wmb();
 904
 905        /*
 906         * First step: add an int3 trap to the address that will be patched.
 907         */
 908        for (i = 0; i < nr_entries; i++)
 909                text_poke(tp[i].addr, &int3, sizeof(int3));
 910
 911        on_each_cpu(do_sync_core, NULL, 1);
 912
 913        /*
 914         * Second step: update all but the first byte of the patched range.
 915         */
 916        for (do_sync = 0, i = 0; i < nr_entries; i++) {
 917                if (tp[i].len - sizeof(int3) > 0) {
 918                        text_poke((char *)tp[i].addr + sizeof(int3),
 919                                  (const char *)tp[i].text + sizeof(int3),
 920                                  tp[i].len - sizeof(int3));
 921                        do_sync++;
 922                }
 923        }
 924
 925        if (do_sync) {
 926                /*
 927                 * According to Intel, this core syncing is very likely
 928                 * not necessary and we'd be safe even without it. But
 929                 * better safe than sorry (plus there's not only Intel).
 930                 */
 931                on_each_cpu(do_sync_core, NULL, 1);
 932        }
 933
 934        /*
 935         * Third step: replace the first byte (int3) with the first byte of
 936         * the replacing opcode.
 937         */
 938        for (do_sync = 0, i = 0; i < nr_entries; i++) {
 939                if (tp[i].text[0] == INT3_INSN_OPCODE)
 940                        continue;
 941
 942                text_poke(tp[i].addr, tp[i].text, sizeof(int3));
 943                do_sync++;
 944        }
 945
 946        if (do_sync)
 947                on_each_cpu(do_sync_core, NULL, 1);
 948
 949        /*
 950         * sync_core() implies an smp_mb() and orders this store against
 951         * the writing of the new instruction.
 952         */
 953        bp_patching.nr_entries = 0;
 954        /*
 955         * This sync_core() call ensures that all INT3 handlers in progress
 956         * have finished. This allows poke_int3_handler() after this point to
 957         * avoid touching bp_patching.vec by checking nr_entries == 0.
 958         */
 959        on_each_cpu(do_sync_core, NULL, 1);
 960        bp_patching.vec = NULL;
 961}
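/*
 * Illustrative trace (not part of the original file) of one 5-byte entry
 * going from "old0 old1 old2 old3 old4" to "new0 new1 new2 new3 new4":
 *
 *	step 1:  cc  old1 old2 old3 old4     (INT3 guards the site)
 *	step 2:  cc  new1 new2 new3 new4     (tail bytes rewritten)
 *	step 3:  new0 new1 new2 new3 new4    (INT3 replaced last)
 *
 * with a do_sync_core() IPI round between the steps, and poke_int3_handler()
 * emulating the new instruction for any CPU that hits the INT3 meanwhile.
 */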
 962
 963void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
 964                        const void *opcode, size_t len, const void *emulate)
 965{
 966        struct insn insn;
 967
 968        if (!opcode)
 969                opcode = (void *)tp->text;
 970        else
 971                memcpy((void *)tp->text, opcode, len);
 972
 973        if (!emulate)
 974                emulate = opcode;
 975
 976        kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
 977        insn_get_length(&insn);
 978
 979        BUG_ON(!insn_complete(&insn));
 980        BUG_ON(len != insn.length);
 981
 982        tp->addr = addr;
 983        tp->len = len;
 984        tp->opcode = insn.opcode.bytes[0];
 985
 986        switch (tp->opcode) {
 987        case INT3_INSN_OPCODE:
 988                break;
 989
 990        case CALL_INSN_OPCODE:
 991        case JMP32_INSN_OPCODE:
 992        case JMP8_INSN_OPCODE:
 993                tp->rel32 = insn.immediate.value;
 994                break;
 995
 996        default: /* assume NOP */
 997                switch (len) {
 998                case 2: /* NOP2 -- emulate as JMP8+0 */
 999                        BUG_ON(memcmp(emulate, ideal_nops[len], len));
1000                        tp->opcode = JMP8_INSN_OPCODE;
1001                        tp->rel32 = 0;
1002                        break;
1003
1004                case 5: /* NOP5 -- emulate as JMP32+0 */
1005                        BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
1006                        tp->opcode = JMP32_INSN_OPCODE;
1007                        tp->rel32 = 0;
1008                        break;
1009
1010                default: /* unknown instruction */
1011                        BUG();
1012                }
1013                break;
1014        }
1015}
1016
1017/**
1018 * text_poke_bp() -- update instructions on live kernel on SMP
1019 * @addr:       address to patch
1020 * @opcode:     opcode of new instruction
1021 * @len:        length to copy
1022 * @emulate:    instruction to be emulated when the temporary breakpoint is hit
1023 *
1024 * Update a single instruction with the vector on the stack, avoiding
1025 * dynamically allocated memory. This function should be used when it is
1026 * not possible to allocate memory.
1027 */
1028void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
1029{
1030        struct text_poke_loc tp;
1031
1032        text_poke_loc_init(&tp, addr, opcode, len, emulate);
1033        text_poke_bp_batch(&tp, 1);
1034}
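/*
 * Usage sketch (illustrative, not part of the original file): patching a
 * 5-byte site into a direct CALL, passing emulate == NULL so the new
 * instruction itself is emulated while the temporary INT3 is in place.
 * "site" and "target" are hypothetical addresses.
 *
 *	u8 insn[5];
 *	s32 disp = (long)target - (long)site - 5;
 *
 *	insn[0] = CALL_INSN_OPCODE;
 *	memcpy(&insn[1], &disp, sizeof(disp));
 *
 *	mutex_lock(&text_mutex);
 *	text_poke_bp(site, insn, sizeof(insn), NULL);
 *	mutex_unlock(&text_mutex);
 */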
1035