linux/arch/x86/kernel/alternative.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2#define pr_fmt(fmt) "SMP alternatives: " fmt
   3
   4#include <linux/module.h>
   5#include <linux/sched.h>
   6#include <linux/mutex.h>
   7#include <linux/list.h>
   8#include <linux/stringify.h>
   9#include <linux/mm.h>
  10#include <linux/vmalloc.h>
  11#include <linux/memory.h>
  12#include <linux/stop_machine.h>
  13#include <linux/slab.h>
  14#include <linux/kdebug.h>
  15#include <linux/kprobes.h>
  16#include <linux/mmu_context.h>
  17#include <linux/bsearch.h>
  18#include <asm/text-patching.h>
  19#include <asm/alternative.h>
  20#include <asm/sections.h>
  21#include <asm/pgtable.h>
  22#include <asm/mce.h>
  23#include <asm/nmi.h>
  24#include <asm/cacheflush.h>
  25#include <asm/tlbflush.h>
  26#include <asm/insn.h>
  27#include <asm/io.h>
  28#include <asm/fixmap.h>
  29
  30int __read_mostly alternatives_patched;
  31
  32EXPORT_SYMBOL_GPL(alternatives_patched);
  33
  34#define MAX_PATCH_LEN (255-1)
  35
  36static int __initdata_or_module debug_alternative;
  37
  38static int __init debug_alt(char *str)
  39{
  40        debug_alternative = 1;
  41        return 1;
  42}
  43__setup("debug-alternative", debug_alt);
  44
  45static int noreplace_smp;
  46
  47static int __init setup_noreplace_smp(char *str)
  48{
  49        noreplace_smp = 1;
  50        return 1;
  51}
  52__setup("noreplace-smp", setup_noreplace_smp);
  53
  54#define DPRINTK(fmt, args...)                                           \
  55do {                                                                    \
  56        if (debug_alternative)                                          \
  57                printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);   \
  58} while (0)
  59
  60#define DUMP_BYTES(buf, len, fmt, args...)                              \
  61do {                                                                    \
  62        if (unlikely(debug_alternative)) {                              \
  63                int j;                                                  \
  64                                                                        \
  65                if (!(len))                                             \
  66                        break;                                          \
  67                                                                        \
  68                printk(KERN_DEBUG fmt, ##args);                         \
  69                for (j = 0; j < (len) - 1; j++)                         \
  70                        printk(KERN_CONT "%02hhx ", buf[j]);            \
  71                printk(KERN_CONT "%02hhx\n", buf[j]);                   \
  72        }                                                               \
  73} while (0)
  74
  75/*
  76 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
  77 * that correspond to that nop. Getting from one nop to the next, we
  78 * add to the array the offset that is equal to the sum of all sizes of
  79 * nops preceding the one we are after.
  80 *
  81 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
  82 * nice symmetry of sizes of the previous nops.
  83 */
  84#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
  85static const unsigned char intelnops[] =
  86{
  87        GENERIC_NOP1,
  88        GENERIC_NOP2,
  89        GENERIC_NOP3,
  90        GENERIC_NOP4,
  91        GENERIC_NOP5,
  92        GENERIC_NOP6,
  93        GENERIC_NOP7,
  94        GENERIC_NOP8,
  95        GENERIC_NOP5_ATOMIC
  96};
  97static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
  98{
  99        NULL,
 100        intelnops,
 101        intelnops + 1,
 102        intelnops + 1 + 2,
 103        intelnops + 1 + 2 + 3,
 104        intelnops + 1 + 2 + 3 + 4,
 105        intelnops + 1 + 2 + 3 + 4 + 5,
 106        intelnops + 1 + 2 + 3 + 4 + 5 + 6,
 107        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 108        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 109};
 110#endif
 111
 112#ifdef K8_NOP1
 113static const unsigned char k8nops[] =
 114{
 115        K8_NOP1,
 116        K8_NOP2,
 117        K8_NOP3,
 118        K8_NOP4,
 119        K8_NOP5,
 120        K8_NOP6,
 121        K8_NOP7,
 122        K8_NOP8,
 123        K8_NOP5_ATOMIC
 124};
 125static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
 126{
 127        NULL,
 128        k8nops,
 129        k8nops + 1,
 130        k8nops + 1 + 2,
 131        k8nops + 1 + 2 + 3,
 132        k8nops + 1 + 2 + 3 + 4,
 133        k8nops + 1 + 2 + 3 + 4 + 5,
 134        k8nops + 1 + 2 + 3 + 4 + 5 + 6,
 135        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 136        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 137};
 138#endif
 139
 140#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
 141static const unsigned char k7nops[] =
 142{
 143        K7_NOP1,
 144        K7_NOP2,
 145        K7_NOP3,
 146        K7_NOP4,
 147        K7_NOP5,
 148        K7_NOP6,
 149        K7_NOP7,
 150        K7_NOP8,
 151        K7_NOP5_ATOMIC
 152};
 153static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
 154{
 155        NULL,
 156        k7nops,
 157        k7nops + 1,
 158        k7nops + 1 + 2,
 159        k7nops + 1 + 2 + 3,
 160        k7nops + 1 + 2 + 3 + 4,
 161        k7nops + 1 + 2 + 3 + 4 + 5,
 162        k7nops + 1 + 2 + 3 + 4 + 5 + 6,
 163        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 164        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 165};
 166#endif
 167
 168#ifdef P6_NOP1
 169static const unsigned char p6nops[] =
 170{
 171        P6_NOP1,
 172        P6_NOP2,
 173        P6_NOP3,
 174        P6_NOP4,
 175        P6_NOP5,
 176        P6_NOP6,
 177        P6_NOP7,
 178        P6_NOP8,
 179        P6_NOP5_ATOMIC
 180};
 181static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
 182{
 183        NULL,
 184        p6nops,
 185        p6nops + 1,
 186        p6nops + 1 + 2,
 187        p6nops + 1 + 2 + 3,
 188        p6nops + 1 + 2 + 3 + 4,
 189        p6nops + 1 + 2 + 3 + 4 + 5,
 190        p6nops + 1 + 2 + 3 + 4 + 5 + 6,
 191        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
 192        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 193};
 194#endif
 195
 196/* Initialize these to a safe default */
 197#ifdef CONFIG_X86_64
 198const unsigned char * const *ideal_nops = p6_nops;
 199#else
 200const unsigned char * const *ideal_nops = intel_nops;
 201#endif
 202
 203void __init arch_init_ideal_nops(void)
 204{
 205        switch (boot_cpu_data.x86_vendor) {
 206        case X86_VENDOR_INTEL:
 207                /*
 208                 * Due to a decoder implementation quirk, some
 209                 * specific Intel CPUs actually perform better with
 210                 * the "k8_nops" than with the SDM-recommended NOPs.
 211                 */
 212                if (boot_cpu_data.x86 == 6 &&
 213                    boot_cpu_data.x86_model >= 0x0f &&
 214                    boot_cpu_data.x86_model != 0x1c &&
 215                    boot_cpu_data.x86_model != 0x26 &&
 216                    boot_cpu_data.x86_model != 0x27 &&
 217                    boot_cpu_data.x86_model < 0x30) {
 218                        ideal_nops = k8_nops;
 219                } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
 220                           ideal_nops = p6_nops;
 221                } else {
 222#ifdef CONFIG_X86_64
 223                        ideal_nops = k8_nops;
 224#else
 225                        ideal_nops = intel_nops;
 226#endif
 227                }
 228                break;
 229
 230        case X86_VENDOR_HYGON:
 231                ideal_nops = p6_nops;
 232                return;
 233
 234        case X86_VENDOR_AMD:
 235                if (boot_cpu_data.x86 > 0xf) {
 236                        ideal_nops = p6_nops;
 237                        return;
 238                }
 239
 240                /* fall through */
 241
 242        default:
 243#ifdef CONFIG_X86_64
 244                ideal_nops = k8_nops;
 245#else
 246                if (boot_cpu_has(X86_FEATURE_K8))
 247                        ideal_nops = k8_nops;
 248                else if (boot_cpu_has(X86_FEATURE_K7))
 249                        ideal_nops = k7_nops;
 250                else
 251                        ideal_nops = intel_nops;
 252#endif
 253        }
 254}
 255
 256/* Use this to add nops to a buffer, then text_poke the whole buffer. */
 257static void __init_or_module add_nops(void *insns, unsigned int len)
 258{
 259        while (len > 0) {
 260                unsigned int noplen = len;
 261                if (noplen > ASM_NOP_MAX)
 262                        noplen = ASM_NOP_MAX;
 263                memcpy(insns, ideal_nops[noplen], noplen);
 264                insns += noplen;
 265                len -= noplen;
 266        }
 267}
 268
 269extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 270extern s32 __smp_locks[], __smp_locks_end[];
 271void text_poke_early(void *addr, const void *opcode, size_t len);
 272
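/*
 * Illustrative sketch (editor's example, not part of the original file): how
 * a caller typically uses add_nops() to pad the unused tail of a patch buffer
 * before writing it out with text_poke_early().  The function name and the
 * 'site', 'buf', 'used' and 'total' parameters are made up for the example.
 */
static void __init_or_module example_pad_and_poke(void *site, u8 *buf,
                                                  unsigned int used,
                                                  unsigned int total)
{
        /* Fill bytes [used, total) of the buffer with the ideal NOPs. */
        add_nops(buf + used, total - used);

        /* Then write the fully prepared buffer over the original code. */
        text_poke_early(site, buf, total);
}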
 273/*
  274 * Are we looking at a near JMP with a 1 or 4-byte displacement?
 275 */
 276static inline bool is_jmp(const u8 opcode)
 277{
 278        return opcode == 0xeb || opcode == 0xe9;
 279}
 280
 281static void __init_or_module
 282recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
 283{
 284        u8 *next_rip, *tgt_rip;
 285        s32 n_dspl, o_dspl;
 286        int repl_len;
 287
 288        if (a->replacementlen != 5)
 289                return;
 290
 291        o_dspl = *(s32 *)(insn_buff + 1);
 292
 293        /* next_rip of the replacement JMP */
 294        next_rip = repl_insn + a->replacementlen;
 295        /* target rip of the replacement JMP */
 296        tgt_rip  = next_rip + o_dspl;
 297        n_dspl = tgt_rip - orig_insn;
 298
 299        DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
 300
 301        if (tgt_rip - orig_insn >= 0) {
 302                if (n_dspl - 2 <= 127)
 303                        goto two_byte_jmp;
 304                else
 305                        goto five_byte_jmp;
 306        /* negative offset */
 307        } else {
 308                if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
 309                        goto two_byte_jmp;
 310                else
 311                        goto five_byte_jmp;
 312        }
 313
 314two_byte_jmp:
 315        n_dspl -= 2;
 316
 317        insn_buff[0] = 0xeb;
 318        insn_buff[1] = (s8)n_dspl;
 319        add_nops(insn_buff + 2, 3);
 320
 321        repl_len = 2;
 322        goto done;
 323
 324five_byte_jmp:
 325        n_dspl -= 5;
 326
 327        insn_buff[0] = 0xe9;
 328        *(s32 *)&insn_buff[1] = n_dspl;
 329
 330        repl_len = 5;
 331
 332done:
 333
 334        DPRINTK("final displ: 0x%08x, JMP 0x%lx",
 335                n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
 336}
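/*
 * Worked example with illustrative addresses (editor's note, not from the
 * original file): suppose the patch site orig_insn is at 0x1000, the 5-byte
 * replacement "e9 rel32" lives in .altinstr_replacement at 0x5000, and it
 * targets 0x1010.  Then o_dspl = 0x1010 - (0x5000 + 5) = -0x3ff5, so
 * tgt_rip = 0x5005 - 0x3ff5 = 0x1010 and n_dspl = 0x1010 - 0x1000 = 0x10.
 * Since 0x10 - 2 <= 127 the two-byte form is chosen: n_dspl becomes 0x0e and
 * the buffer starts with "eb 0e" followed by three NOPs, which executed at
 * 0x1000 again reaches 0x1002 + 0x0e = 0x1010.
 */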
 337
 338/*
   339 * "noinline" to cause a control flow change and thus invalidate the I$ and
   340 * force a refetch after modification.
 341 */
 342static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
 343{
 344        unsigned long flags;
 345        int i;
 346
 347        for (i = 0; i < a->padlen; i++) {
 348                if (instr[i] != 0x90)
 349                        return;
 350        }
 351
 352        local_irq_save(flags);
 353        add_nops(instr + (a->instrlen - a->padlen), a->padlen);
 354        local_irq_restore(flags);
 355
 356        DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
  357                   instr, a->instrlen - a->padlen, a->instrlen);
 358}
 359
 360/*
 361 * Replace instructions with better alternatives for this CPU type. This runs
 362 * before SMP is initialized to avoid SMP problems with self modifying code.
  363 * This implies that asymmetric systems where APs have fewer capabilities than
 364 * the boot processor are not handled. Tough. Make sure you disable such
 365 * features by hand.
 366 *
  367 * Marked "noinline" to cause a control flow change and thus make the insn
  368 * cache refetch the changed I$ lines.
 369 */
 370void __init_or_module noinline apply_alternatives(struct alt_instr *start,
 371                                                  struct alt_instr *end)
 372{
 373        struct alt_instr *a;
 374        u8 *instr, *replacement;
 375        u8 insn_buff[MAX_PATCH_LEN];
 376
 377        DPRINTK("alt table %px, -> %px", start, end);
 378        /*
 379         * The scan order should be from start to end. A later scanned
 380         * alternative code can overwrite previously scanned alternative code.
 381         * Some kernel functions (e.g. memcpy, memset, etc) use this order to
 382         * patch code.
 383         *
 384         * So be careful if you want to change the scan order to any other
 385         * order.
 386         */
 387        for (a = start; a < end; a++) {
 388                int insn_buff_sz = 0;
 389
 390                instr = (u8 *)&a->instr_offset + a->instr_offset;
 391                replacement = (u8 *)&a->repl_offset + a->repl_offset;
 392                BUG_ON(a->instrlen > sizeof(insn_buff));
 393                BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
 394                if (!boot_cpu_has(a->cpuid)) {
 395                        if (a->padlen > 1)
 396                                optimize_nops(a, instr);
 397
 398                        continue;
 399                }
 400
 401                DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
 402                        a->cpuid >> 5,
 403                        a->cpuid & 0x1f,
 404                        instr, instr, a->instrlen,
 405                        replacement, a->replacementlen, a->padlen);
 406
 407                DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
 408                DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
 409
 410                memcpy(insn_buff, replacement, a->replacementlen);
 411                insn_buff_sz = a->replacementlen;
 412
 413                /*
 414                 * 0xe8 is a relative jump; fix the offset.
 415                 *
 416                 * Instruction length is checked before the opcode to avoid
 417                 * accessing uninitialized bytes for zero-length replacements.
 418                 */
 419                if (a->replacementlen == 5 && *insn_buff == 0xe8) {
 420                        *(s32 *)(insn_buff + 1) += replacement - instr;
 421                        DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
 422                                *(s32 *)(insn_buff + 1),
 423                                (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
 424                }
 425
 426                if (a->replacementlen && is_jmp(replacement[0]))
 427                        recompute_jump(a, instr, replacement, insn_buff);
 428
 429                if (a->instrlen > a->replacementlen) {
 430                        add_nops(insn_buff + a->replacementlen,
 431                                 a->instrlen - a->replacementlen);
 432                        insn_buff_sz += a->instrlen - a->replacementlen;
 433                }
 434                DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
 435
 436                text_poke_early(instr, insn_buff, insn_buff_sz);
 437        }
 438}
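/*
 * Illustrative sketch (editor's example, not part of the original file): a
 * typical patch site as emitted by the alternative() macro from
 * <asm/alternative.h>.  The "old" instruction (here: nothing) is what runs
 * until apply_alternatives() rewrites the site; when the boot CPU has the
 * named feature, the "new" instruction is copied over it and NOP-padded.
 * This mirrors the barrier_nospec() pattern; the wrapper name is made up.
 */
static __always_inline void example_speculation_barrier(void)
{
        alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC);
}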
 439
 440#ifdef CONFIG_SMP
 441static void alternatives_smp_lock(const s32 *start, const s32 *end,
 442                                  u8 *text, u8 *text_end)
 443{
 444        const s32 *poff;
 445
 446        for (poff = start; poff < end; poff++) {
 447                u8 *ptr = (u8 *)poff + *poff;
 448
 449                if (!*poff || ptr < text || ptr >= text_end)
 450                        continue;
 451                /* turn DS segment override prefix into lock prefix */
 452                if (*ptr == 0x3e)
 453                        text_poke(ptr, ((unsigned char []){0xf0}), 1);
 454        }
 455}
 456
 457static void alternatives_smp_unlock(const s32 *start, const s32 *end,
 458                                    u8 *text, u8 *text_end)
 459{
 460        const s32 *poff;
 461
 462        for (poff = start; poff < end; poff++) {
 463                u8 *ptr = (u8 *)poff + *poff;
 464
 465                if (!*poff || ptr < text || ptr >= text_end)
 466                        continue;
 467                /* turn lock prefix into DS segment override prefix */
 468                if (*ptr == 0xf0)
 469                        text_poke(ptr, ((unsigned char []){0x3E}), 1);
 470        }
 471}
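/*
 * Illustrative sketch (editor's example, not part of the original file): the
 * sites patched by the two functions above are emitted via LOCK_PREFIX from
 * <asm/alternative.h>, which records the address of each lock prefix in the
 * .smp_locks section walked here.  A locked RMW in the style of the x86
 * atomics looks roughly like this; the wrapper name is made up.
 */
static __always_inline void example_locked_inc(atomic_t *v)
{
        asm volatile(LOCK_PREFIX "incl %0"
                     : "+m" (v->counter));
}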
 472
 473struct smp_alt_module {
  474        /* owning module; NULL for the core kernel */
 475        struct module   *mod;
 476        char            *name;
 477
 478        /* ptrs to lock prefixes */
 479        const s32       *locks;
 480        const s32       *locks_end;
 481
 482        /* .text segment, needed to avoid patching init code ;) */
 483        u8              *text;
 484        u8              *text_end;
 485
 486        struct list_head next;
 487};
 488static LIST_HEAD(smp_alt_modules);
 489static bool uniproc_patched = false;    /* protected by text_mutex */
 490
 491void __init_or_module alternatives_smp_module_add(struct module *mod,
 492                                                  char *name,
 493                                                  void *locks, void *locks_end,
 494                                                  void *text,  void *text_end)
 495{
 496        struct smp_alt_module *smp;
 497
 498        mutex_lock(&text_mutex);
 499        if (!uniproc_patched)
 500                goto unlock;
 501
 502        if (num_possible_cpus() == 1)
 503                /* Don't bother remembering, we'll never have to undo it. */
 504                goto smp_unlock;
 505
 506        smp = kzalloc(sizeof(*smp), GFP_KERNEL);
 507        if (NULL == smp)
 508                /* we'll run the (safe but slow) SMP code then ... */
 509                goto unlock;
 510
 511        smp->mod        = mod;
 512        smp->name       = name;
 513        smp->locks      = locks;
 514        smp->locks_end  = locks_end;
 515        smp->text       = text;
 516        smp->text_end   = text_end;
 517        DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
 518                smp->locks, smp->locks_end,
 519                smp->text, smp->text_end, smp->name);
 520
 521        list_add_tail(&smp->next, &smp_alt_modules);
 522smp_unlock:
 523        alternatives_smp_unlock(locks, locks_end, text, text_end);
 524unlock:
 525        mutex_unlock(&text_mutex);
 526}
 527
 528void __init_or_module alternatives_smp_module_del(struct module *mod)
 529{
 530        struct smp_alt_module *item;
 531
 532        mutex_lock(&text_mutex);
 533        list_for_each_entry(item, &smp_alt_modules, next) {
 534                if (mod != item->mod)
 535                        continue;
 536                list_del(&item->next);
 537                kfree(item);
 538                break;
 539        }
 540        mutex_unlock(&text_mutex);
 541}
 542
 543void alternatives_enable_smp(void)
 544{
 545        struct smp_alt_module *mod;
 546
 547        /* Why bother if there are no other CPUs? */
 548        BUG_ON(num_possible_cpus() == 1);
 549
 550        mutex_lock(&text_mutex);
 551
 552        if (uniproc_patched) {
 553                pr_info("switching to SMP code\n");
 554                BUG_ON(num_online_cpus() != 1);
 555                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
 556                clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
 557                list_for_each_entry(mod, &smp_alt_modules, next)
 558                        alternatives_smp_lock(mod->locks, mod->locks_end,
 559                                              mod->text, mod->text_end);
 560                uniproc_patched = false;
 561        }
 562        mutex_unlock(&text_mutex);
 563}
 564
 565/*
 566 * Return 1 if the address range is reserved for SMP-alternatives.
 567 * Must hold text_mutex.
 568 */
 569int alternatives_text_reserved(void *start, void *end)
 570{
 571        struct smp_alt_module *mod;
 572        const s32 *poff;
 573        u8 *text_start = start;
 574        u8 *text_end = end;
 575
 576        lockdep_assert_held(&text_mutex);
 577
 578        list_for_each_entry(mod, &smp_alt_modules, next) {
 579                if (mod->text > text_end || mod->text_end < text_start)
 580                        continue;
 581                for (poff = mod->locks; poff < mod->locks_end; poff++) {
 582                        const u8 *ptr = (const u8 *)poff + *poff;
 583
 584                        if (text_start <= ptr && text_end > ptr)
 585                                return 1;
 586                }
 587        }
 588
 589        return 0;
 590}
 591#endif /* CONFIG_SMP */
 592
 593#ifdef CONFIG_PARAVIRT
 594void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
 595                                     struct paravirt_patch_site *end)
 596{
 597        struct paravirt_patch_site *p;
 598        char insn_buff[MAX_PATCH_LEN];
 599
 600        for (p = start; p < end; p++) {
 601                unsigned int used;
 602
 603                BUG_ON(p->len > MAX_PATCH_LEN);
 604                /* prep the buffer with the original instructions */
 605                memcpy(insn_buff, p->instr, p->len);
 606                used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
 607
 608                BUG_ON(used > p->len);
 609
 610                /* Pad the rest with nops */
 611                add_nops(insn_buff + used, p->len - used);
 612                text_poke_early(p->instr, insn_buff, p->len);
 613        }
 614}
 615extern struct paravirt_patch_site __start_parainstructions[],
 616        __stop_parainstructions[];
 617#endif  /* CONFIG_PARAVIRT */
 618
 619/*
 620 * Self-test for the INT3 based CALL emulation code.
 621 *
 622 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 623 * properly and that there is a stack gap between the INT3 frame and the
 624 * previous context. Without this gap doing a virtual PUSH on the interrupted
 625 * stack would corrupt the INT3 IRET frame.
 626 *
 627 * See entry_{32,64}.S for more details.
 628 */
 629
 630/*
 631 * We define the int3_magic() function in assembly to control the calling
 632 * convention such that we can 'call' it from assembly.
 633 */
 634
 635extern void int3_magic(unsigned int *ptr); /* defined in asm */
 636
 637asm (
 638"       .pushsection    .init.text, \"ax\", @progbits\n"
 639"       .type           int3_magic, @function\n"
 640"int3_magic:\n"
 641"       movl    $1, (%" _ASM_ARG1 ")\n"
 642"       ret\n"
 643"       .size           int3_magic, .-int3_magic\n"
 644"       .popsection\n"
 645);
 646
 647extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
 648
 649static int __init
 650int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
 651{
 652        struct die_args *args = data;
 653        struct pt_regs *regs = args->regs;
 654
 655        if (!regs || user_mode(regs))
 656                return NOTIFY_DONE;
 657
 658        if (val != DIE_INT3)
 659                return NOTIFY_DONE;
 660
 661        if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
 662                return NOTIFY_DONE;
 663
 664        int3_emulate_call(regs, (unsigned long)&int3_magic);
 665        return NOTIFY_STOP;
 666}
 667
 668static void __init int3_selftest(void)
 669{
 670        static __initdata struct notifier_block int3_exception_nb = {
 671                .notifier_call  = int3_exception_notify,
 672                .priority       = INT_MAX-1, /* last */
 673        };
 674        unsigned int val = 0;
 675
 676        BUG_ON(register_die_notifier(&int3_exception_nb));
 677
 678        /*
 679         * Basically: int3_magic(&val); but really complicated :-)
 680         *
 681         * Stick the address of the INT3 instruction into int3_selftest_ip,
 682         * then trigger the INT3, padded with NOPs to match a CALL instruction
 683         * length.
 684         */
 685        asm volatile ("1: int3; nop; nop; nop; nop\n\t"
 686                      ".pushsection .init.data,\"aw\"\n\t"
 687                      ".align " __ASM_SEL(4, 8) "\n\t"
 688                      ".type int3_selftest_ip, @object\n\t"
 689                      ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
 690                      "int3_selftest_ip:\n\t"
 691                      __ASM_SEL(.long, .quad) " 1b\n\t"
 692                      ".popsection\n\t"
 693                      : ASM_CALL_CONSTRAINT
 694                      : __ASM_SEL_RAW(a, D) (&val)
 695                      : "memory");
 696
 697        BUG_ON(val != 1);
 698
 699        unregister_die_notifier(&int3_exception_nb);
 700}
 701
 702void __init alternative_instructions(void)
 703{
 704        int3_selftest();
 705
 706        /*
 707         * The patching is not fully atomic, so try to avoid local
  708         * interruptions that might execute the to-be-patched code.
 709         * Other CPUs are not running.
 710         */
 711        stop_nmi();
 712
 713        /*
 714         * Don't stop machine check exceptions while patching.
 715         * MCEs only happen when something got corrupted and in this
 716         * case we must do something about the corruption.
 717         * Ignoring it is worse than an unlikely patching race.
 718         * Also machine checks tend to be broadcast and if one CPU
 719         * goes into machine check the others follow quickly, so we don't
  720         * expect a machine check to cause undue problems during code
 721         * patching.
 722         */
 723
 724        apply_alternatives(__alt_instructions, __alt_instructions_end);
 725
 726#ifdef CONFIG_SMP
 727        /* Patch to UP if other cpus not imminent. */
 728        if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
 729                uniproc_patched = true;
 730                alternatives_smp_module_add(NULL, "core kernel",
 731                                            __smp_locks, __smp_locks_end,
 732                                            _text, _etext);
 733        }
 734
 735        if (!uniproc_patched || num_possible_cpus() == 1) {
 736                free_init_pages("SMP alternatives",
 737                                (unsigned long)__smp_locks,
 738                                (unsigned long)__smp_locks_end);
 739        }
 740#endif
 741
 742        apply_paravirt(__parainstructions, __parainstructions_end);
 743
 744        restart_nmi();
 745        alternatives_patched = 1;
 746}
 747
 748/**
 749 * text_poke_early - Update instructions on a live kernel at boot time
 750 * @addr: address to modify
 751 * @opcode: source of the copy
 752 * @len: length to copy
 753 *
 754 * When you use this code to patch more than one byte of an instruction
 755 * you need to make sure that other CPUs cannot execute this code in parallel.
 756 * Also no thread must be currently preempted in the middle of these
 757 * instructions. And on the local CPU you need to be protected against NMI or
 758 * MCE handlers seeing an inconsistent instruction while you patch.
 759 */
 760void __init_or_module text_poke_early(void *addr, const void *opcode,
 761                                      size_t len)
 762{
 763        unsigned long flags;
 764
 765        if (boot_cpu_has(X86_FEATURE_NX) &&
 766            is_module_text_address((unsigned long)addr)) {
 767                /*
  768                 * Module text is initially marked non-executable, so the
 769                 * code cannot be running and speculative code-fetches are
 770                 * prevented. Just change the code.
 771                 */
 772                memcpy(addr, opcode, len);
 773        } else {
 774                local_irq_save(flags);
 775                memcpy(addr, opcode, len);
 776                local_irq_restore(flags);
 777                sync_core();
 778
 779                /*
 780                 * Could also do a CLFLUSH here to speed up CPU recovery; but
 781                 * that causes hangs on some VIA CPUs.
 782                 */
 783        }
 784}
 785
 786__ro_after_init struct mm_struct *poking_mm;
 787__ro_after_init unsigned long poking_addr;
 788
 789static void *__text_poke(void *addr, const void *opcode, size_t len)
 790{
 791        bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
 792        struct page *pages[2] = {NULL};
 793        temp_mm_state_t prev;
 794        unsigned long flags;
 795        pte_t pte, *ptep;
 796        spinlock_t *ptl;
 797        pgprot_t pgprot;
 798
 799        /*
 800         * While boot memory allocator is running we cannot use struct pages as
 801         * they are not yet initialized. There is no way to recover.
 802         */
 803        BUG_ON(!after_bootmem);
 804
 805        if (!core_kernel_text((unsigned long)addr)) {
 806                pages[0] = vmalloc_to_page(addr);
 807                if (cross_page_boundary)
 808                        pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
 809        } else {
 810                pages[0] = virt_to_page(addr);
 811                WARN_ON(!PageReserved(pages[0]));
 812                if (cross_page_boundary)
 813                        pages[1] = virt_to_page(addr + PAGE_SIZE);
 814        }
 815        /*
 816         * If something went wrong, crash and burn since recovery paths are not
 817         * implemented.
 818         */
 819        BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));
 820
 821        local_irq_save(flags);
 822
 823        /*
 824         * Map the page without the global bit, as TLB flushing is done with
 825         * flush_tlb_mm_range(), which is intended for non-global PTEs.
 826         */
 827        pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);
 828
 829        /*
  830         * The lock is not really needed, but taking it allows us to avoid open-coding.
 831         */
 832        ptep = get_locked_pte(poking_mm, poking_addr, &ptl);
 833
 834        /*
 835         * This must not fail; preallocated in poking_init().
 836         */
 837        VM_BUG_ON(!ptep);
 838
 839        pte = mk_pte(pages[0], pgprot);
 840        set_pte_at(poking_mm, poking_addr, ptep, pte);
 841
 842        if (cross_page_boundary) {
 843                pte = mk_pte(pages[1], pgprot);
 844                set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
 845        }
 846
 847        /*
 848         * Loading the temporary mm behaves as a compiler barrier, which
 849         * guarantees that the PTE will be set at the time memcpy() is done.
 850         */
 851        prev = use_temporary_mm(poking_mm);
 852
 853        kasan_disable_current();
 854        memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
 855        kasan_enable_current();
 856
 857        /*
 858         * Ensure that the PTE is only cleared after the instructions of memcpy
 859         * were issued by using a compiler barrier.
 860         */
 861        barrier();
 862
 863        pte_clear(poking_mm, poking_addr, ptep);
 864        if (cross_page_boundary)
 865                pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);
 866
 867        /*
 868         * Loading the previous page-table hierarchy requires a serializing
 869         * instruction that already allows the core to see the updated version.
 870         * Xen-PV is assumed to serialize execution in a similar manner.
 871         */
 872        unuse_temporary_mm(prev);
 873
 874        /*
 875         * Flushing the TLB might involve IPIs, which would require enabled
  876         * IRQs, but none are needed here since the mm is not in use at this point.
 877         */
 878        flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
 879                           (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
 880                           PAGE_SHIFT, false);
 881
 882        /*
 883         * If the text does not match what we just wrote then something is
 884         * fundamentally screwy; there's nothing we can really do about that.
 885         */
 886        BUG_ON(memcmp(addr, opcode, len));
 887
 888        pte_unmap_unlock(ptep, ptl);
 889        local_irq_restore(flags);
 890        return addr;
 891}
 892
 893/**
 894 * text_poke - Update instructions on a live kernel
 895 * @addr: address to modify
 896 * @opcode: source of the copy
 897 * @len: length to copy
 898 *
 899 * Only atomic text poke/set should be allowed when not doing early patching.
 900 * It means the size must be writable atomically and the address must be aligned
 901 * in a way that permits an atomic write. It also makes sure we fit on a single
 902 * page.
 903 *
 904 * Note that the caller must ensure that if the modified code is part of a
 905 * module, the module would not be removed during poking. This can be achieved
 906 * by registering a module notifier, and ordering module removal and patching
  907 * through a mutex.
 908 */
 909void *text_poke(void *addr, const void *opcode, size_t len)
 910{
 911        lockdep_assert_held(&text_mutex);
 912
 913        return __text_poke(addr, opcode, len);
 914}
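/*
 * Illustrative sketch (editor's example, not part of the original file): a
 * minimal text_poke() user, assuming the target is a single byte that can be
 * rewritten atomically.  'addr' and 'new_byte' are made-up names.
 */
static void example_poke_one_byte(void *addr, u8 new_byte)
{
        mutex_lock(&text_mutex);
        text_poke(addr, &new_byte, 1);
        mutex_unlock(&text_mutex);
}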
 915
 916/**
 917 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 918 * @addr: address to modify
 919 * @opcode: source of the copy
 920 * @len: length to copy
 921 *
 922 * Only atomic text poke/set should be allowed when not doing early patching.
 923 * It means the size must be writable atomically and the address must be aligned
 924 * in a way that permits an atomic write. It also makes sure we fit on a single
 925 * page.
 926 *
 927 * Context: should only be used by kgdb, which ensures no other core is running,
 928 *          despite the fact it does not hold the text_mutex.
 929 */
 930void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
 931{
 932        return __text_poke(addr, opcode, len);
 933}
 934
 935static void do_sync_core(void *info)
 936{
 937        sync_core();
 938}
 939
 940void text_poke_sync(void)
 941{
 942        on_each_cpu(do_sync_core, NULL, 1);
 943}
 944
 945struct text_poke_loc {
 946        s32 rel_addr; /* addr := _stext + rel_addr */
 947        s32 rel32;
 948        u8 opcode;
 949        const u8 text[POKE_MAX_OPCODE_SIZE];
 950};
 951
 952struct bp_patching_desc {
 953        struct text_poke_loc *vec;
 954        int nr_entries;
 955        atomic_t refs;
 956};
 957
 958static struct bp_patching_desc *bp_desc;
 959
 960static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
 961{
 962        struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */
 963
 964        if (!desc || !atomic_inc_not_zero(&desc->refs))
 965                return NULL;
 966
 967        return desc;
 968}
 969
 970static inline void put_desc(struct bp_patching_desc *desc)
 971{
 972        smp_mb__before_atomic();
 973        atomic_dec(&desc->refs);
 974}
 975
 976static inline void *text_poke_addr(struct text_poke_loc *tp)
 977{
 978        return _stext + tp->rel_addr;
 979}
 980
 981static int notrace patch_cmp(const void *key, const void *elt)
 982{
 983        struct text_poke_loc *tp = (struct text_poke_loc *) elt;
 984
 985        if (key < text_poke_addr(tp))
 986                return -1;
 987        if (key > text_poke_addr(tp))
 988                return 1;
 989        return 0;
 990}
 991NOKPROBE_SYMBOL(patch_cmp);
 992
 993int notrace poke_int3_handler(struct pt_regs *regs)
 994{
 995        struct bp_patching_desc *desc;
 996        struct text_poke_loc *tp;
 997        int len, ret = 0;
 998        void *ip;
 999
1000        if (user_mode(regs))
1001                return 0;
1002
1003        /*
1004         * Having observed our INT3 instruction, we now must observe
1005         * bp_desc:
1006         *
1007         *      bp_desc = desc                  INT3
1008         *      WMB                             RMB
1009         *      write INT3                      if (desc)
1010         */
1011        smp_rmb();
1012
1013        desc = try_get_desc(&bp_desc);
1014        if (!desc)
1015                return 0;
1016
1017        /*
1018         * Discount the INT3. See text_poke_bp_batch().
1019         */
1020        ip = (void *) regs->ip - INT3_INSN_SIZE;
1021
1022        /*
1023         * Skip the binary search if there is a single member in the vector.
1024         */
1025        if (unlikely(desc->nr_entries > 1)) {
1026                tp = bsearch(ip, desc->vec, desc->nr_entries,
1027                             sizeof(struct text_poke_loc),
1028                             patch_cmp);
1029                if (!tp)
1030                        goto out_put;
1031        } else {
1032                tp = desc->vec;
1033                if (text_poke_addr(tp) != ip)
1034                        goto out_put;
1035        }
1036
1037        len = text_opcode_size(tp->opcode);
1038        ip += len;
1039
1040        switch (tp->opcode) {
1041        case INT3_INSN_OPCODE:
1042                /*
1043                 * Someone poked an explicit INT3, they'll want to handle it,
1044                 * do not consume.
1045                 */
1046                goto out_put;
1047
1048        case CALL_INSN_OPCODE:
1049                int3_emulate_call(regs, (long)ip + tp->rel32);
1050                break;
1051
1052        case JMP32_INSN_OPCODE:
1053        case JMP8_INSN_OPCODE:
1054                int3_emulate_jmp(regs, (long)ip + tp->rel32);
1055                break;
1056
1057        default:
1058                BUG();
1059        }
1060
1061        ret = 1;
1062
1063out_put:
1064        put_desc(desc);
1065        return ret;
1066}
1067NOKPROBE_SYMBOL(poke_int3_handler);
1068
1069#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
1070static struct text_poke_loc tp_vec[TP_VEC_MAX];
1071static int tp_vec_nr;
1072
1073/**
1074 * text_poke_bp_batch() -- update instructions on live kernel on SMP
1075 * @tp:                 vector of instructions to patch
1076 * @nr_entries:         number of entries in the vector
1077 *
 1078 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
 1079 * We completely avoid stop_machine() here, and achieve the
 1080 * synchronization using the int3 breakpoint.
1081 *
1082 * The way it is done:
1083 *      - For each entry in the vector:
 1084 *              - add an int3 trap to the address that will be patched
1085 *      - sync cores
1086 *      - For each entry in the vector:
1087 *              - update all but the first byte of the patched range
1088 *      - sync cores
1089 *      - For each entry in the vector:
1090 *              - replace the first byte (int3) by the first byte of
1091 *                replacing opcode
1092 *      - sync cores
1093 */
1094static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
1095{
1096        struct bp_patching_desc desc = {
1097                .vec = tp,
1098                .nr_entries = nr_entries,
1099                .refs = ATOMIC_INIT(1),
1100        };
1101        unsigned char int3 = INT3_INSN_OPCODE;
1102        unsigned int i;
1103        int do_sync;
1104
1105        lockdep_assert_held(&text_mutex);
1106
1107        smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */
1108
1109        /*
1110         * Corresponding read barrier in int3 notifier for making sure the
 1111         * nr_entries and the text_poke_loc entries are ordered correctly wrt. patching.
1112         */
1113        smp_wmb();
1114
1115        /*
 1116         * First step: add an int3 trap to the address that will be patched.
1117         */
1118        for (i = 0; i < nr_entries; i++)
1119                text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
1120
1121        text_poke_sync();
1122
1123        /*
1124         * Second step: update all but the first byte of the patched range.
1125         */
1126        for (do_sync = 0, i = 0; i < nr_entries; i++) {
1127                int len = text_opcode_size(tp[i].opcode);
1128
1129                if (len - INT3_INSN_SIZE > 0) {
1130                        text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
1131                                  (const char *)tp[i].text + INT3_INSN_SIZE,
1132                                  len - INT3_INSN_SIZE);
1133                        do_sync++;
1134                }
1135        }
1136
1137        if (do_sync) {
1138                /*
1139                 * According to Intel, this core syncing is very likely
1140                 * not necessary and we'd be safe even without it. But
1141                 * better safe than sorry (plus there's not only Intel).
1142                 */
1143                text_poke_sync();
1144        }
1145
1146        /*
1147         * Third step: replace the first byte (int3) by the first byte of
1148         * replacing opcode.
1149         */
1150        for (do_sync = 0, i = 0; i < nr_entries; i++) {
1151                if (tp[i].text[0] == INT3_INSN_OPCODE)
1152                        continue;
1153
1154                text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
1155                do_sync++;
1156        }
1157
1158        if (do_sync)
1159                text_poke_sync();
1160
1161        /*
 1162         * Remove, then wait as synchronize_rcu() would, except we use a very
 1163         * primitive refcount-based completion.
1164         */
1165        WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
1166        if (!atomic_dec_and_test(&desc.refs))
1167                atomic_cond_read_acquire(&desc.refs, !VAL);
1168}
1169
1170void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
1171                        const void *opcode, size_t len, const void *emulate)
1172{
1173        struct insn insn;
1174
1175        memcpy((void *)tp->text, opcode, len);
1176        if (!emulate)
1177                emulate = opcode;
1178
1179        kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
1180        insn_get_length(&insn);
1181
1182        BUG_ON(!insn_complete(&insn));
1183        BUG_ON(len != insn.length);
1184
1185        tp->rel_addr = addr - (void *)_stext;
1186        tp->opcode = insn.opcode.bytes[0];
1187
1188        switch (tp->opcode) {
1189        case INT3_INSN_OPCODE:
1190                break;
1191
1192        case CALL_INSN_OPCODE:
1193        case JMP32_INSN_OPCODE:
1194        case JMP8_INSN_OPCODE:
1195                tp->rel32 = insn.immediate.value;
1196                break;
1197
1198        default: /* assume NOP */
1199                switch (len) {
1200                case 2: /* NOP2 -- emulate as JMP8+0 */
1201                        BUG_ON(memcmp(emulate, ideal_nops[len], len));
1202                        tp->opcode = JMP8_INSN_OPCODE;
1203                        tp->rel32 = 0;
1204                        break;
1205
1206                case 5: /* NOP5 -- emulate as JMP32+0 */
1207                        BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
1208                        tp->opcode = JMP32_INSN_OPCODE;
1209                        tp->rel32 = 0;
1210                        break;
1211
1212                default: /* unknown instruction */
1213                        BUG();
1214                }
1215                break;
1216        }
1217}
1218
1219/*
 1220 * We rely hard on tp_vec being ordered; ensure this is so by flushing
1221 * early if needed.
1222 */
1223static bool tp_order_fail(void *addr)
1224{
1225        struct text_poke_loc *tp;
1226
1227        if (!tp_vec_nr)
1228                return false;
1229
1230        if (!addr) /* force */
1231                return true;
1232
1233        tp = &tp_vec[tp_vec_nr - 1];
1234        if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
1235                return true;
1236
1237        return false;
1238}
1239
1240static void text_poke_flush(void *addr)
1241{
1242        if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
1243                text_poke_bp_batch(tp_vec, tp_vec_nr);
1244                tp_vec_nr = 0;
1245        }
1246}
1247
1248void text_poke_finish(void)
1249{
1250        text_poke_flush(NULL);
1251}
1252
1253void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
1254{
1255        struct text_poke_loc *tp;
1256
1257        if (unlikely(system_state == SYSTEM_BOOTING)) {
1258                text_poke_early(addr, opcode, len);
1259                return;
1260        }
1261
1262        text_poke_flush(addr);
1263
1264        tp = &tp_vec[tp_vec_nr++];
1265        text_poke_loc_init(tp, addr, opcode, len, emulate);
1266}
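/*
 * Illustrative sketch (editor's example, not part of the original file): how
 * a user such as the jump-label code might batch many pokes so that
 * text_poke_bp_batch() only has to sync the other CPUs a few times instead
 * of once per site.  'sites', 'nr_sites', 'insn' and 'len' are made-up names
 * for the example.
 */
static void example_batch_patch(void **sites, int nr_sites,
                                const void *insn, size_t len)
{
        int i;

        mutex_lock(&text_mutex);
        /* For best batching, 'sites' should be in ascending address order. */
        for (i = 0; i < nr_sites; i++)
                text_poke_queue(sites[i], insn, len, NULL);
        /* Flush whatever is still queued in tp_vec. */
        text_poke_finish();
        mutex_unlock(&text_mutex);
}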
1267
1268/**
1269 * text_poke_bp() -- update instructions on live kernel on SMP
1270 * @addr:       address to patch
1271 * @opcode:     opcode of new instruction
1272 * @len:        length to copy
 1273 * @emulate:    instruction to emulate while the int3 is installed (NULL: @opcode)
1274 *
1275 * Update a single instruction with the vector in the stack, avoiding
1276 * dynamically allocated memory. This function should be used when it is
1277 * not possible to allocate memory.
1278 */
1279void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
1280{
1281        struct text_poke_loc tp;
1282
1283        if (unlikely(system_state == SYSTEM_BOOTING)) {
1284                text_poke_early(addr, opcode, len);
1285                return;
1286        }
1287
1288        text_poke_loc_init(&tp, addr, opcode, len, emulate);
1289        text_poke_bp_batch(&tp, 1);
1290}
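/*
 * Illustrative sketch (editor's example, not part of the original file):
 * redirecting an existing 5-byte CALL site to a new target with a single
 * text_poke_bp(), roughly what ftrace/static-call style users do.  'site'
 * and 'new_target' are made-up names; the target is assumed to be within
 * +/- 2 GiB of the site.
 */
static void example_repoint_call(void *site, void *new_target)
{
        u8 insn[CALL_INSN_SIZE];
        s32 disp = (long)new_target - ((long)site + CALL_INSN_SIZE);

        insn[0] = CALL_INSN_OPCODE;             /* 0xe8 */
        memcpy(&insn[1], &disp, sizeof(disp));

        mutex_lock(&text_mutex);
        /* NULL emulate: the new CALL itself is emulated while the int3 is live. */
        text_poke_bp(site, insn, sizeof(insn), NULL);
        mutex_unlock(&text_mutex);
}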
1291