linux/arch/x86/kernel/paravirt.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*  Paravirtualization interfaces
    Copyright (C) 2006 Rusty Russell IBM Corporation

    2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
*/

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
#include <linux/kprobes.h>

#include <asm/bug.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
#include <asm/delay.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/special_insns.h>
#include <asm/tlb.h>

/*
 * nop stub, which must not clobber anything *including the stack* to
 * avoid confusing the entry prologues.
 */
extern void _paravirt_nop(void);
asm (".pushsection .entry.text, \"ax\"\n"
     ".global _paravirt_nop\n"
     "_paravirt_nop:\n\t"
     "ret\n\t"
     ".size _paravirt_nop, . - _paravirt_nop\n\t"
     ".type _paravirt_nop, @function\n\t"
     ".popsection");

void __init default_banner(void)
{
	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
	       pv_info.name);
}

/* Undefined instruction for dealing with missing ops pointers. */
static const unsigned char ud2a[] = { 0x0f, 0x0b };

struct branch {
	unsigned char opcode;
	u32 delta;
} __attribute__((packed));
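
/*
 * Direct CALL/JMP patching uses the usual x86 rel32 encoding: one opcode
 * byte followed by a 32-bit displacement measured from the end of the
 * 5-byte instruction.  For illustration (addresses made up): patching a
 * CALL at 0x1000 whose target is 0x2000 stores
 * delta = 0x2000 - (0x1000 + 5) = 0xffb, i.e. the bytes e8 fb 0f 00 00.
 */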

static unsigned paravirt_patch_call(void *insn_buff, const void *target,
				    unsigned long addr, unsigned len)
{
	const int call_len = 5;
	struct branch *b = insn_buff;
	unsigned long delta = (unsigned long)target - (addr+call_len);

	if (len < call_len) {
		pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr);
		/* Kernel might not be viable if patching fails, bail out: */
		BUG_ON(1);
	}

	b->opcode = 0xe8; /* call */
	b->delta = delta;
	BUILD_BUG_ON(sizeof(*b) != call_len);

	return call_len;
}

#ifdef CONFIG_PARAVIRT_XXL
/* identity function, which can be inlined */
u64 notrace _paravirt_ident_64(u64 x)
{
	return x;
}
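
/*
 * On bare hardware the pte_val()/make_pte() style conversions are pure
 * identity transforms (the in-memory representation already is the
 * hardware one), so their pv_ops slots point at _paravirt_ident_64 and
 * the patcher can shrink the call site down to a single register move
 * (typically "mov %rdi, %rax" on x86_64).
 */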

static unsigned paravirt_patch_jmp(void *insn_buff, const void *target,
				   unsigned long addr, unsigned len)
{
	struct branch *b = insn_buff;
	unsigned long delta = (unsigned long)target - (addr+5);

	if (len < 5) {
#ifdef CONFIG_RETPOLINE
		WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
#endif
		return len;	/* jmp too long for patch site */
	}

	b->opcode = 0xe9;	/* jmp */
	b->delta = delta;

	return 5;
}
#endif

DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);

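/*
 * virt_spin_lock_key starts out true.  When we find we are running on
 * bare metal (the CPUID hypervisor bit is clear), switch it off so the
 * qspinlock slow path does not take the virt_spin_lock() test-and-set
 * shortcut meant for over-committed guests.
 */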
void __init native_pv_lock_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
		static_branch_disable(&virt_spin_lock_key);
}

unsigned paravirt_patch_default(u8 type, void *insn_buff,
				unsigned long addr, unsigned len)
{
	/*
	 * Neat trick to map patch type back to the call within the
	 * corresponding structure.
	 */
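	/*
	 * pv_ops is nothing but an array of function pointers as far as
	 * patching is concerned: PARAVIRT_PATCH(x) is
	 * offsetof(struct paravirt_patch_template, x) / sizeof(void *),
	 * so indexing the struct base by "type" words recovers the
	 * pointer that the patch site would otherwise call indirectly.
	 */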
	void *opfunc = *((void **)&pv_ops + type);
	unsigned ret;

	if (opfunc == NULL)
		/* If there's no function, patch it with a ud2a (BUG) */
		ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a));
	else if (opfunc == _paravirt_nop)
		ret = 0;

#ifdef CONFIG_PARAVIRT_XXL
	/* identity functions just return their single argument */
	else if (opfunc == _paravirt_ident_64)
		ret = paravirt_patch_ident_64(insn_buff, len);

	else if (type == PARAVIRT_PATCH(cpu.iret) ||
		 type == PARAVIRT_PATCH(cpu.usergs_sysret64))
		/* If operation requires a jmp, then jmp */
		ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
#endif
	else
		/* Otherwise call the function. */
		ret = paravirt_patch_call(insn_buff, opfunc, addr, len);

	return ret;
}

unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
			      const char *start, const char *end)
{
	unsigned insn_len = end - start;

	/* Alternative instruction is too large for the patch site and we cannot continue: */
	BUG_ON(insn_len > len || start == NULL);

	memcpy(insn_buff, start, insn_len);

	return insn_len;
}

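/*
 * The __native_flush_tlb*() helpers are inlines from <asm/tlbflush.h>;
 * these thin wrappers exist only so the pv_ops slots below have real
 * function addresses to point at (and to patch direct calls to).
 */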
static void native_flush_tlb(void)
{
	__native_flush_tlb();
}

/*
 * Global pages have to be flushed a bit differently. Not a real
 * performance problem because this does not happen often.
 */
static void native_flush_tlb_global(void)
{
	__native_flush_tlb_global();
}

static void native_flush_tlb_one_user(unsigned long addr)
{
	__native_flush_tlb_one_user(addr);
}

struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;

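/*
 * Steal time is time a vCPU wanted to run but the host was running
 * something else.  Bare hardware never steals time, so the native
 * implementation simply reports zero.
 */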
static u64 native_steal_clock(int cpu)
{
	return 0;
}

/* These are in entry.S */
extern void native_iret(void);
extern void native_usergs_sysret64(void);

static struct resource reserve_ioports = {
	.start = 0,
	.end = IO_SPACE_LIMIT,
	.name = "paravirt-ioport",
	.flags = IORESOURCE_IO | IORESOURCE_BUSY,
};

/*
 * Reserve the whole legacy IO space to prevent any legacy drivers
 * from wasting time probing for their hardware.  This is a fairly
 * brute-force approach to disabling all non-virtual drivers.
 *
 * Note that this must be called very early to have any effect.
 */
int paravirt_disable_iospace(void)
{
	return request_resource(&ioport_resource, &reserve_ioports);
}

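/*
 * Lazy mode batching: a hypervisor backend may enter a "lazy" MMU or CPU
 * mode in which page-table and context-switch updates are queued and
 * submitted together instead of trapping on every single operation.
 * The per-CPU state below tracks which (if any) lazy mode is active;
 * the enter/leave helpers bracket a batch and flush pushes out anything
 * still pending.
 */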
static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;

static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
	BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);

	this_cpu_write(paravirt_lazy_mode, mode);
}

static void leave_lazy(enum paravirt_lazy_mode mode)
{
	BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode);

	this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}

void paravirt_enter_lazy_mmu(void)
{
	enter_lazy(PARAVIRT_LAZY_MMU);
}

void paravirt_leave_lazy_mmu(void)
{
	leave_lazy(PARAVIRT_LAZY_MMU);
}

void paravirt_flush_lazy_mmu(void)
{
	preempt_disable();

	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
		arch_leave_lazy_mmu_mode();
		arch_enter_lazy_mmu_mode();
	}

	preempt_enable();
}

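/*
 * A context switch may interrupt a lazy MMU batch.  start_context_switch()
 * closes the batch and records that fact in TIF_LAZY_MMU_UPDATES on the
 * outgoing task; end_context_switch() reopens lazy MMU mode for such a
 * task once it is scheduled back in.
 */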
#ifdef CONFIG_PARAVIRT_XXL
void paravirt_start_context_switch(struct task_struct *prev)
{
	BUG_ON(preemptible());

	if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
		arch_leave_lazy_mmu_mode();
		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
	}
	enter_lazy(PARAVIRT_LAZY_CPU);
}

void paravirt_end_context_switch(struct task_struct *next)
{
	BUG_ON(preemptible());

	leave_lazy(PARAVIRT_LAZY_CPU);

	if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
		arch_enter_lazy_mmu_mode();
}
#endif

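/*
 * Interrupt context is never part of a lazy batch (updates done from an
 * interrupt must take effect immediately), so report PARAVIRT_LAZY_NONE
 * there regardless of what the interrupted task was doing.
 */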
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
{
	if (in_interrupt())
		return PARAVIRT_LAZY_NONE;

	return this_cpu_read(paravirt_lazy_mode);
}

struct pv_info pv_info = {
	.name = "bare hardware",
#ifdef CONFIG_PARAVIRT_XXL
	.kernel_rpl = 0,
	.shared_kernel_pmd = 1,	/* Only used when CONFIG_X86_PAE is set */

#ifdef CONFIG_X86_64
	.extra_user_64bit_cs = __USER_CS,
#endif
#endif
};

/* 64-bit pagetable entries */
#define PTE_IDENT	__PV_IS_CALLEE_SAVE(_paravirt_ident_64)

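/*
 * pv_ops starts out populated with the native implementations.  A guest
 * environment (Xen PV, KVM, Hyper-V, VMware, ...) overrides individual
 * slots during early boot, and the patching machinery above later turns
 * the remaining indirect calls into direct calls or inline sequences.
 */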
struct paravirt_patch_template pv_ops = {
	/* Init ops. */
	.init.patch		= native_patch,

	/* Time ops. */
	.time.sched_clock	= native_sched_clock,
	.time.steal_clock	= native_steal_clock,

	/* Cpu ops. */
	.cpu.io_delay		= native_io_delay,

#ifdef CONFIG_PARAVIRT_XXL
	.cpu.cpuid		= native_cpuid,
	.cpu.get_debugreg	= native_get_debugreg,
	.cpu.set_debugreg	= native_set_debugreg,
	.cpu.read_cr0		= native_read_cr0,
	.cpu.write_cr0		= native_write_cr0,
	.cpu.write_cr4		= native_write_cr4,
	.cpu.wbinvd		= native_wbinvd,
	.cpu.read_msr		= native_read_msr,
	.cpu.write_msr		= native_write_msr,
	.cpu.read_msr_safe	= native_read_msr_safe,
	.cpu.write_msr_safe	= native_write_msr_safe,
	.cpu.read_pmc		= native_read_pmc,
	.cpu.load_tr_desc	= native_load_tr_desc,
	.cpu.set_ldt		= native_set_ldt,
	.cpu.load_gdt		= native_load_gdt,
	.cpu.load_idt		= native_load_idt,
	.cpu.store_tr		= native_store_tr,
	.cpu.load_tls		= native_load_tls,
#ifdef CONFIG_X86_64
	.cpu.load_gs_index	= native_load_gs_index,
#endif
	.cpu.write_ldt_entry	= native_write_ldt_entry,
	.cpu.write_gdt_entry	= native_write_gdt_entry,
	.cpu.write_idt_entry	= native_write_idt_entry,

	.cpu.alloc_ldt		= paravirt_nop,
	.cpu.free_ldt		= paravirt_nop,

	.cpu.load_sp0		= native_load_sp0,

#ifdef CONFIG_X86_64
	.cpu.usergs_sysret64	= native_usergs_sysret64,
#endif
	.cpu.iret		= native_iret,
	.cpu.swapgs		= native_swapgs,

	.cpu.set_iopl_mask	= native_set_iopl_mask,

	.cpu.start_context_switch	= paravirt_nop,
	.cpu.end_context_switch		= paravirt_nop,

	/* Irq ops. */
	.irq.save_fl		= __PV_IS_CALLEE_SAVE(native_save_fl),
	.irq.restore_fl		= __PV_IS_CALLEE_SAVE(native_restore_fl),
	.irq.irq_disable	= __PV_IS_CALLEE_SAVE(native_irq_disable),
	.irq.irq_enable		= __PV_IS_CALLEE_SAVE(native_irq_enable),
	.irq.safe_halt		= native_safe_halt,
	.irq.halt		= native_halt,
#endif /* CONFIG_PARAVIRT_XXL */

	/* Mmu ops. */
	.mmu.flush_tlb_user	= native_flush_tlb,
	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
	.mmu.flush_tlb_others	= native_flush_tlb_others,
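	/*
	 * Natively a freed page-table page can be handed straight to
	 * tlb_remove_page(); the cast only adapts it to the
	 * tlb_remove_table prototype.  Hypervisor backends that need
	 * deferred freeing override this slot.
	 */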
	.mmu.tlb_remove_table	=
			(void (*)(struct mmu_gather *, void *))tlb_remove_page,

	.mmu.exit_mmap		= paravirt_nop,

#ifdef CONFIG_PARAVIRT_XXL
	.mmu.read_cr2		= __PV_IS_CALLEE_SAVE(native_read_cr2),
	.mmu.write_cr2		= native_write_cr2,
	.mmu.read_cr3		= __native_read_cr3,
	.mmu.write_cr3		= native_write_cr3,

	.mmu.pgd_alloc		= __paravirt_pgd_alloc,
	.mmu.pgd_free		= paravirt_nop,

	.mmu.alloc_pte		= paravirt_nop,
	.mmu.alloc_pmd		= paravirt_nop,
	.mmu.alloc_pud		= paravirt_nop,
	.mmu.alloc_p4d		= paravirt_nop,
	.mmu.release_pte	= paravirt_nop,
	.mmu.release_pmd	= paravirt_nop,
	.mmu.release_pud	= paravirt_nop,
	.mmu.release_p4d	= paravirt_nop,

	.mmu.set_pte		= native_set_pte,
	.mmu.set_pte_at		= native_set_pte_at,
	.mmu.set_pmd		= native_set_pmd,

	.mmu.ptep_modify_prot_start	= __ptep_modify_prot_start,
	.mmu.ptep_modify_prot_commit	= __ptep_modify_prot_commit,

#if CONFIG_PGTABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
	.mmu.set_pte_atomic	= native_set_pte_atomic,
	.mmu.pte_clear		= native_pte_clear,
	.mmu.pmd_clear		= native_pmd_clear,
#endif
	.mmu.set_pud		= native_set_pud,

	.mmu.pmd_val		= PTE_IDENT,
	.mmu.make_pmd		= PTE_IDENT,

#if CONFIG_PGTABLE_LEVELS >= 4
	.mmu.pud_val		= PTE_IDENT,
	.mmu.make_pud		= PTE_IDENT,

	.mmu.set_p4d		= native_set_p4d,

#if CONFIG_PGTABLE_LEVELS >= 5
	.mmu.p4d_val		= PTE_IDENT,
	.mmu.make_p4d		= PTE_IDENT,

	.mmu.set_pgd		= native_set_pgd,
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */

	.mmu.pte_val		= PTE_IDENT,
	.mmu.pgd_val		= PTE_IDENT,

	.mmu.make_pte		= PTE_IDENT,
	.mmu.make_pgd		= PTE_IDENT,

	.mmu.dup_mmap		= paravirt_nop,
	.mmu.activate_mm	= paravirt_nop,

	.mmu.lazy_mode = {
		.enter		= paravirt_nop,
		.leave		= paravirt_nop,
		.flush		= paravirt_nop,
	},

	.mmu.set_fixmap		= native_set_fixmap,
#endif /* CONFIG_PARAVIRT_XXL */

#if defined(CONFIG_PARAVIRT_SPINLOCKS)
	/* Lock ops. */
#ifdef CONFIG_SMP
	.lock.queued_spin_lock_slowpath	= native_queued_spin_lock_slowpath,
	.lock.queued_spin_unlock	=
				PV_CALLEE_SAVE(__native_queued_spin_unlock),
	.lock.wait			= paravirt_nop,
	.lock.kick			= paravirt_nop,
	.lock.vcpu_is_preempted		=
				PV_CALLEE_SAVE(__native_vcpu_is_preempted),
#endif /* SMP */
#endif
};

#ifdef CONFIG_PARAVIRT_XXL
/* At this point, native_get/set_debugreg has real function entries */
NOKPROBE_SYMBOL(native_get_debugreg);
NOKPROBE_SYMBOL(native_set_debugreg);
NOKPROBE_SYMBOL(native_load_idt);
#endif

EXPORT_SYMBOL(pv_ops);
EXPORT_SYMBOL_GPL(pv_info);