linux/arch/x86/xen/enlighten_pv.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Core of Xen paravirt_ops implementation.
   4 *
   5 * This file contains the xen_paravirt_ops structure itself, and the
   6 * implementations for:
   7 * - privileged instructions
   8 * - interrupt flags
   9 * - segment operations
  10 * - booting and setup
  11 *
  12 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  13 */
  14
  15#include <linux/cpu.h>
  16#include <linux/kernel.h>
  17#include <linux/init.h>
  18#include <linux/smp.h>
  19#include <linux/preempt.h>
  20#include <linux/hardirq.h>
  21#include <linux/percpu.h>
  22#include <linux/delay.h>
  23#include <linux/start_kernel.h>
  24#include <linux/sched.h>
  25#include <linux/kprobes.h>
  26#include <linux/memblock.h>
  27#include <linux/export.h>
  28#include <linux/mm.h>
  29#include <linux/page-flags.h>
  30#include <linux/highmem.h>
  31#include <linux/console.h>
  32#include <linux/pci.h>
  33#include <linux/gfp.h>
  34#include <linux/edd.h>
  35#include <linux/frame.h>
  36
  37#include <xen/xen.h>
  38#include <xen/events.h>
  39#include <xen/interface/xen.h>
  40#include <xen/interface/version.h>
  41#include <xen/interface/physdev.h>
  42#include <xen/interface/vcpu.h>
  43#include <xen/interface/memory.h>
  44#include <xen/interface/nmi.h>
  45#include <xen/interface/xen-mca.h>
  46#include <xen/features.h>
  47#include <xen/page.h>
  48#include <xen/hvc-console.h>
  49#include <xen/acpi.h>
  50
  51#include <asm/paravirt.h>
  52#include <asm/apic.h>
  53#include <asm/page.h>
  54#include <asm/xen/pci.h>
  55#include <asm/xen/hypercall.h>
  56#include <asm/xen/hypervisor.h>
  57#include <asm/xen/cpuid.h>
  58#include <asm/fixmap.h>
  59#include <asm/processor.h>
  60#include <asm/proto.h>
  61#include <asm/msr-index.h>
  62#include <asm/traps.h>
  63#include <asm/setup.h>
  64#include <asm/desc.h>
  65#include <asm/pgalloc.h>
  66#include <asm/pgtable.h>
  67#include <asm/tlbflush.h>
  68#include <asm/reboot.h>
  69#include <asm/stackprotector.h>
  70#include <asm/hypervisor.h>
  71#include <asm/mach_traps.h>
  72#include <asm/mwait.h>
  73#include <asm/pci_x86.h>
  74#include <asm/cpu.h>
  75#ifdef CONFIG_X86_IOPL_IOPERM
  76#include <asm/io_bitmap.h>
  77#endif
  78
  79#ifdef CONFIG_ACPI
  80#include <linux/acpi.h>
  81#include <asm/acpi.h>
  82#include <acpi/pdc_intel.h>
  83#include <acpi/processor.h>
  84#include <xen/interface/platform.h>
  85#endif
  86
  87#include "xen-ops.h"
  88#include "mmu.h"
  89#include "smp.h"
  90#include "multicalls.h"
  91#include "pmu.h"
  92
  93#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
  94
  95void *xen_initial_gdt;
  96
  97static int xen_cpu_up_prepare_pv(unsigned int cpu);
  98static int xen_cpu_dead_pv(unsigned int cpu);
  99
 100struct tls_descs {
 101        struct desc_struct desc[3];
 102};
 103
 104/*
 105 * Updating the 3 TLS descriptors in the GDT on every task switch is
 106 * surprisingly expensive so we avoid updating them if they haven't
  107 * changed.  Since Xen writes different descriptors than the ones
  108 * passed in the update_descriptor hypercall, we keep shadow copies to
 109 * compare against.
 110 */
 111static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
 112
 113static void __init xen_banner(void)
 114{
 115        unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
 116        struct xen_extraversion extra;
 117        HYPERVISOR_xen_version(XENVER_extraversion, &extra);
 118
 119        pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
 120        printk(KERN_INFO "Xen version: %d.%d%s%s\n",
 121               version >> 16, version & 0xffff, extra.extraversion,
 122               xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 123
 124#ifdef CONFIG_X86_32
 125        pr_warn("WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n"
 126                "Support for running as 32-bit PV-guest under Xen will soon be removed\n"
 127                "from the Linux kernel!\n"
 128                "Please use either a 64-bit kernel or switch to HVM or PVH mode!\n"
 129                "WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n");
 130#endif
 131}
 132
 133static void __init xen_pv_init_platform(void)
 134{
 135        populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
 136
 137        set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
 138        HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
 139
 140        /* xen clock uses per-cpu vcpu_info, need to init it for boot cpu */
 141        xen_vcpu_info_reset(0);
 142
 143        /* pvclock is in shared info area */
 144        xen_init_time_ops();
 145}
 146
 147static void __init xen_pv_guest_late_init(void)
 148{
 149#ifndef CONFIG_SMP
 150        /* Setup shared vcpu info for non-smp configurations */
 151        xen_setup_vcpu_info_placement();
 152#endif
 153}
 154
 155/* Check if running on Xen version (major, minor) or later */
 156bool
 157xen_running_on_version_or_later(unsigned int major, unsigned int minor)
 158{
 159        unsigned int version;
 160
 161        if (!xen_domain())
 162                return false;
 163
 164        version = HYPERVISOR_xen_version(XENVER_version, NULL);
 165        if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
 166                ((version >> 16) > major))
 167                return true;
 168        return false;
 169}
 170
 171static __read_mostly unsigned int cpuid_leaf5_ecx_val;
 172static __read_mostly unsigned int cpuid_leaf5_edx_val;
 173
 174static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 175                      unsigned int *cx, unsigned int *dx)
 176{
 177        unsigned maskebx = ~0;
 178
 179        /*
 180         * Mask out inconvenient features, to try and disable as many
 181         * unsupported kernel subsystems as possible.
 182         */
 183        switch (*ax) {
 184        case CPUID_MWAIT_LEAF:
 185                /* Synthesize the values.. */
 186                *ax = 0;
 187                *bx = 0;
 188                *cx = cpuid_leaf5_ecx_val;
 189                *dx = cpuid_leaf5_edx_val;
 190                return;
 191
 192        case 0xb:
 193                /* Suppress extended topology stuff */
 194                maskebx = 0;
 195                break;
 196        }
 197
 198        asm(XEN_EMULATE_PREFIX "cpuid"
 199                : "=a" (*ax),
 200                  "=b" (*bx),
 201                  "=c" (*cx),
 202                  "=d" (*dx)
 203                : "0" (*ax), "2" (*cx));
 204
 205        *bx &= maskebx;
 206}
 207STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
 208
 209static bool __init xen_check_mwait(void)
 210{
 211#ifdef CONFIG_ACPI
 212        struct xen_platform_op op = {
 213                .cmd                    = XENPF_set_processor_pminfo,
 214                .u.set_pminfo.id        = -1,
 215                .u.set_pminfo.type      = XEN_PM_PDC,
 216        };
 217        uint32_t buf[3];
 218        unsigned int ax, bx, cx, dx;
 219        unsigned int mwait_mask;
 220
 221        /* We need to determine whether it is OK to expose the MWAIT
 222         * capability to the kernel to harvest deeper than C3 states from ACPI
 223         * _CST using the processor_harvest_xen.c module. For this to work, we
 224         * need to gather the MWAIT_LEAF values (which the cstate.c code
 225         * checks against). The hypervisor won't expose the MWAIT flag because
 226         * it would break backwards compatibility; so we will find out directly
 227         * from the hardware and hypercall.
 228         */
 229        if (!xen_initial_domain())
 230                return false;
 231
 232        /*
  233         * When running on a platform earlier than Xen 4.2, do not expose
  234         * MWAIT, to avoid the risk of loading the native ACPI PAD driver.
 235         */
 236        if (!xen_running_on_version_or_later(4, 2))
 237                return false;
 238
 239        ax = 1;
 240        cx = 0;
 241
 242        native_cpuid(&ax, &bx, &cx, &dx);
 243
 244        mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
 245                     (1 << (X86_FEATURE_MWAIT % 32));
 246
 247        if ((cx & mwait_mask) != mwait_mask)
 248                return false;
 249
 250        /* We need to emulate the MWAIT_LEAF and for that we need both
 251         * ecx and edx. The hypercall provides only partial information.
 252         */
 253
 254        ax = CPUID_MWAIT_LEAF;
 255        bx = 0;
 256        cx = 0;
 257        dx = 0;
 258
 259        native_cpuid(&ax, &bx, &cx, &dx);
 260
 261        /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
 262         * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
 263         */
 264        buf[0] = ACPI_PDC_REVISION_ID;
 265        buf[1] = 1;
 266        buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
 267
 268        set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
 269
 270        if ((HYPERVISOR_platform_op(&op) == 0) &&
 271            (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
 272                cpuid_leaf5_ecx_val = cx;
 273                cpuid_leaf5_edx_val = dx;
 274        }
 275        return true;
 276#else
 277        return false;
 278#endif
 279}
 280
 281static bool __init xen_check_xsave(void)
 282{
 283        unsigned int cx, xsave_mask;
 284
 285        cx = cpuid_ecx(1);
 286
 287        xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) |
 288                     (1 << (X86_FEATURE_OSXSAVE % 32));
 289
 290        /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
 291        return (cx & xsave_mask) == xsave_mask;
 292}
 293
 294static void __init xen_init_capabilities(void)
 295{
 296        setup_force_cpu_cap(X86_FEATURE_XENPV);
 297        setup_clear_cpu_cap(X86_FEATURE_DCA);
 298        setup_clear_cpu_cap(X86_FEATURE_APERFMPERF);
 299        setup_clear_cpu_cap(X86_FEATURE_MTRR);
 300        setup_clear_cpu_cap(X86_FEATURE_ACC);
 301        setup_clear_cpu_cap(X86_FEATURE_X2APIC);
 302        setup_clear_cpu_cap(X86_FEATURE_SME);
 303
 304        /*
 305         * Xen PV would need some work to support PCID: CR3 handling as well
 306         * as xen_flush_tlb_others() would need updating.
 307         */
 308        setup_clear_cpu_cap(X86_FEATURE_PCID);
 309
 310        if (!xen_initial_domain())
 311                setup_clear_cpu_cap(X86_FEATURE_ACPI);
 312
 313        if (xen_check_mwait())
 314                setup_force_cpu_cap(X86_FEATURE_MWAIT);
 315        else
 316                setup_clear_cpu_cap(X86_FEATURE_MWAIT);
 317
 318        if (!xen_check_xsave()) {
 319                setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 320                setup_clear_cpu_cap(X86_FEATURE_OSXSAVE);
 321        }
 322}
 323
 324static void xen_set_debugreg(int reg, unsigned long val)
 325{
 326        HYPERVISOR_set_debugreg(reg, val);
 327}
 328
 329static unsigned long xen_get_debugreg(int reg)
 330{
 331        return HYPERVISOR_get_debugreg(reg);
 332}
 333
 334static void xen_end_context_switch(struct task_struct *next)
 335{
 336        xen_mc_flush();
 337        paravirt_end_context_switch(next);
 338}
 339
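     /*
      * Xen owns the TSS for PV guests, so there is no meaningful task
      * register selector to hand back; report a null selector instead.
      */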
 340static unsigned long xen_store_tr(void)
 341{
 342        return 0;
 343}
 344
 345/*
 346 * Set the page permissions for a particular virtual address.  If the
 347 * address is a vmalloc mapping (or other non-linear mapping), then
 348 * find the linear mapping of the page and also set its protections to
 349 * match.
 350 */
 351static void set_aliased_prot(void *v, pgprot_t prot)
 352{
 353        int level;
 354        pte_t *ptep;
 355        pte_t pte;
 356        unsigned long pfn;
 357        struct page *page;
 358        unsigned char dummy;
 359
 360        ptep = lookup_address((unsigned long)v, &level);
 361        BUG_ON(ptep == NULL);
 362
 363        pfn = pte_pfn(*ptep);
 364        page = pfn_to_page(pfn);
 365
 366        pte = pfn_pte(pfn, prot);
 367
 368        /*
 369         * Careful: update_va_mapping() will fail if the virtual address
 370         * we're poking isn't populated in the page tables.  We don't
 371         * need to worry about the direct map (that's always in the page
 372         * tables), but we need to be careful about vmap space.  In
 373         * particular, the top level page table can lazily propagate
 374         * entries between processes, so if we've switched mms since we
 375         * vmapped the target in the first place, we might not have the
 376         * top-level page table entry populated.
 377         *
 378         * We disable preemption because we want the same mm active when
 379         * we probe the target and when we issue the hypercall.  We'll
 380         * have the same nominal mm, but if we're a kernel thread, lazy
 381         * mm dropping could change our pgd.
 382         *
  383         * Out of an abundance of caution, this uses probe_kernel_read() to fault
 384         * in the target address just in case there's some obscure case
 385         * in which the target address isn't readable.
 386         */
 387
 388        preempt_disable();
 389
 390        probe_kernel_read(&dummy, v, 1);
 391
 392        if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
 393                BUG();
 394
 395        if (!PageHighMem(page)) {
 396                void *av = __va(PFN_PHYS(pfn));
 397
 398                if (av != v)
 399                        if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
 400                                BUG();
 401        } else
 402                kmap_flush_unused();
 403
 404        preempt_enable();
 405}
 406
 407static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 408{
 409        const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
 410        int i;
 411
 412        /*
  413         * We need to mark all the aliases of the LDT pages RO.  We
  414         * don't need to call vm_flush_aliases(), though, since that's
  415         * only responsible for flushing aliases out of the TLBs, not the
 416         * page tables, and Xen will flush the TLB for us if needed.
 417         *
 418         * To avoid confusing future readers: none of this is necessary
 419         * to load the LDT.  The hypervisor only checks this when the
 420         * LDT is faulted in due to subsequent descriptor access.
 421         */
 422
 423        for (i = 0; i < entries; i += entries_per_page)
 424                set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
 425}
 426
 427static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
 428{
 429        const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
 430        int i;
 431
 432        for (i = 0; i < entries; i += entries_per_page)
 433                set_aliased_prot(ldt + i, PAGE_KERNEL);
 434}
 435
 436static void xen_set_ldt(const void *addr, unsigned entries)
 437{
 438        struct mmuext_op *op;
 439        struct multicall_space mcs = xen_mc_entry(sizeof(*op));
 440
 441        trace_xen_cpu_set_ldt(addr, entries);
 442
 443        op = mcs.args;
 444        op->cmd = MMUEXT_SET_LDT;
 445        op->arg1.linear_addr = (unsigned long)addr;
 446        op->arg2.nr_ents = entries;
 447
 448        MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 449
 450        xen_mc_issue(PARAVIRT_LAZY_CPU);
 451}
 452
 453static void xen_load_gdt(const struct desc_ptr *dtr)
 454{
 455        unsigned long va = dtr->address;
 456        unsigned int size = dtr->size + 1;
 457        unsigned long pfn, mfn;
 458        int level;
 459        pte_t *ptep;
 460        void *virt;
 461
 462        /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
 463        BUG_ON(size > PAGE_SIZE);
 464        BUG_ON(va & ~PAGE_MASK);
 465
 466        /*
 467         * The GDT is per-cpu and is in the percpu data area.
 468         * That can be virtually mapped, so we need to do a
 469         * page-walk to get the underlying MFN for the
 470         * hypercall.  The page can also be in the kernel's
 471         * linear range, so we need to RO that mapping too.
 472         */
 473        ptep = lookup_address(va, &level);
 474        BUG_ON(ptep == NULL);
 475
 476        pfn = pte_pfn(*ptep);
 477        mfn = pfn_to_mfn(pfn);
 478        virt = __va(PFN_PHYS(pfn));
 479
 480        make_lowmem_page_readonly((void *)va);
 481        make_lowmem_page_readonly(virt);
 482
 483        if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
 484                BUG();
 485}
 486
 487/*
 488 * load_gdt for early boot, when the gdt is only mapped once
 489 */
 490static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
 491{
 492        unsigned long va = dtr->address;
 493        unsigned int size = dtr->size + 1;
 494        unsigned long pfn, mfn;
 495        pte_t pte;
 496
 497        /* @size should be at most GDT_SIZE which is smaller than PAGE_SIZE. */
 498        BUG_ON(size > PAGE_SIZE);
 499        BUG_ON(va & ~PAGE_MASK);
 500
 501        pfn = virt_to_pfn(va);
 502        mfn = pfn_to_mfn(pfn);
 503
 504        pte = pfn_pte(pfn, PAGE_KERNEL_RO);
 505
 506        if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
 507                BUG();
 508
 509        if (HYPERVISOR_set_gdt(&mfn, size / sizeof(struct desc_struct)))
 510                BUG();
 511}
 512
 513static inline bool desc_equal(const struct desc_struct *d1,
 514                              const struct desc_struct *d2)
 515{
 516        return !memcmp(d1, d2, sizeof(*d1));
 517}
 518
 519static void load_TLS_descriptor(struct thread_struct *t,
 520                                unsigned int cpu, unsigned int i)
 521{
 522        struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
 523        struct desc_struct *gdt;
 524        xmaddr_t maddr;
 525        struct multicall_space mc;
 526
 527        if (desc_equal(shadow, &t->tls_array[i]))
 528                return;
 529
 530        *shadow = t->tls_array[i];
 531
 532        gdt = get_cpu_gdt_rw(cpu);
 533        maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
 534        mc = __xen_mc_entry(0);
 535
 536        MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
 537}
 538
 539static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 540{
 541        /*
 542         * XXX sleazy hack: If we're being called in a lazy-cpu zone
 543         * and lazy gs handling is enabled, it means we're in a
 544         * context switch, and %gs has just been saved.  This means we
 545         * can zero it out to prevent faults on exit from the
 546         * hypervisor if the next process has no %gs.  Either way, it
 547         * has been saved, and the new value will get loaded properly.
 548         * This will go away as soon as Xen has been modified to not
 549         * save/restore %gs for normal hypercalls.
 550         *
 551         * On x86_64, this hack is not used for %gs, because gs points
 552         * to KERNEL_GS_BASE (and uses it for PDA references), so we
 553         * must not zero %gs on x86_64
 554         *
 555         * For x86_64, we need to zero %fs, otherwise we may get an
 556         * exception between the new %fs descriptor being loaded and
 557         * %fs being effectively cleared at __switch_to().
 558         */
 559        if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
 560#ifdef CONFIG_X86_32
 561                lazy_load_gs(0);
 562#else
 563                loadsegment(fs, 0);
 564#endif
 565        }
 566
 567        xen_mc_batch();
 568
 569        load_TLS_descriptor(t, cpu, 0);
 570        load_TLS_descriptor(t, cpu, 1);
 571        load_TLS_descriptor(t, cpu, 2);
 572
 573        xen_mc_issue(PARAVIRT_LAZY_CPU);
 574}
 575
 576#ifdef CONFIG_X86_64
 577static void xen_load_gs_index(unsigned int idx)
 578{
 579        if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
 580                BUG();
 581}
 582#endif
 583
 584static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 585                                const void *ptr)
 586{
 587        xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
 588        u64 entry = *(u64 *)ptr;
 589
 590        trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
 591
 592        preempt_disable();
 593
 594        xen_mc_flush();
 595        if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
 596                BUG();
 597
 598        preempt_enable();
 599}
 600
 601#ifdef CONFIG_X86_64
 602struct trap_array_entry {
 603        void (*orig)(void);
 604        void (*xen)(void);
 605        bool ist_okay;
 606};
 607
 608static struct trap_array_entry trap_array[] = {
 609        { debug,                       xen_xendebug,                    true },
 610        { double_fault,                xen_double_fault,                true },
 611#ifdef CONFIG_X86_MCE
 612        { machine_check,               xen_machine_check,               true },
 613#endif
 614        { nmi,                         xen_xennmi,                      true },
 615        { int3,                        xen_int3,                        false },
 616        { overflow,                    xen_overflow,                    false },
 617#ifdef CONFIG_IA32_EMULATION
 618        { entry_INT80_compat,          xen_entry_INT80_compat,          false },
 619#endif
 620        { page_fault,                  xen_page_fault,                  false },
 621        { divide_error,                xen_divide_error,                false },
 622        { bounds,                      xen_bounds,                      false },
 623        { invalid_op,                  xen_invalid_op,                  false },
 624        { device_not_available,        xen_device_not_available,        false },
 625        { coprocessor_segment_overrun, xen_coprocessor_segment_overrun, false },
 626        { invalid_TSS,                 xen_invalid_TSS,                 false },
 627        { segment_not_present,         xen_segment_not_present,         false },
 628        { stack_segment,               xen_stack_segment,               false },
 629        { general_protection,          xen_general_protection,          false },
 630        { spurious_interrupt_bug,      xen_spurious_interrupt_bug,      false },
 631        { coprocessor_error,           xen_coprocessor_error,           false },
 632        { alignment_check,             xen_alignment_check,             false },
 633        { simd_coprocessor_error,      xen_simd_coprocessor_error,      false },
 634};
 635
 636static bool __ref get_trap_addr(void **addr, unsigned int ist)
 637{
 638        unsigned int nr;
 639        bool ist_okay = false;
 640
 641        /*
 642         * Replace trap handler addresses by Xen specific ones.
 643         * Check for known traps using IST and whitelist them.
 644         * The debugger ones are the only ones we care about.
  645         * Xen will handle faults like double_fault, so we should never see
 646         * them.  Warn if there's an unexpected IST-using fault handler.
 647         */
 648        for (nr = 0; nr < ARRAY_SIZE(trap_array); nr++) {
 649                struct trap_array_entry *entry = trap_array + nr;
 650
 651                if (*addr == entry->orig) {
 652                        *addr = entry->xen;
 653                        ist_okay = entry->ist_okay;
 654                        break;
 655                }
 656        }
 657
 658        if (nr == ARRAY_SIZE(trap_array) &&
 659            *addr >= (void *)early_idt_handler_array[0] &&
 660            *addr < (void *)early_idt_handler_array[NUM_EXCEPTION_VECTORS]) {
 661                nr = (*addr - (void *)early_idt_handler_array[0]) /
 662                     EARLY_IDT_HANDLER_SIZE;
 663                *addr = (void *)xen_early_idt_handler_array[nr];
 664        }
 665
 666        if (WARN_ON(ist != 0 && !ist_okay))
 667                return false;
 668
 669        return true;
 670}
 671#endif
 672
 673static int cvt_gate_to_trap(int vector, const gate_desc *val,
 674                            struct trap_info *info)
 675{
 676        unsigned long addr;
 677
 678        if (val->bits.type != GATE_TRAP && val->bits.type != GATE_INTERRUPT)
 679                return 0;
 680
 681        info->vector = vector;
 682
 683        addr = gate_offset(val);
 684#ifdef CONFIG_X86_64
 685        if (!get_trap_addr((void **)&addr, val->bits.ist))
 686                return 0;
 687#endif  /* CONFIG_X86_64 */
 688        info->address = addr;
 689
 690        info->cs = gate_segment(val);
 691        info->flags = val->bits.dpl;
 692        /* interrupt gates clear IF */
 693        if (val->bits.type == GATE_INTERRUPT)
 694                info->flags |= 1 << 2;
 695
 696        return 1;
 697}
 698
 699/* Locations of each CPU's IDT */
 700static DEFINE_PER_CPU(struct desc_ptr, idt_desc);
 701
 702/* Set an IDT entry.  If the entry is part of the current IDT, then
 703   also update Xen. */
 704static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
 705{
 706        unsigned long p = (unsigned long)&dt[entrynum];
 707        unsigned long start, end;
 708
 709        trace_xen_cpu_write_idt_entry(dt, entrynum, g);
 710
 711        preempt_disable();
 712
 713        start = __this_cpu_read(idt_desc.address);
 714        end = start + __this_cpu_read(idt_desc.size) + 1;
 715
 716        xen_mc_flush();
 717
 718        native_write_idt_entry(dt, entrynum, g);
 719
 720        if (p >= start && (p + 8) <= end) {
 721                struct trap_info info[2];
 722
 723                info[1].address = 0;
 724
 725                if (cvt_gate_to_trap(entrynum, g, &info[0]))
 726                        if (HYPERVISOR_set_trap_table(info))
 727                                BUG();
 728        }
 729
 730        preempt_enable();
 731}
 732
 733static void xen_convert_trap_info(const struct desc_ptr *desc,
 734                                  struct trap_info *traps)
 735{
 736        unsigned in, out, count;
 737
 738        count = (desc->size+1) / sizeof(gate_desc);
 739        BUG_ON(count > 256);
 740
 741        for (in = out = 0; in < count; in++) {
 742                gate_desc *entry = (gate_desc *)(desc->address) + in;
 743
 744                if (cvt_gate_to_trap(in, entry, &traps[out]))
 745                        out++;
 746        }
 747        traps[out].address = 0;
 748}
 749
 750void xen_copy_trap_info(struct trap_info *traps)
 751{
 752        const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
 753
 754        xen_convert_trap_info(desc, traps);
 755}
 756
 757/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
 758   hold a spinlock to protect the static traps[] array (static because
 759   it avoids allocation, and saves stack space). */
 760static void xen_load_idt(const struct desc_ptr *desc)
 761{
 762        static DEFINE_SPINLOCK(lock);
 763        static struct trap_info traps[257];
 764
 765        trace_xen_cpu_load_idt(desc);
 766
 767        spin_lock(&lock);
 768
 769        memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
 770
 771        xen_convert_trap_info(desc, traps);
 772
 773        xen_mc_flush();
 774        if (HYPERVISOR_set_trap_table(traps))
 775                BUG();
 776
 777        spin_unlock(&lock);
 778}
 779
 780/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
 781   they're handled differently. */
 782static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 783                                const void *desc, int type)
 784{
 785        trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
 786
 787        preempt_disable();
 788
 789        switch (type) {
 790        case DESC_LDT:
 791        case DESC_TSS:
 792                /* ignore */
 793                break;
 794
 795        default: {
 796                xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
 797
 798                xen_mc_flush();
 799                if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
 800                        BUG();
 801        }
 802
 803        }
 804
 805        preempt_enable();
 806}
 807
 808/*
 809 * Version of write_gdt_entry for use at early boot-time needed to
 810 * update an entry as simply as possible.
 811 */
 812static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 813                                            const void *desc, int type)
 814{
 815        trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
 816
 817        switch (type) {
 818        case DESC_LDT:
 819        case DESC_TSS:
 820                /* ignore */
 821                break;
 822
 823        default: {
 824                xmaddr_t maddr = virt_to_machine(&dt[entry]);
 825
 826                if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
 827                        dt[entry] = *(struct desc_struct *)desc;
 828        }
 829
 830        }
 831}
 832
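     /*
      * Switch the kernel stack via a stack_switch multicall and keep the
      * cached copy in cpu_tss_rw.x86_tss.sp0 in sync with the hypervisor.
      */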
 833static void xen_load_sp0(unsigned long sp0)
 834{
 835        struct multicall_space mcs;
 836
 837        mcs = xen_mc_entry(0);
 838        MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
 839        xen_mc_issue(PARAVIRT_LAZY_CPU);
 840        this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 841}
 842
 843#ifdef CONFIG_X86_IOPL_IOPERM
 844static void xen_update_io_bitmap(void)
 845{
 846        struct physdev_set_iobitmap iobitmap;
 847        struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
 848
 849        native_tss_update_io_bitmap();
 850
 851        iobitmap.bitmap = (uint8_t *)(&tss->x86_tss) +
 852                          tss->x86_tss.io_bitmap_base;
 853        if (tss->x86_tss.io_bitmap_base == IO_BITMAP_OFFSET_INVALID)
 854                iobitmap.nr_ports = 0;
 855        else
 856                iobitmap.nr_ports = IO_BITMAP_BITS;
 857
 858        HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap);
 859}
 860#endif
 861
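     /* Nothing to do: a PV guest has no legacy hardware needing an I/O port delay. */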
 862static void xen_io_delay(void)
 863{
 864}
 865
 866static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
 867
 868static unsigned long xen_read_cr0(void)
 869{
 870        unsigned long cr0 = this_cpu_read(xen_cr0_value);
 871
 872        if (unlikely(cr0 == 0)) {
 873                cr0 = native_read_cr0();
 874                this_cpu_write(xen_cr0_value, cr0);
 875        }
 876
 877        return cr0;
 878}
 879
 880static void xen_write_cr0(unsigned long cr0)
 881{
 882        struct multicall_space mcs;
 883
 884        this_cpu_write(xen_cr0_value, cr0);
 885
 886        /* Only pay attention to cr0.TS; everything else is
 887           ignored. */
 888        mcs = xen_mc_entry(0);
 889
 890        MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
 891
 892        xen_mc_issue(PARAVIRT_LAZY_CPU);
 893}
 894
 895static void xen_write_cr4(unsigned long cr4)
 896{
 897        cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
 898
 899        native_write_cr4(cr4);
 900}
 901
 902static u64 xen_read_msr_safe(unsigned int msr, int *err)
 903{
 904        u64 val;
 905
 906        if (pmu_msr_read(msr, &val, err))
 907                return val;
 908
 909        val = native_read_msr_safe(msr, err);
 910        switch (msr) {
 911        case MSR_IA32_APICBASE:
 912                val &= ~X2APIC_ENABLE;
 913                break;
 914        }
 915        return val;
 916}
 917
 918static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 919{
 920        int ret;
 921#ifdef CONFIG_X86_64
 922        unsigned int which;
 923        u64 base;
 924#endif
 925
 926        ret = 0;
 927
 928        switch (msr) {
 929#ifdef CONFIG_X86_64
 930        case MSR_FS_BASE:               which = SEGBASE_FS; goto set;
 931        case MSR_KERNEL_GS_BASE:        which = SEGBASE_GS_USER; goto set;
 932        case MSR_GS_BASE:               which = SEGBASE_GS_KERNEL; goto set;
 933
 934        set:
 935                base = ((u64)high << 32) | low;
 936                if (HYPERVISOR_set_segment_base(which, base) != 0)
 937                        ret = -EIO;
 938                break;
 939#endif
 940
 941        case MSR_STAR:
 942        case MSR_CSTAR:
 943        case MSR_LSTAR:
 944        case MSR_SYSCALL_MASK:
 945        case MSR_IA32_SYSENTER_CS:
 946        case MSR_IA32_SYSENTER_ESP:
 947        case MSR_IA32_SYSENTER_EIP:
 948                /* Fast syscall setup is all done in hypercalls, so
 949                   these are all ignored.  Stub them out here to stop
 950                   Xen console noise. */
 951                break;
 952
 953        default:
 954                if (!pmu_msr_write(msr, low, high, &ret))
 955                        ret = native_write_msr_safe(msr, low, high);
 956        }
 957
 958        return ret;
 959}
 960
 961static u64 xen_read_msr(unsigned int msr)
 962{
 963        /*
 964         * This will silently swallow a #GP from RDMSR.  It may be worth
 965         * changing that.
 966         */
 967        int err;
 968
 969        return xen_read_msr_safe(msr, &err);
 970}
 971
 972static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
 973{
 974        /*
 975         * This will silently swallow a #GP from WRMSR.  It may be worth
 976         * changing that.
 977         */
 978        xen_write_msr_safe(msr, low, high);
 979}
 980
 981/* This is called once we have the cpu_possible_mask */
 982void __init xen_setup_vcpu_info_placement(void)
 983{
 984        int cpu;
 985
 986        for_each_possible_cpu(cpu) {
 987                /* Set up direct vCPU id mapping for PV guests. */
 988                per_cpu(xen_vcpu_id, cpu) = cpu;
 989
 990                /*
 991                 * xen_vcpu_setup(cpu) can fail  -- in which case it
 992                 * falls back to the shared_info version for cpus
 993                 * where xen_vcpu_nr(cpu) < MAX_VIRT_CPUS.
 994                 *
 995                 * xen_cpu_up_prepare_pv() handles the rest by failing
 996                 * them in hotplug.
 997                 */
 998                (void) xen_vcpu_setup(cpu);
 999        }
1000
1001        /*
1002         * xen_vcpu_setup managed to place the vcpu_info within the
1003         * percpu area for all cpus, so make use of it.
1004         */
1005        if (xen_have_vcpu_info_placement) {
1006                pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1007                pv_ops.irq.restore_fl =
1008                        __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1009                pv_ops.irq.irq_disable =
1010                        __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
1011                pv_ops.irq.irq_enable =
1012                        __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
1013                pv_ops.mmu.read_cr2 =
1014                        __PV_IS_CALLEE_SAVE(xen_read_cr2_direct);
1015        }
1016}
1017
1018static const struct pv_info xen_info __initconst = {
1019        .shared_kernel_pmd = 0,
1020
1021#ifdef CONFIG_X86_64
1022        .extra_user_64bit_cs = FLAT_USER_CS64,
1023#endif
1024        .name = "Xen",
1025};
1026
1027static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1028        .cpuid = xen_cpuid,
1029
1030        .set_debugreg = xen_set_debugreg,
1031        .get_debugreg = xen_get_debugreg,
1032
1033        .read_cr0 = xen_read_cr0,
1034        .write_cr0 = xen_write_cr0,
1035
1036        .write_cr4 = xen_write_cr4,
1037
1038        .wbinvd = native_wbinvd,
1039
1040        .read_msr = xen_read_msr,
1041        .write_msr = xen_write_msr,
1042
1043        .read_msr_safe = xen_read_msr_safe,
1044        .write_msr_safe = xen_write_msr_safe,
1045
1046        .read_pmc = xen_read_pmc,
1047
1048        .iret = xen_iret,
1049#ifdef CONFIG_X86_64
1050        .usergs_sysret64 = xen_sysret64,
1051#endif
1052
1053        .load_tr_desc = paravirt_nop,
1054        .set_ldt = xen_set_ldt,
1055        .load_gdt = xen_load_gdt,
1056        .load_idt = xen_load_idt,
1057        .load_tls = xen_load_tls,
1058#ifdef CONFIG_X86_64
1059        .load_gs_index = xen_load_gs_index,
1060#endif
1061
1062        .alloc_ldt = xen_alloc_ldt,
1063        .free_ldt = xen_free_ldt,
1064
1065        .store_tr = xen_store_tr,
1066
1067        .write_ldt_entry = xen_write_ldt_entry,
1068        .write_gdt_entry = xen_write_gdt_entry,
1069        .write_idt_entry = xen_write_idt_entry,
1070        .load_sp0 = xen_load_sp0,
1071
1072#ifdef CONFIG_X86_IOPL_IOPERM
1073        .update_io_bitmap = xen_update_io_bitmap,
1074#endif
1075        .io_delay = xen_io_delay,
1076
1077        /* Xen takes care of %gs when switching to usermode for us */
1078        .swapgs = paravirt_nop,
1079
1080        .start_context_switch = paravirt_start_context_switch,
1081        .end_context_switch = xen_end_context_switch,
1082};
1083
1084static void xen_restart(char *msg)
1085{
1086        xen_reboot(SHUTDOWN_reboot);
1087}
1088
1089static void xen_machine_halt(void)
1090{
1091        xen_reboot(SHUTDOWN_poweroff);
1092}
1093
1094static void xen_machine_power_off(void)
1095{
1096        if (pm_power_off)
1097                pm_power_off();
1098        xen_reboot(SHUTDOWN_poweroff);
1099}
1100
1101static void xen_crash_shutdown(struct pt_regs *regs)
1102{
1103        xen_reboot(SHUTDOWN_crash);
1104}
1105
1106static const struct machine_ops xen_machine_ops __initconst = {
1107        .restart = xen_restart,
1108        .halt = xen_machine_halt,
1109        .power_off = xen_machine_power_off,
1110        .shutdown = xen_machine_halt,
1111        .crash_shutdown = xen_crash_shutdown,
1112        .emergency_restart = xen_emergency_restart,
1113};
1114
1115static unsigned char xen_get_nmi_reason(void)
1116{
1117        unsigned char reason = 0;
1118
1119        /* Construct a value which looks like it came from port 0x61. */
1120        if (test_bit(_XEN_NMIREASON_io_error,
1121                     &HYPERVISOR_shared_info->arch.nmi_reason))
1122                reason |= NMI_REASON_IOCHK;
1123        if (test_bit(_XEN_NMIREASON_pci_serr,
1124                     &HYPERVISOR_shared_info->arch.nmi_reason))
1125                reason |= NMI_REASON_SERR;
1126
1127        return reason;
1128}
1129
1130static void __init xen_boot_params_init_edd(void)
1131{
1132#if IS_ENABLED(CONFIG_EDD)
1133        struct xen_platform_op op;
1134        struct edd_info *edd_info;
1135        u32 *mbr_signature;
1136        unsigned nr;
1137        int ret;
1138
1139        edd_info = boot_params.eddbuf;
1140        mbr_signature = boot_params.edd_mbr_sig_buffer;
1141
1142        op.cmd = XENPF_firmware_info;
1143
1144        op.u.firmware_info.type = XEN_FW_DISK_INFO;
1145        for (nr = 0; nr < EDDMAXNR; nr++) {
1146                struct edd_info *info = edd_info + nr;
1147
1148                op.u.firmware_info.index = nr;
1149                info->params.length = sizeof(info->params);
1150                set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
1151                                     &info->params);
1152                ret = HYPERVISOR_platform_op(&op);
1153                if (ret)
1154                        break;
1155
1156#define C(x) info->x = op.u.firmware_info.u.disk_info.x
1157                C(device);
1158                C(version);
1159                C(interface_support);
1160                C(legacy_max_cylinder);
1161                C(legacy_max_head);
1162                C(legacy_sectors_per_track);
1163#undef C
1164        }
1165        boot_params.eddbuf_entries = nr;
1166
1167        op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
1168        for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
1169                op.u.firmware_info.index = nr;
1170                ret = HYPERVISOR_platform_op(&op);
1171                if (ret)
1172                        break;
1173                mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
1174        }
1175        boot_params.edd_mbr_sig_buf_entries = nr;
1176#endif
1177}
1178
1179/*
1180 * Set up the GDT and segment registers for -fstack-protector.  Until
1181 * we do this, we have to be careful not to call any stack-protected
1182 * function, which is most of the kernel.
1183 */
1184static void __init xen_setup_gdt(int cpu)
1185{
1186        pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
1187        pv_ops.cpu.load_gdt = xen_load_gdt_boot;
1188
1189        setup_stack_canary_segment(cpu);
1190        switch_to_new_gdt(cpu);
1191
1192        pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
1193        pv_ops.cpu.load_gdt = xen_load_gdt;
1194}
1195
1196static void __init xen_dom0_set_legacy_features(void)
1197{
1198        x86_platform.legacy.rtc = 1;
1199}
1200
1201/* First C function to be called on Xen boot */
1202asmlinkage __visible void __init xen_start_kernel(void)
1203{
1204        struct physdev_set_iopl set_iopl;
1205        unsigned long initrd_start = 0;
1206        int rc;
1207
1208        if (!xen_start_info)
1209                return;
1210
1211        xen_domain_type = XEN_PV_DOMAIN;
1212        xen_start_flags = xen_start_info->flags;
1213
1214        xen_setup_features();
1215
1216        /* Install Xen paravirt ops */
1217        pv_info = xen_info;
1218        pv_ops.init.patch = paravirt_patch_default;
1219        pv_ops.cpu = xen_cpu_ops;
1220        xen_init_irq_ops();
1221
1222        /*
1223         * Setup xen_vcpu early because it is needed for
1224         * local_irq_disable(), irqs_disabled(), e.g. in printk().
1225         *
1226         * Don't do the full vcpu_info placement stuff until we have
1227         * the cpu_possible_mask and a non-dummy shared_info.
1228         */
1229        xen_vcpu_info_reset(0);
1230
1231        x86_platform.get_nmi_reason = xen_get_nmi_reason;
1232
1233        x86_init.resources.memory_setup = xen_memory_setup;
1234        x86_init.irqs.intr_mode_select  = x86_init_noop;
1235        x86_init.irqs.intr_mode_init    = x86_init_noop;
1236        x86_init.oem.arch_setup = xen_arch_setup;
1237        x86_init.oem.banner = xen_banner;
1238        x86_init.hyper.init_platform = xen_pv_init_platform;
1239        x86_init.hyper.guest_late_init = xen_pv_guest_late_init;
1240
1241        /*
1242         * Set up some pagetable state before starting to set any ptes.
1243         */
1244
1245        xen_setup_machphys_mapping();
1246        xen_init_mmu_ops();
1247
1248        /* Prevent unwanted bits from being set in PTEs. */
1249        __supported_pte_mask &= ~_PAGE_GLOBAL;
1250        __default_kernel_pte_mask &= ~_PAGE_GLOBAL;
1251
1252        /*
1253         * Prevent page tables from being allocated in highmem, even
1254         * if CONFIG_HIGHPTE is enabled.
1255         */
1256        __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
1257
1258        /* Get mfn list */
1259        xen_build_dynamic_phys_to_machine();
1260
1261        /*
1262         * Set up kernel GDT and segment registers, mainly so that
1263         * -fstack-protector code can be executed.
1264         */
1265        xen_setup_gdt(0);
1266
1267        /* Work out if we support NX */
1268        get_cpu_cap(&boot_cpu_data);
1269        x86_configure_nx();
1270
1271        /* Determine virtual and physical address sizes */
1272        get_cpu_address_sizes(&boot_cpu_data);
1273
1274        /* Let's presume PV guests always boot on vCPU with id 0. */
1275        per_cpu(xen_vcpu_id, 0) = 0;
1276
1277        idt_setup_early_handler();
1278
1279        xen_init_capabilities();
1280
1281#ifdef CONFIG_X86_LOCAL_APIC
1282        /*
1283         * set up the basic apic ops.
1284         */
1285        xen_init_apic();
1286#endif
1287
1288        if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1289                pv_ops.mmu.ptep_modify_prot_start =
1290                        xen_ptep_modify_prot_start;
1291                pv_ops.mmu.ptep_modify_prot_commit =
1292                        xen_ptep_modify_prot_commit;
1293        }
1294
1295        machine_ops = xen_machine_ops;
1296
1297        /*
1298         * The only reliable way to retain the initial address of the
1299         * percpu gdt_page is to remember it here, so we can go and
1300         * mark it RW later, when the initial percpu area is freed.
1301         */
1302        xen_initial_gdt = &per_cpu(gdt_page, 0);
1303
1304        xen_smp_init();
1305
1306#ifdef CONFIG_ACPI_NUMA
1307        /*
 1308         * The pages we get from Xen are not related to machine pages, so
1309         * any NUMA information the kernel tries to get from ACPI will
1310         * be meaningless.  Prevent it from trying.
1311         */
1312        acpi_numa = -1;
1313#endif
1314        WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
1315
1316        local_irq_disable();
1317        early_boot_irqs_disabled = true;
1318
1319        xen_raw_console_write("mapping kernel into physical memory\n");
1320        xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
1321                                   xen_start_info->nr_pages);
1322        xen_reserve_special_pages();
1323
1324        /* keep using Xen gdt for now; no urgent need to change it */
1325
1326#ifdef CONFIG_X86_32
1327        pv_info.kernel_rpl = 1;
1328        if (xen_feature(XENFEAT_supervisor_mode_kernel))
1329                pv_info.kernel_rpl = 0;
1330#else
1331        pv_info.kernel_rpl = 0;
1332#endif
1333        /* set the limit of our address space */
1334        xen_reserve_top();
1335
1336        /*
1337         * We used to do this in xen_arch_setup, but that is too late
 1338         * on AMD where early_cpu_init (run before ->arch_setup()) calls
1339         * early_amd_init which pokes 0xcf8 port.
1340         */
1341        set_iopl.iopl = 1;
1342        rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1343        if (rc != 0)
1344                xen_raw_printk("physdev_op failed %d\n", rc);
1345
1346#ifdef CONFIG_X86_32
1347        /* set up basic CPUID stuff */
1348        cpu_detect(&new_cpu_data);
1349        set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
1350        new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
1351#endif
1352
1353        if (xen_start_info->mod_start) {
1354            if (xen_start_info->flags & SIF_MOD_START_PFN)
1355                initrd_start = PFN_PHYS(xen_start_info->mod_start);
1356            else
1357                initrd_start = __pa(xen_start_info->mod_start);
1358        }
1359
1360        /* Poke various useful things into boot_params */
1361        boot_params.hdr.type_of_loader = (9 << 4) | 0;
1362        boot_params.hdr.ramdisk_image = initrd_start;
1363        boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1364        boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1365        boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;
1366
1367        if (!xen_initial_domain()) {
1368                add_preferred_console("xenboot", 0, NULL);
1369                if (pci_xen)
1370                        x86_init.pci.arch_init = pci_xen_init;
1371        } else {
1372                const struct dom0_vga_console_info *info =
1373                        (void *)((char *)xen_start_info +
1374                                 xen_start_info->console.dom0.info_off);
1375                struct xen_platform_op op = {
1376                        .cmd = XENPF_firmware_info,
1377                        .interface_version = XENPF_INTERFACE_VERSION,
1378                        .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
1379                };
1380
1381                x86_platform.set_legacy_features =
1382                                xen_dom0_set_legacy_features;
1383                xen_init_vga(info, xen_start_info->console.dom0.info_size);
1384                xen_start_info->console.domU.mfn = 0;
1385                xen_start_info->console.domU.evtchn = 0;
1386
1387                if (HYPERVISOR_platform_op(&op) == 0)
1388                        boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
1389
1390                /* Make sure ACS will be enabled */
1391                pci_request_acs();
1392
1393                xen_acpi_sleep_register();
1394
1395                /* Avoid searching for BIOS MP tables */
1396                x86_init.mpparse.find_smp_config = x86_init_noop;
1397                x86_init.mpparse.get_smp_config = x86_init_uint_noop;
1398
1399                xen_boot_params_init_edd();
1400        }
1401
1402        if (!boot_params.screen_info.orig_video_isVGA)
1403                add_preferred_console("tty", 0, NULL);
1404        add_preferred_console("hvc", 0, NULL);
1405        if (boot_params.screen_info.orig_video_isVGA)
1406                add_preferred_console("tty", 0, NULL);
1407
1408#ifdef CONFIG_PCI
1409        /* PCI BIOS service won't work from a PV guest. */
1410        pci_probe &= ~PCI_PROBE_BIOS;
1411#endif
1412        xen_raw_console_write("about to get started...\n");
1413
1414        /* We need this for printk timestamps */
1415        xen_setup_runstate_info(0);
1416
1417        xen_efi_init(&boot_params);
1418
1419        /* Start the world */
1420#ifdef CONFIG_X86_32
1421        i386_start_kernel();
1422#else
1423        cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
1424        x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1425#endif
1426}
1427
1428static int xen_cpu_up_prepare_pv(unsigned int cpu)
1429{
1430        int rc;
1431
1432        if (per_cpu(xen_vcpu, cpu) == NULL)
1433                return -ENODEV;
1434
1435        xen_setup_timer(cpu);
1436
1437        rc = xen_smp_intr_init(cpu);
1438        if (rc) {
1439                WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
1440                     cpu, rc);
1441                return rc;
1442        }
1443
1444        rc = xen_smp_intr_init_pv(cpu);
1445        if (rc) {
1446                WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
1447                     cpu, rc);
1448                return rc;
1449        }
1450
1451        return 0;
1452}
1453
1454static int xen_cpu_dead_pv(unsigned int cpu)
1455{
1456        xen_smp_intr_free(cpu);
1457        xen_smp_intr_free_pv(cpu);
1458
1459        xen_teardown_timer(cpu);
1460
1461        return 0;
1462}
1463
1464static uint32_t __init xen_platform_pv(void)
1465{
1466        if (xen_pv_domain())
1467                return xen_cpuid_base();
1468
1469        return 0;
1470}
1471
1472const __initconst struct hypervisor_x86 x86_hyper_xen_pv = {
1473        .name                   = "Xen PV",
1474        .detect                 = xen_platform_pv,
1475        .type                   = X86_HYPER_XEN_PV,
1476        .runtime.pin_vcpu       = xen_pin_vcpu,
1477        .ignore_nopv            = true,
1478};
1479