linux/arch/x86/kernel/smpboot.c
   1 /*
   2 *      x86 SMP booting functions
   3 *
   4 *      (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
   5 *      (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
   6 *      Copyright 2001 Andi Kleen, SuSE Labs.
   7 *
   8 *      Much of the core SMP work is based on previous work by Thomas Radke, to
   9 *      whom a great many thanks are extended.
  10 *
  11 *      Thanks to Intel for making available several different Pentium,
  12 *      Pentium Pro and Pentium-II/Xeon MP machines.
  13 *      Original development of Linux SMP code supported by Caldera.
  14 *
  15 *      This code is released under the GNU General Public License version 2 or
  16 *      later.
  17 *
  18 *      Fixes
  19 *              Felix Koop      :       NR_CPUS used properly
  20 *              Jose Renau      :       Handle single CPU case.
  21 *              Alan Cox        :       By repeated request 8) - Total BogoMIPS report.
  22 *              Greg Wright     :       Fix for kernel stacks panic.
  23 *              Erich Boleyn    :       MP v1.4 and additional changes.
  24 *      Matthias Sattler        :       Changes for 2.1 kernel map.
  25 *      Michel Lespinasse       :       Changes for 2.1 kernel map.
  26 *      Michael Chastain        :       Change trampoline.S to gnu as.
  27 *              Alan Cox        :       Dumb bug: 'B' step PPro's are fine
  28 *              Ingo Molnar     :       Added APIC timers, based on code
  29 *                                      from Jose Renau
  30 *              Ingo Molnar     :       various cleanups and rewrites
  31 *              Tigran Aivazian :       fixed "0.00 in /proc/uptime on SMP" bug.
  32 *      Maciej W. Rozycki       :       Bits for genuine 82489DX APICs
  33 *      Andi Kleen              :       Changed for SMP boot into long mode.
  34 *              Martin J. Bligh :       Added support for multi-quad systems
  35 *              Dave Jones      :       Report invalid combinations of Athlon CPUs.
  36 *              Rusty Russell   :       Hacked into shape for new "hotplug" boot process.
  37 *      Andi Kleen              :       Converted to new state machine.
  38 *      Ashok Raj               :       CPU hotplug support
  39 *      Glauber Costa           :       i386 and x86_64 integration
  40 */
  41
  42#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  43
  44#include <linux/init.h>
  45#include <linux/smp.h>
  46#include <linux/module.h>
  47#include <linux/sched.h>
  48#include <linux/percpu.h>
  49#include <linux/bootmem.h>
  50#include <linux/err.h>
  51#include <linux/nmi.h>
  52#include <linux/tboot.h>
  53#include <linux/stackprotector.h>
  54#include <linux/gfp.h>
  55#include <linux/cpuidle.h>
  56
  57#include <asm/acpi.h>
  58#include <asm/desc.h>
  59#include <asm/nmi.h>
  60#include <asm/irq.h>
  61#include <asm/idle.h>
  62#include <asm/realmode.h>
  63#include <asm/cpu.h>
  64#include <asm/numa.h>
  65#include <asm/pgtable.h>
  66#include <asm/tlbflush.h>
  67#include <asm/mtrr.h>
  68#include <asm/mwait.h>
  69#include <asm/apic.h>
  70#include <asm/io_apic.h>
  71#include <asm/i387.h>
  72#include <asm/fpu-internal.h>
  73#include <asm/setup.h>
  74#include <asm/uv/uv.h>
  75#include <linux/mc146818rtc.h>
  76
  77#include <asm/smpboot_hooks.h>
  78#include <asm/i8259.h>
  79
  80#include <asm/realmode.h>
  81
  82#include <asm/hypervisor.h>
  83
  84/* State of each CPU */
  85DEFINE_PER_CPU(int, cpu_state) = { 0 };
  86
  87/* Number of siblings per CPU package */
  88int smp_num_siblings = 1;
  89EXPORT_SYMBOL(smp_num_siblings);
  90
  91/* Last level cache ID of each logical CPU */
  92DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
  93
  94/* representing HT siblings of each logical CPU */
  95DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
  96EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
  97
  98/* representing HT and core siblings of each logical CPU */
  99DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 100EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 101
 102DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 103
 104/* Per CPU bogomips and other parameters */
 105DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
 106EXPORT_PER_CPU_SYMBOL(cpu_info);
 107DEFINE_PER_CPU_SHARED_ALIGNED(struct rh_cpuinfo_x86, rh_cpu_info);
 108EXPORT_PER_CPU_SYMBOL(rh_cpu_info);
 109
 110atomic_t init_deasserted;
 111/* Logical package management. We might want to allocate that dynamically */
 112unsigned int __max_logical_packages __read_mostly;
 113EXPORT_SYMBOL(__max_logical_packages);
 114static unsigned int logical_packages __read_mostly;
 115
 116/* Maximum number of SMT threads on any online core */
 117int __read_mostly __max_smt_threads = 1;
 118
 119/* Flag to indicate if a complete sched domain rebuild is required */
 120bool x86_topology_update;
 121
 122int arch_update_cpu_topology(void)
 123{
 124        int retval = x86_topology_update;
 125
 126        x86_topology_update = false;
 127        return retval;
 128}
 129
 130/*
 131 * Report back to the Boot Processor during boot time or to the caller processor
 132 * during CPU online.
 133 */
 134static void smp_callin(void)
 135{
 136        int cpuid, phys_id;
 137
 138        /*
  139         * If woken up by an INIT in an 82489DX configuration
 140         * we may get here before an INIT-deassert IPI reaches
 141         * our local APIC.  We have to wait for the IPI or we'll
 142         * lock up on an APIC access.
 143         *
  144         * Since CPU0 is not woken up by INIT, it doesn't wait for the IPI.
 145         */
 146        cpuid = smp_processor_id();
 147        if (apic->wait_for_init_deassert && cpuid != 0)
 148                apic->wait_for_init_deassert(&init_deasserted);
 149
 150        /*
 151         * (This works even if the APIC is not enabled.)
 152         */
 153        phys_id = read_apic_id();
 154
 155        /*
  156         * The boot CPU has finished the init stage and is spinning
  157         * on cpu_callin_mask until we finish. We are free to set up this
 158         * CPU, first the APIC. (this is probably redundant on most
 159         * boards)
 160         */
 161
 162        pr_debug("CALLIN, before setup_local_APIC()\n");
 163        if (apic->smp_callin_clear_local_apic)
 164                apic->smp_callin_clear_local_apic();
 165        setup_local_APIC();
 166        end_local_APIC_setup();
 167
 168        /*
 169         * Need to setup vector mappings before we enable interrupts.
 170         */
 171        setup_vector_irq(smp_processor_id());
 172
 173        /*
 174         * Save our processor parameters. Note: this information
 175         * is needed for clock calibration.
 176         */
 177        smp_store_cpu_info(cpuid);
 178
 179        /*
 180         * Get our bogomips.
 181         * Update loops_per_jiffy in cpu_data. Previous call to
 182         * smp_store_cpu_info() stored a value that is close but not as
 183         * accurate as the value just calculated.
 184         */
 185        calibrate_delay();
 186        cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
 187        pr_debug("Stack at about %p\n", &cpuid);
 188
 189        /*
 190         * This must be done before setting cpu_online_mask
 191         * or calling notify_cpu_starting.
 192         */
 193        set_cpu_sibling_map(raw_smp_processor_id());
 194        wmb();
 195
 196        notify_cpu_starting(cpuid);
 197
 198        /*
 199         * Allow the master to continue.
 200         */
 201        cpumask_set_cpu(cpuid, cpu_callin_mask);
 202}
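     /*
      * Rough sketch of the bring-up handshake this function participates in
      * (informal; see also do_boot_cpu() below): the BSP sends INIT/STARTUP
      * and waits for the AP to show up in cpu_initialized_mask (set from
      * cpu_init()), then sets cpu_callout_mask to let the AP continue, and
      * finally waits for cpu_callin_mask, which smp_callin() sets in its
      * last line.  Only after that does the AP mark itself online in
      * start_secondary().
      */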
 203
 204static int cpu0_logical_apicid;
 205static int enable_start_cpu0;
 206/*
 207 * Activate a secondary processor.
 208 */
 209static void notrace start_secondary(void *unused)
 210{
 211        /*
 212         * Don't put *anything* except direct CPU state initialization
  213         * before cpu_init(); SMP booting is fragile enough that we want to
  214         * limit the things done here to the bare minimum.
 215         */
 216        if (boot_cpu_has(X86_FEATURE_PCID))
 217                write_cr4(read_cr4() | X86_CR4_PCIDE);
 218        cpu_init();
 219        x86_cpuinit.early_percpu_clock_init();
 220        preempt_disable();
 221        smp_callin();
 222
 223        enable_start_cpu0 = 0;
 224
 225#ifdef CONFIG_X86_32
 226        /* switch away from the initial page table */
 227        load_cr3(swapper_pg_dir);
 228        __flush_tlb_all();
 229#endif
 230
 231        /* otherwise gcc will move up smp_processor_id before the cpu_init */
 232        barrier();
 233        /*
 234         * Check TSC synchronization with the BP:
 235         */
 236        check_tsc_sync_target();
 237
 238        /*
  239         * We need to hold vector_lock so that the set of online cpus
 240         * does not change while we are assigning vectors to cpus.  Holding
 241         * this lock ensures we don't half assign or remove an irq from a cpu.
 242         */
 243        lock_vector_lock();
 244        set_cpu_online(smp_processor_id(), true);
 245        unlock_vector_lock();
 246        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 247        x86_platform.nmi_init();
 248
 249        /* enable local interrupts */
 250        local_irq_enable();
 251
 252        /* to prevent fake stack check failure in clock setup */
 253        boot_init_stack_canary();
 254
 255        x86_cpuinit.setup_percpu_clockev();
 256
 257        wmb();
 258        cpu_startup_entry(CPUHP_ONLINE);
 259}
 260
 261/**
  262 * topology_phys_to_logical_pkg - Map a physical package id to a logical package id
 263 *
 264 * Returns logical package id or -1 if not found
 265 */
 266int topology_phys_to_logical_pkg(unsigned int phys_pkg)
 267{
 268        int cpu;
 269
 270        for_each_possible_cpu(cpu) {
 271                struct cpuinfo_x86 *c = &cpu_data(cpu);
 272                struct rh_cpuinfo_x86 *rhc = &rh_cpu_data(cpu);
 273
 274                if (rhc->initialized && c->phys_proc_id == phys_pkg)
 275                        return rh_cpu_data(cpu).logical_proc_id;
 276        }
 277        return -1;
 278}
 279EXPORT_SYMBOL(topology_phys_to_logical_pkg);
 280
 281/**
 282 * topology_update_package_map - Update the physical to logical package map
 283 * @pkg:        The physical package id as retrieved via CPUID
 284 * @cpu:        The cpu for which this is updated
 285 */
 286int topology_update_package_map(unsigned int pkg, unsigned int cpu)
 287{
 288        int new;
 289
 290        /* Already available somewhere? */
 291        new = topology_phys_to_logical_pkg(pkg);
 292        if (new >= 0)
 293                goto found;
 294
 295        new = logical_packages++;
 296        if (new != pkg) {
 297                pr_info("CPU %u Converting physical %u to logical package %u\n",
 298                        cpu, pkg, new);
 299        }
 300found:
 301        rh_cpu_data(cpu).logical_proc_id = new;
 302        return 0;
 303}
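     /*
      * Illustration (not from the source): physical package ids need not be
      * dense.  On a two-socket box that enumerates packages 0 and 3, the map
      * built above hands out logical ids 0 and 1, so per-package consumers
      * (e.g. the uncore/RAPL perf drivers) can index arrays of size
      * __max_logical_packages without holes.
      */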
 304
 305void __init smp_store_boot_cpu_info(void)
 306{
 307        int id = 0; /* CPU 0 */
 308        struct cpuinfo_x86 *c = &cpu_data(id);
 309        struct rh_cpuinfo_x86 *rhc = &rh_cpu_data(id);
 310
 311        *c = boot_cpu_data;
 312        c->cpu_index = id;
 313        *rhc = rh_boot_cpu_data;
 314        topology_update_package_map(c->phys_proc_id, id);
 315        rhc->initialized = true;
 316}
 317
 318/*
 319 * The bootstrap kernel entry code has set these up. Save them for
 320 * a given CPU
 321 */
 322void smp_store_cpu_info(int id)
 323{
 324        struct cpuinfo_x86 *c = &cpu_data(id);
 325        struct rh_cpuinfo_x86 *rhc = &rh_cpu_data(id);
 326
 327        /* Copy boot_cpu_data only on the first bringup */
 328        if (!rhc->initialized)
 329                *c = boot_cpu_data;
 330        c->cpu_index = id;
 331        /*
 332         * During boot time, CPU0 has this setup already. Save the info when
 333         * bringing up AP or offlined CPU0.
 334         */
 335        identify_secondary_cpu(c);
 336        rhc->initialized = true;
 337}
 338
 339static bool
 340topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 341{
 342        int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 343
 344        return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
 345}
 346
 347static bool
 348topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 349{
 350        int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 351
 352        return !WARN_ONCE(!topology_same_node(c, o),
 353                "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
 354                "[node: %d != %d]. Ignoring dependency.\n",
 355                cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
 356}
 357
 358#define link_mask(_m, c1, c2)                                           \
 359do {                                                                    \
 360        cpumask_set_cpu((c1), cpu_##_m##_mask(c2));                     \
 361        cpumask_set_cpu((c2), cpu_##_m##_mask(c1));                     \
 362} while (0)
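     /*
      * link_mask() records a sibling relation symmetrically: for example,
      * link_mask(sibling, 2, 6) sets CPU 6 in cpu_sibling_mask(2) and CPU 2
      * in cpu_sibling_mask(6), so both CPUs see each other as SMT siblings.
      */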
 363
 364static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 365{
 366        if (cpu_has_topoext) {
 367                int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 368
 369                if (c->phys_proc_id == o->phys_proc_id &&
 370                    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
 371                        if (c->cpu_core_id == o->cpu_core_id)
 372                                return topology_sane(c, o, "smt");
 373
 374                        if ((c->cu_id != 0xff) &&
 375                            (o->cu_id != 0xff) &&
 376                            (c->cu_id == o->cu_id))
 377                                return topology_sane(c, o, "smt");
 378                }
 379
 380        } else if (c->phys_proc_id == o->phys_proc_id &&
 381                   c->cpu_core_id == o->cpu_core_id) {
 382                return topology_sane(c, o, "smt");
 383        }
 384
 385        return false;
 386}
 387
 388static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 389{
 390        int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 391
 392        if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
 393            per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
 394                return topology_sane(c, o, "llc");
 395
 396        return false;
 397}
 398
 399/*
 400 * Unlike the other levels, we do not enforce keeping a
 401 * multicore group inside a NUMA node.  If this happens, we will
 402 * discard the MC level of the topology later.
 403 */
 404static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 405{
 406        if (c->phys_proc_id == o->phys_proc_id)
 407                return true;
 408        return false;
 409}
 410
 411#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
 412static inline int x86_sched_itmt_flags(void)
 413{
 414        return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
 415}
 416
 417#ifdef CONFIG_SCHED_MC
 418static int x86_core_flags(void)
 419{
 420        return cpu_core_flags() | x86_sched_itmt_flags();
 421}
 422#endif
 423#ifdef CONFIG_SCHED_SMT
 424static int x86_smt_flags(void)
 425{
 426        return cpu_smt_flags() | x86_sched_itmt_flags();
 427}
 428#endif
 429#endif
 430
 431static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
 432#ifdef CONFIG_SCHED_SMT
 433        { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 434#endif
 435#ifdef CONFIG_SCHED_MC
 436        { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 437#endif
 438        { NULL, },
 439};
 440
 441static struct sched_domain_topology_level x86_topology[] = {
 442#ifdef CONFIG_SCHED_SMT
 443        { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 444#endif
 445#ifdef CONFIG_SCHED_MC
 446        { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 447#endif
 448        { cpu_cpu_mask, SD_INIT_NAME(DIE) },
 449        { NULL, },
 450};
 451
 452/*
 453 * Set if a package/die has multiple NUMA nodes inside.
 454 * AMD Magny-Cours and Intel Cluster-on-Die have this.
 455 */
 456static bool x86_has_numa_in_package;
 457
 458void set_cpu_sibling_map(int cpu)
 459{
 460        bool has_smt = smp_num_siblings > 1;
 461        bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
 462        struct cpuinfo_x86 *c = &cpu_data(cpu);
 463        struct cpuinfo_x86 *o;
 464        int i, threads;
 465
 466        cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 467
 468        if (!has_mp) {
 469                cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
 470                cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
 471                cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 472                c->booted_cores = 1;
 473                return;
 474        }
 475
 476        for_each_cpu(i, cpu_sibling_setup_mask) {
 477                o = &cpu_data(i);
 478
 479                if ((i == cpu) || (has_smt && match_smt(c, o)))
 480                        link_mask(sibling, cpu, i);
 481
 482                if ((i == cpu) || (has_mp && match_llc(c, o)))
 483                        link_mask(llc_shared, cpu, i);
 484
 485        }
 486
 487        /*
 488         * This needs a separate iteration over the cpus because we rely on all
 489         * cpu_sibling_mask links to be set-up.
 490         */
 491        for_each_cpu(i, cpu_sibling_setup_mask) {
 492                o = &cpu_data(i);
 493
 494                if ((i == cpu) || (has_mp && match_die(c, o))) {
 495                        link_mask(core, cpu, i);
 496
 497                        /*
  498                         *  Does this new cpu bring up a new core?
 499                         */
 500                        if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
 501                                /*
 502                                 * for each core in package, increment
 503                                 * the booted_cores for this new cpu
 504                                 */
 505                                if (cpumask_first(cpu_sibling_mask(i)) == i)
 506                                        c->booted_cores++;
 507                                /*
 508                                 * increment the core count for all
 509                                 * the other cpus in this package
 510                                 */
 511                                if (i != cpu)
 512                                        cpu_data(i).booted_cores++;
 513                        } else if (i != cpu && !c->booted_cores)
 514                                c->booted_cores = cpu_data(i).booted_cores;
 515                }
 516                if (match_die(c, o) && !topology_same_node(c, o))
 517                        x86_has_numa_in_package = true;
 518        }
 519
 520        threads = cpumask_weight(topology_sibling_cpumask(cpu));
 521        if (threads > __max_smt_threads)
 522                __max_smt_threads = threads;
 523}
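     /*
      * Note on the two passes above: the first loop establishes the SMT
      * (thread) and LLC sharing links; only once every cpu_sibling_mask is
      * complete can the second loop use cpumask_weight()/cpumask_first() on
      * them to decide whether this bringup added a brand new core and keep
      * booted_cores consistent across the whole package.
      */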
 524
 525/* maps the cpu to the sched domain representing multi-core */
 526const struct cpumask *cpu_coregroup_mask(int cpu)
 527{
 528        return cpu_llc_shared_mask(cpu);
 529}
 530
 531static void impress_friends(void)
 532{
 533        int cpu;
 534        unsigned long bogosum = 0;
 535        /*
 536         * Allow the user to impress friends.
 537         */
 538        pr_debug("Before bogomips\n");
 539        for_each_possible_cpu(cpu)
 540                if (cpumask_test_cpu(cpu, cpu_callout_mask))
 541                        bogosum += cpu_data(cpu).loops_per_jiffy;
 542        pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
 543                num_online_cpus(),
 544                bogosum/(500000/HZ),
 545                (bogosum/(5000/HZ))%100);
 546
 547        pr_debug("Before bogocount - setting activated=1\n");
 548}
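     /*
      * The BogoMIPS arithmetic above, spelled out: loops_per_jiffy counts
      * delay-loop iterations per timer tick, and one BogoMIPS corresponds to
      * 500000 iterations per second.  So bogosum / (500000 / HZ) yields the
      * integer part and bogosum / (5000 / HZ) % 100 the two decimal places;
      * e.g. a bogosum of 4800000 with HZ=1000 prints "9600.00 BogoMIPS".
      */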
 549
 550void __inquire_remote_apic(int apicid)
 551{
 552        unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
 553        const char * const names[] = { "ID", "VERSION", "SPIV" };
 554        int timeout;
 555        u32 status;
 556
 557        pr_info("Inquiring remote APIC 0x%x...\n", apicid);
 558
 559        for (i = 0; i < ARRAY_SIZE(regs); i++) {
 560                pr_info("... APIC 0x%x %s: ", apicid, names[i]);
 561
 562                /*
 563                 * Wait for idle.
 564                 */
 565                status = safe_apic_wait_icr_idle();
 566                if (status)
 567                        pr_cont("a previous APIC delivery may have failed\n");
 568
 569                apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
 570
 571                timeout = 0;
 572                do {
 573                        udelay(100);
 574                        status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
 575                } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
 576
 577                switch (status) {
 578                case APIC_ICR_RR_VALID:
 579                        status = apic_read(APIC_RRR);
 580                        pr_cont("%08x\n", status);
 581                        break;
 582                default:
 583                        pr_cont("failed\n");
 584                }
 585        }
 586}
 587
 588/*
 589 * The Multiprocessor Specification 1.4 (1997) example code suggests
 590 * that there should be a 10ms delay between the BSP asserting INIT
 591 * and de-asserting INIT, when starting a remote processor.
 592 * But that slows boot and resume on modern processors, which include
 593 * many cores and don't require that delay.
 594 *
  595 * Cmdline "cpu_init_udelay=" is available to override this delay.
 596 * Modern processor families are quirked to remove the delay entirely.
 597 */
 598#define UDELAY_10MS_DEFAULT 10000
 599
 600static unsigned int init_udelay = UINT_MAX;
 601
 602static int __init cpu_init_udelay(char *str)
 603{
 604        get_option(&str, &init_udelay);
 605
 606        return 0;
 607}
 608early_param("cpu_init_udelay", cpu_init_udelay);
 609
 610static void __init smp_quirk_init_udelay(void)
 611{
 612        /* if cmdline changed it from default, leave it alone */
 613        if (init_udelay != UINT_MAX)
 614                return;
 615
 616        /* if modern processor, use no delay */
 617        if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
 618            ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
 619                init_udelay = 0;
 620                return;
 621        }
 622        /* else, use legacy delay */
 623        init_udelay = UDELAY_10MS_DEFAULT;
 624}
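     /*
      * In short: the INIT deassert delay is 10ms (the MP spec 1.4 example
      * value) unless the processor is recent enough (Intel family 6, AMD
      * family 0xF or later) to skip it, and either choice can be forced from
      * the command line, e.g. "cpu_init_udelay=10000" to restore the legacy
      * delay or "cpu_init_udelay=0" to remove it.
      */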
 625
 626/*
 627 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
 628 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
  629 * won't, so remember to clear down the APIC, etc. later.
 630 */
 631int
 632wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
 633{
 634        unsigned long send_status, accept_status = 0;
 635        int maxlvt;
 636
 637        /* Target chip */
 638        /* Boot on the stack */
 639        /* Kick the second */
 640        apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
 641
 642        pr_debug("Waiting for send to finish...\n");
 643        send_status = safe_apic_wait_icr_idle();
 644
 645        /*
 646         * Give the other CPU some time to accept the IPI.
 647         */
 648        udelay(200);
 649        if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
 650                maxlvt = lapic_get_maxlvt();
 651                if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
 652                        apic_write(APIC_ESR, 0);
 653                accept_status = (apic_read(APIC_ESR) & 0xEF);
 654        }
 655        pr_debug("NMI sent\n");
 656
 657        if (send_status)
 658                pr_err("APIC never delivered???\n");
 659        if (accept_status)
 660                pr_err("APIC delivery error (%lx)\n", accept_status);
 661
 662        return (send_status | accept_status);
 663}
 664
 665static int
 666wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 667{
 668        unsigned long send_status, accept_status = 0;
 669        int maxlvt, num_starts, j;
 670
 671        maxlvt = lapic_get_maxlvt();
 672
 673        /*
 674         * Be paranoid about clearing APIC errors.
 675         */
 676        if (APIC_INTEGRATED(apic_version[phys_apicid])) {
 677                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
 678                        apic_write(APIC_ESR, 0);
 679                apic_read(APIC_ESR);
 680        }
 681
 682        pr_debug("Asserting INIT\n");
 683
 684        /*
 685         * Turn INIT on target chip
 686         */
 687        /*
 688         * Send IPI
 689         */
 690        apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
 691                       phys_apicid);
 692
 693        pr_debug("Waiting for send to finish...\n");
 694        send_status = safe_apic_wait_icr_idle();
 695
 696        udelay(init_udelay);
 697
 698        pr_debug("Deasserting INIT\n");
 699
 700        /* Target chip */
 701        /* Send IPI */
 702        apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
 703
 704        pr_debug("Waiting for send to finish...\n");
 705        send_status = safe_apic_wait_icr_idle();
 706
 707        mb();
 708        atomic_set(&init_deasserted, 1);
 709
 710        /*
 711         * Should we send STARTUP IPIs ?
 712         *
 713         * Determine this based on the APIC version.
 714         * If we don't have an integrated APIC, don't send the STARTUP IPIs.
 715         */
 716        if (APIC_INTEGRATED(apic_version[phys_apicid]))
 717                num_starts = 2;
 718        else
 719                num_starts = 0;
 720
 721        /*
 722         * Paravirt / VMI wants a startup IPI hook here to set up the
 723         * target processor state.
 724         */
 725        startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
 726                         initial_stack);
 727
 728        /*
 729         * Run STARTUP IPI loop.
 730         */
 731        pr_debug("#startup loops: %d\n", num_starts);
 732
 733        for (j = 1; j <= num_starts; j++) {
 734                pr_debug("Sending STARTUP #%d\n", j);
 735                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
 736                        apic_write(APIC_ESR, 0);
 737                apic_read(APIC_ESR);
 738                pr_debug("After apic_write\n");
 739
 740                /*
 741                 * STARTUP IPI
 742                 */
 743
 744                /* Target chip */
 745                /* Boot on the stack */
 746                /* Kick the second */
 747                apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
 748                               phys_apicid);
 749
 750                /*
 751                 * Give the other CPU some time to accept the IPI.
 752                 */
 753                if (init_udelay == 0)
 754                        udelay(10);
 755                else
 756                        udelay(300);
 757
 758                pr_debug("Startup point 1\n");
 759
 760                pr_debug("Waiting for send to finish...\n");
 761                send_status = safe_apic_wait_icr_idle();
 762
 763                /*
 764                 * Give the other CPU some time to accept the IPI.
 765                 */
 766                if (init_udelay == 0)
 767                        udelay(10);
 768                else
 769                        udelay(200);
 770                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
 771                        apic_write(APIC_ESR, 0);
 772                accept_status = (apic_read(APIC_ESR) & 0xEF);
 773                if (send_status || accept_status)
 774                        break;
 775        }
 776        pr_debug("After Startup\n");
 777
 778        if (send_status)
 779                pr_err("APIC never delivered???\n");
 780        if (accept_status)
 781                pr_err("APIC delivery error (%lx)\n", accept_status);
 782
 783        return (send_status | accept_status);
 784}
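     /*
      * For orientation, the sequence driven above is the classic
      * INIT-deassert-STARTUP protocol: assert INIT, deassert INIT, then up
      * to two STARTUP IPIs whose vector is the physical page number of the
      * real-mode trampoline (start_eip >> 12).  A trampoline at physical
      * 0x96000, for instance, would be sent as STARTUP vector 0x96, so the
      * trampoline must live below 1MB and be 4KB aligned.
      */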
 785
 786/* reduce the number of lines printed when booting a large cpu count system */
 787static void announce_cpu(int cpu, int apicid)
 788{
 789        static int current_node = -1;
 790        int node = early_cpu_to_node(cpu);
 791
 792        if (system_state == SYSTEM_BOOTING) {
 793                if (node != current_node) {
 794                        if (current_node > (-1))
 795                                pr_cont(" OK\n");
 796                        current_node = node;
 797                        pr_info("Booting Node %3d, Processors ", node);
 798                }
 799                pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");
 800                return;
 801        } else
 802                pr_info("Booting Node %d Processor %d APIC 0x%x\n",
 803                        node, cpu, apicid);
 804}
 805
 806static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
 807{
 808        int cpu;
 809
 810        cpu = smp_processor_id();
 811        if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
 812                return NMI_HANDLED;
 813
 814        return NMI_DONE;
 815}
 816
 817/*
 818 * Wake up AP by INIT, INIT, STARTUP sequence.
 819 *
 820 * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS
 821 * boot-strap code which is not a desired behavior for waking up BSP. To
  822 * avoid the boot-strap code, wake up CPU0 by NMI instead.
 823 *
 824 * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
 825 * (i.e. physically hot removed and then hot added), NMI won't wake it up.
 826 * We'll change this code in the future to wake up hard offlined CPU0 if
 827 * real platform and request are available.
 828 */
 829static int
 830wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
 831               int *cpu0_nmi_registered)
 832{
 833        int id;
 834        int boot_error;
 835
 836        preempt_disable();
 837
 838        /*
 839         * Wake up AP by INIT, INIT, STARTUP sequence.
 840         */
 841        if (cpu) {
 842                boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
 843                goto out;
 844        }
 845
 846        /*
 847         * Wake up BSP by nmi.
 848         *
 849         * Register a NMI handler to help wake up CPU0.
 850         */
 851        boot_error = register_nmi_handler(NMI_LOCAL,
 852                                          wakeup_cpu0_nmi, 0, "wake_cpu0");
 853
 854        if (!boot_error) {
 855                enable_start_cpu0 = 1;
 856                *cpu0_nmi_registered = 1;
 857                if (apic->dest_logical == APIC_DEST_LOGICAL)
 858                        id = cpu0_logical_apicid;
 859                else
 860                        id = apicid;
 861                boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
 862        }
 863
 864out:
 865        preempt_enable();
 866
 867        return boot_error;
 868}
 869
 870/*
 871 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
 872 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
 873 * Returns zero if CPU booted OK, else error code from
 874 * ->wakeup_secondary_cpu.
 875 */
 876static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 877{
 878        volatile u32 *trampoline_status =
 879                (volatile u32 *) __va(real_mode_header->trampoline_status);
 880        /* start_ip had better be page-aligned! */
 881        unsigned long start_ip = real_mode_header->trampoline_start;
 882
 883        unsigned long boot_error = 0;
 884        int cpu0_nmi_registered = 0;
 885        unsigned long timeout;
 886
 887        /* Just in case we booted with a single CPU. */
 888        alternatives_enable_smp();
 889
 890        idle->thread.sp = (unsigned long) (((struct pt_regs *)
 891                          (THREAD_SIZE +  task_stack_page(idle))) - 1);
 892        per_cpu(current_task, cpu) = idle;
 893
 894#ifdef CONFIG_X86_32
 895        /* Stack for startup_32 can be just as for start_secondary onwards */
 896        irq_ctx_init(cpu);
 897#else
 898        clear_tsk_thread_flag(idle, TIF_FORK);
 899        initial_gs = per_cpu_offset(cpu);
 900        per_cpu(kernel_stack, cpu) =
 901                (unsigned long)task_stack_page(idle) -
 902                KERNEL_STACK_OFFSET + THREAD_SIZE;
 903        per_cpu(__kernel_stack_70__, cpu) =
 904                (unsigned long)task_stack_page(idle) -
 905                KERNEL_STACK_OFFSET + THREAD_SIZE - 8192;
 906#endif
 907        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 908        initial_code = (unsigned long)start_secondary;
 909        initial_stack  = idle->thread.sp;
 910
 911        /* So we see what's up */
 912        announce_cpu(cpu, apicid);
 913
 914        /*
 915         * This grunge runs the startup process for
 916         * the targeted processor.
 917         */
 918
 919        atomic_set(&init_deasserted, 0);
 920
 921        if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
 922
 923                pr_debug("Setting warm reset code and vector.\n");
 924
 925                smpboot_setup_warm_reset_vector(start_ip);
 926                /*
 927                 * Be paranoid about clearing APIC errors.
  928                 */
 929                if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
 930                        apic_write(APIC_ESR, 0);
 931                        apic_read(APIC_ESR);
 932                }
 933        }
 934
 935        /*
 936         * AP might wait on cpu_callout_mask in cpu_init() with
 937         * cpu_initialized_mask set if previous attempt to online
  938         * it timed out. Clear cpu_initialized_mask so that after
  939         * INIT/SIPI it can start with a clean state.
 940         */
 941        cpumask_clear_cpu(cpu, cpu_initialized_mask);
 942        smp_mb();
 943
 944        /*
  945         * Wake up a CPU in different cases:
 946         * - Use the method in the APIC driver if it's defined
 947         * Otherwise,
 948         * - Use an INIT boot APIC message for APs or NMI for BSP.
 949         */
 950        if (apic->wakeup_secondary_cpu)
 951                boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
 952        else
 953                boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
 954                                                     &cpu0_nmi_registered);
 955
 956
 957        if (!boot_error) {
 958                /*
 959                 * Wait 10s total for a response from AP
 960                 */
 961                boot_error = -1;
 962                timeout = jiffies + 10*HZ;
 963                while (time_before(jiffies, timeout)) {
 964                        if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
 965                                /*
 966                                 * Tell AP to proceed with initialization
 967                                 */
 968                                cpumask_set_cpu(cpu, cpu_callout_mask);
 969                                boot_error = 0;
 970                                break;
 971                        }
 972                        udelay(100);
 973                        schedule();
 974                }
 975        }
 976
 977        if (!boot_error) {
 978                /*
 979                 * Wait till AP completes initial initialization
 980                 */
 981                while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
 982                        /*
 983                         * Allow other tasks to run while we wait for the
 984                         * AP to come online. This also gives a chance
  985                         * for the MTRR work (triggered by the AP coming online)
 986                         * to be completed in the stop machine context.
 987                         */
 988                        udelay(100);
 989                        schedule();
 990                }
 991        }
 992
 993        /* mark "stuck" area as not stuck */
 994        *trampoline_status = 0;
 995
 996        if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
 997                /*
 998                 * Cleanup possible dangling ends...
 999                 */
1000                smpboot_restore_warm_reset_vector();
1001        }
1002        /*
1003         * Clean up the nmi handler. Do this after the callin and callout sync
1004         * to avoid impact of possible long unregister time.
1005         */
1006        if (cpu0_nmi_registered)
1007                unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
1008
1009        return boot_error;
1010}
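     /*
      * Aside (an assumption about the smpboot hooks, which live outside this
      * file): smpboot_setup_warm_reset_vector() traditionally arms the legacy
      * warm-reset path -- CMOS shutdown code 0x0A plus the trampoline address
      * at 40:67 -- so a CPU coming out of INIT via a warm reset also lands in
      * the trampoline; smpboot_restore_warm_reset_vector() undoes that.
      */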
1011
1012int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
1013{
1014        int apicid = apic->cpu_present_to_apicid(cpu);
1015        unsigned long flags;
1016        int err;
1017
1018        WARN_ON(irqs_disabled());
1019
1020        pr_debug("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
1021
1022        if (apicid == BAD_APICID ||
1023            !physid_isset(apicid, phys_cpu_present_map) ||
1024            !apic->apic_id_valid(apicid)) {
1025                pr_err("%s: bad cpu %d\n", __func__, cpu);
1026                return -EINVAL;
1027        }
1028
1029        /*
1030         * Already booted CPU?
1031         */
1032        if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
1033                pr_debug("do_boot_cpu %d Already started\n", cpu);
1034                return -ENOSYS;
1035        }
1036
1037        /*
1038         * Save current MTRR state in case it was changed since early boot
1039         * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
1040         */
1041        mtrr_save_state();
1042
1043        per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
1044
1045        /* the FPU context is blank, nobody can own it */
1046        __cpu_disable_lazy_restore(cpu);
1047
1048        err = do_boot_cpu(apicid, cpu, tidle);
1049        if (err) {
1050                pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
1051                return -EIO;
1052        }
1053
1054        /*
1055         * Check TSC synchronization with the AP (keep irqs disabled
1056         * while doing so):
1057         */
1058        local_irq_save(flags);
1059        check_tsc_sync_source(cpu);
1060        local_irq_restore(flags);
1061
1062        while (!cpu_online(cpu)) {
1063                cpu_relax();
1064                touch_nmi_watchdog();
1065        }
1066
1067        return 0;
1068}
1069
1070/**
1071 * arch_disable_smp_support() - disables SMP support for x86 at runtime
1072 */
1073void arch_disable_smp_support(void)
1074{
1075        disable_ioapic_support();
1076}
1077
1078/*
1079 * Fall back to non SMP mode after errors.
1080 *
1081 * RED-PEN audit/test this more. I bet there is more state messed up here.
1082 */
1083static __init void disable_smp(void)
1084{
1085        init_cpu_present(cpumask_of(0));
1086        init_cpu_possible(cpumask_of(0));
1087        smpboot_clear_io_apic_irqs();
1088
1089        if (smp_found_config)
1090                physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1091        else
1092                physid_set_mask_of_physid(0, &phys_cpu_present_map);
1093        cpumask_set_cpu(0, cpu_sibling_mask(0));
1094        cpumask_set_cpu(0, cpu_core_mask(0));
1095}
1096
1097/*
1098 * Various sanity checks.
1099 */
1100static int __init smp_sanity_check(unsigned max_cpus)
1101{
1102        preempt_disable();
1103
1104#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
1105        if (def_to_bigsmp && nr_cpu_ids > 8) {
1106                unsigned int cpu;
1107                unsigned nr;
1108
1109                pr_warn("More than 8 CPUs detected - skipping them\n"
1110                        "Use CONFIG_X86_BIGSMP\n");
1111
1112                nr = 0;
1113                for_each_present_cpu(cpu) {
1114                        if (nr >= 8)
1115                                set_cpu_present(cpu, false);
1116                        nr++;
1117                }
1118
1119                nr = 0;
1120                for_each_possible_cpu(cpu) {
1121                        if (nr >= 8)
1122                                set_cpu_possible(cpu, false);
1123                        nr++;
1124                }
1125
1126                nr_cpu_ids = 8;
1127        }
1128#endif
1129
1130        if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1131                pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
1132                        hard_smp_processor_id());
1133
1134                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1135        }
1136
1137        /*
1138         * If we couldn't find an SMP configuration at boot time,
1139         * get out of here now!
1140         */
1141        if (!smp_found_config && !acpi_lapic) {
1142                preempt_enable();
1143                pr_notice("SMP motherboard not detected\n");
1144                disable_smp();
1145                if (APIC_init_uniprocessor())
1146                        pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
1147                return -1;
1148        }
1149
1150        /*
1151         * Should not be necessary because the MP table should list the boot
1152         * CPU too, but we do it for the sake of robustness anyway.
1153         */
1154        if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
1155                pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
1156                          boot_cpu_physical_apicid);
1157                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1158        }
1159        preempt_enable();
1160
1161        /*
1162         * If we couldn't find a local APIC, then get out of here now!
1163         */
1164        if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
1165            !cpu_has_apic) {
1166                if (!disable_apic) {
1167                        pr_err("BIOS bug, local APIC #%d not detected!...\n",
1168                                boot_cpu_physical_apicid);
1169                        pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
1170                }
1171                smpboot_clear_io_apic();
1172                disable_ioapic_support();
1173                return -1;
1174        }
1175
1176        verify_local_APIC();
1177
1178        /*
1179         * If SMP should be disabled, then really disable it!
1180         */
1181        if (!max_cpus) {
1182                pr_info("SMP mode deactivated\n");
1183                smpboot_clear_io_apic();
1184
1185                connect_bsp_APIC();
1186                setup_local_APIC();
1187                bsp_end_local_APIC_setup();
1188                return -1;
1189        }
1190
1191        return 0;
1192}
1193
1194static void __init smp_cpu_index_default(void)
1195{
1196        int i;
1197        struct cpuinfo_x86 *c;
1198
1199        for_each_possible_cpu(i) {
1200                c = &cpu_data(i);
1201                /* mark all to hotplug */
1202                c->cpu_index = nr_cpu_ids;
1203        }
1204}
1205
1206/*
1207 * Prepare for SMP bootup.  The MP table or ACPI has been read
1208 * earlier.  Just do some sanity checking here and enable APIC mode.
1209 */
1210void __init native_smp_prepare_cpus(unsigned int max_cpus)
1211{
1212        unsigned int i;
1213
1214        preempt_disable();
1215        smp_cpu_index_default();
1216
1217        /*
1218         * Setup boot CPU information
1219         */
1220        smp_store_boot_cpu_info(); /* Final full version of the data */
1221        cpumask_copy(cpu_callin_mask, cpumask_of(0));
1222        mb();
1223
1224        current_thread_info()->cpu = 0;  /* needed? */
1225        for_each_possible_cpu(i) {
1226                zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1227                zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1228                zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1229        }
1230
1231        /*
1232         * Set 'default' x86 topology, this matches default_topology() in that
1233         * it has NUMA nodes as a topology level. See also
1234         * native_smp_cpus_done().
1235         *
 1236         * Must be done before set_cpu_sibling_map() is run.
1237         */
1238        set_sched_topology(x86_topology);
1239
1240        set_cpu_sibling_map(0);
1241
1242
1243        if (smp_sanity_check(max_cpus) < 0) {
1244                pr_info("SMP disabled\n");
1245                disable_smp();
1246                goto out;
1247        }
1248
1249        default_setup_apic_routing();
1250
1251        preempt_disable();
1252        if (read_apic_id() != boot_cpu_physical_apicid) {
1253                panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1254                     read_apic_id(), boot_cpu_physical_apicid);
1255                /* Or can we switch back to PIC here? */
1256        }
1257        preempt_enable();
1258
1259        connect_bsp_APIC();
1260
1261        /*
1262         * Switch from PIC to APIC mode.
1263         */
1264        setup_local_APIC();
1265
1266        if (x2apic_mode)
1267                cpu0_logical_apicid = apic_read(APIC_LDR);
1268        else
1269                cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
1270
1271        /*
1272         * Enable IO APIC before setting up error vector
1273         */
1274        if (!skip_ioapic_setup && nr_ioapics)
1275                enable_IO_APIC();
1276
1277        bsp_end_local_APIC_setup();
1278
1279        if (apic->setup_portio_remap)
1280                apic->setup_portio_remap();
1281
1282        smpboot_setup_io_apic();
1283        /*
1284         * Set up local APIC timer on boot CPU.
1285         */
1286
1287        pr_info("CPU%d: ", 0);
1288        print_cpu_info(&cpu_data(0));
1289        x86_init.timers.setup_percpu_clockev();
1290
1291        uv_system_init();
1292
1293        set_mtrr_aps_delayed_init();
1294
1295        smp_quirk_init_udelay();
1296out:
1297        preempt_enable();
1298}
1299
1300void arch_enable_nonboot_cpus_begin(void)
1301{
1302        set_mtrr_aps_delayed_init();
1303}
1304
1305void arch_enable_nonboot_cpus_end(void)
1306{
1307        mtrr_aps_init();
1308}
1309
1310/*
1311 * Early setup to make printk work.
1312 */
1313void __init native_smp_prepare_boot_cpu(void)
1314{
1315        int me = smp_processor_id();
1316        switch_to_new_gdt(me);
1317        /* already set me in cpu_online_mask in boot_cpu_init() */
1318        cpumask_set_cpu(me, cpu_callout_mask);
1319        per_cpu(cpu_state, me) = CPU_ONLINE;
1320}
1321
1322void __init native_smp_cpus_done(unsigned int max_cpus)
1323{
1324        int ncpus;
1325
1326        pr_debug("Boot done\n");
1327        /*
 1328         * Today neither Intel nor AMD support heterogeneous systems, so
1329         * extrapolate the boot cpu's data to all packages.
1330         */
1331        ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
1332        __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
1333        pr_info("Max logical packages: %u\n", __max_logical_packages);
1334
1335        if (x86_has_numa_in_package)
1336                set_sched_topology(x86_numa_in_package_topology);
1337
1338        nmi_selftest();
1339        impress_friends();
1340#ifdef CONFIG_X86_IO_APIC
1341        setup_ioapic_dest();
1342#endif
1343        mtrr_aps_init();
1344}
1345
1346static int __initdata setup_possible_cpus = -1;
1347static int __init _setup_possible_cpus(char *str)
1348{
1349        get_option(&str, &setup_possible_cpus);
1350        return 0;
1351}
1352early_param("possible_cpus", _setup_possible_cpus);
1353
1354
1355/*
 1356 * cpu_possible_mask should be static: it cannot change as cpus are
 1357 * onlined or offlined. The reason is that per-cpu data structures
 1358 * are allocated by some modules at init time, and they don't expect to
 1359 * do this dynamically on cpu arrival/departure.
 1360 * cpu_present_mask on the other hand can change dynamically.
 1361 * If CPU hotplug is not compiled in, we fall back to the current
 1362 * behaviour, which is cpu_possible == cpu_present.
1363 * - Ashok Raj
1364 *
1365 * Three ways to find out the number of additional hotplug CPUs:
 1366 * - If the BIOS specified disabled CPUs in ACPI/mptables, use that.
 1367 * - The user can override it with possible_cpus=NUM
1368 * - Otherwise don't reserve additional CPUs.
1369 * We do this because additional CPUs waste a lot of memory.
1370 * -AK
1371 */
1372__init void prefill_possible_map(void)
1373{
1374        int i, possible;
1375
1376        /* No boot processor was found in mptable or ACPI MADT */
1377        if (!num_processors) {
1378                if (boot_cpu_has(X86_FEATURE_APIC)) {
1379                        int apicid = boot_cpu_physical_apicid;
1380                        int cpu = hard_smp_processor_id();
1381
1382                        pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
1383
1384                        /* Make sure boot cpu is enumerated */
1385                        if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
1386                            apic->apic_id_valid(apicid))
1387                                generic_processor_info(apicid,
1388                                        apic_version[boot_cpu_physical_apicid]);
1389                }
1390
1391                if (!num_processors)
1392                        num_processors = 1;
1393        }
1394
1395        i = setup_max_cpus ?: 1;
1396        if (setup_possible_cpus == -1) {
1397                possible = num_processors;
1398#ifdef CONFIG_HOTPLUG_CPU
1399                if (setup_max_cpus)
1400                        possible += disabled_cpus;
1401#else
1402                if (possible > i)
1403                        possible = i;
1404#endif
1405        } else
1406                possible = setup_possible_cpus;
1407
1408        total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1409
1410        /* nr_cpu_ids could be reduced via nr_cpus= */
1411        if (possible > nr_cpu_ids) {
1412                pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
1413                        possible, nr_cpu_ids);
1414                possible = nr_cpu_ids;
1415        }
1416
1417#ifdef CONFIG_HOTPLUG_CPU
1418        if (!setup_max_cpus)
1419#endif
1420        if (possible > i) {
1421                pr_warn("%d Processors exceeds max_cpus limit of %u\n",
1422                        possible, setup_max_cpus);
1423                possible = i;
1424        }
1425
1426        nr_cpu_ids = possible;
1427
1428        pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
1429                possible, max_t(int, possible - num_processors, 0));
1430
1431        reset_cpu_possible_mask();
1432
1433        for (i = 0; i < possible; i++)
1434                set_cpu_possible(i, true);
1435}
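     /*
      * Worked example of the sizing above (assuming NR_CPUS is large enough):
      * with 8 enumerated processors, 4 BIOS-disabled (hotpluggable) ones and
      * no possible_cpus=/nr_cpus= override on a CONFIG_HOTPLUG_CPU kernel,
      * possible becomes 8 + 4 = 12, total_cpus is 12, nr_cpu_ids ends up as
      * 12, and "Allowing 12 CPUs, 4 hotplug CPUs" is printed.
      */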
1436
1437#ifdef CONFIG_HOTPLUG_CPU
1438
 1439/* Recompute SMT state for all CPUs when one goes offline */
1440static void recompute_smt_state(void)
1441{
1442        int max_threads, cpu;
1443
1444        max_threads = 0;
1445        for_each_online_cpu (cpu) {
1446                int threads = cpumask_weight(topology_sibling_cpumask(cpu));
1447
1448                if (threads > max_threads)
1449                        max_threads = threads;
1450        }
1451        __max_smt_threads = max_threads;
1452}
1453
1454static void remove_siblinginfo(int cpu)
1455{
1456        int sibling;
1457        struct cpuinfo_x86 *c = &cpu_data(cpu);
1458
1459        for_each_cpu(sibling, cpu_core_mask(cpu)) {
1460                cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
 1461                /*
1462                 * last thread sibling in this cpu core going down
1463                 */
1464                if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
1465                        cpu_data(sibling).booted_cores--;
1466        }
1467
1468        for_each_cpu(sibling, cpu_sibling_mask(cpu))
1469                cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
1470        for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
1471                cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
1472        cpumask_clear(cpu_llc_shared_mask(cpu));
1473        cpumask_clear(cpu_sibling_mask(cpu));
1474        cpumask_clear(cpu_core_mask(cpu));
1475        c->phys_proc_id = 0;
1476        c->cpu_core_id = 0;
1477        cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
1478        recompute_smt_state();
1479}
1480
1481static void __ref remove_cpu_from_maps(int cpu)
1482{
1483        set_cpu_online(cpu, false);
1484        cpumask_clear_cpu(cpu, cpu_callout_mask);
1485        cpumask_clear_cpu(cpu, cpu_callin_mask);
1486        /* was set by cpu_init() */
1487        cpumask_clear_cpu(cpu, cpu_initialized_mask);
1488        numa_remove_cpu(cpu);
1489}
1490
1491void cpu_disable_common(void)
1492{
1493        int cpu = smp_processor_id();
1494
1495        remove_siblinginfo(cpu);
1496
1497        /* It's now safe to remove this processor from the online map */
1498        lock_vector_lock();
1499        remove_cpu_from_maps(cpu);
1500        unlock_vector_lock();
1501        fixup_irqs();
1502}
1503
1504int native_cpu_disable(void)
1505{
1506        int ret;
1507
1508        ret = check_irq_vectors_for_cpu_disable();
1509        if (ret)
1510                return ret;
1511
1512        clear_local_APIC();
1513
1514        cpu_disable_common();
1515        return 0;
1516}
1517
1518void native_cpu_die(unsigned int cpu)
1519{
1520        /* We don't do anything here: idle task is faking death itself. */
1521        unsigned int i;
1522
1523        for (i = 0; i < 10; i++) {
1524                /* They ack this in play_dead by setting CPU_DEAD */
1525                if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1526                        if (system_state == SYSTEM_RUNNING)
1527                                pr_info("CPU %u is now offline\n", cpu);
1528                        return;
1529                }
1530                msleep(100);
1531        }
1532        pr_err("CPU %u didn't die...\n", cpu);
1533}
1534
1535void play_dead_common(void)
1536{
1537        idle_task_exit();
1538        reset_lazy_tlbstate();
1539        amd_e400_remove_cpu(raw_smp_processor_id());
1540
1541        mb();
1542        /* Ack it */
1543        __this_cpu_write(cpu_state, CPU_DEAD);
1544
1545        /*
1546         * With physical CPU hotplug, we should halt the cpu
1547         */
1548        local_irq_disable();
1549}
1550
1551static bool wakeup_cpu0(void)
1552{
1553        if (smp_processor_id() == 0 && enable_start_cpu0)
1554                return true;
1555
1556        return false;
1557}
1558
1559/*
1560 * We need to flush the caches before going to sleep, lest we have
1561 * dirty data in our caches when we come back up.
1562 */
1563static inline void mwait_play_dead(void)
1564{
1565        unsigned int eax, ebx, ecx, edx;
1566        unsigned int highest_cstate = 0;
1567        unsigned int highest_subcstate = 0;
1568        void *mwait_ptr;
1569        int i;
1570
1571        if (!this_cpu_has(X86_FEATURE_MWAIT))
1572                return;
1573        if (!this_cpu_has(X86_FEATURE_CLFLUSH))
1574                return;
1575        if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1576                return;
1577
1578        eax = CPUID_MWAIT_LEAF;
1579        ecx = 0;
1580        native_cpuid(&eax, &ebx, &ecx, &edx);
1581
1582        /*
 1583         * eax is left at 0 if the EDX enumeration is not valid.
 1584         * Otherwise it is initialized below to the cstate, sub_cstate value.
1585         */
1586        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1587                eax = 0;
1588        } else {
1589                edx >>= MWAIT_SUBSTATE_SIZE;
1590                for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1591                        if (edx & MWAIT_SUBSTATE_MASK) {
1592                                highest_cstate = i;
1593                                highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1594                        }
1595                }
1596                eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1597                        (highest_subcstate - 1);
1598        }
1599
1600        /*
1601         * This should be a memory location in a cache line which is
1602         * unlikely to be touched by other processors.  The actual
1603         * content is immaterial as it is not actually modified in any way.
1604         */
1605        mwait_ptr = &current_thread_info()->flags;
1606
1607        wbinvd();
1608
1609        while (1) {
1610                /*
1611                 * The CLFLUSH is a workaround for erratum AAI65 for
1612                 * the Xeon 7400 series.  It's not clear it is actually
1613                 * needed, but it should be harmless in either case.
1614                 * The WBINVD is insufficient due to the spurious-wakeup
1615                 * case where we return around the loop.
1616                 */
1617                clflush(mwait_ptr);
1618                __monitor(mwait_ptr, 0, 0);
1619                mb();
1620                __mwait(eax, 0);
1621                /*
1622                 * If NMI wants to wake up CPU0, start CPU0.
1623                 */
1624                if (wakeup_cpu0())
1625                        start_cpu0();
1626        }
1627}
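     /*
      * The MWAIT hint computed above packs the deepest advertised C-state in
      * the upper nibble and its deepest sub-state (minus one) in the lower
      * nibble.  For example, highest_cstate = 3 with highest_subcstate = 2
      * yields eax = (3 << 4) | 1 = 0x31, which MWAIT interprets as the
      * corresponding deep C-state request.
      */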
1628
1629void hlt_play_dead(void)
1630{
1631        if (__this_cpu_read(cpu_info.x86) >= 4)
1632                wbinvd();
1633
1634        while (1) {
1635                native_halt();
1636                /*
1637                 * If NMI wants to wake up CPU0, start CPU0.
1638                 */
1639                if (wakeup_cpu0())
1640                        start_cpu0();
1641        }
1642}
1643
1644void native_play_dead(void)
1645{
1646        play_dead_common();
1647        tboot_shutdown(TB_SHUTDOWN_WFS);
1648
1649        spec_ctrl_ibrs_off();
1650
1651        mwait_play_dead();      /* Only returns on failure */
1652        if (cpuidle_play_dead())
1653                hlt_play_dead();
1654
1655        spec_ctrl_ibrs_on();
1656}
1657
1658#else /* ... !CONFIG_HOTPLUG_CPU */
1659int native_cpu_disable(void)
1660{
1661        return -ENOSYS;
1662}
1663
1664void native_cpu_die(unsigned int cpu)
1665{
1666        /* We said "no" in __cpu_disable */
1667        BUG();
1668}
1669
1670void native_play_dead(void)
1671{
1672        BUG();
1673}
1674
1675#endif
1676