linux/arch/x86/kernel/smpboot.c
   1 /*
   2 *      x86 SMP booting functions
   3 *
   4 *      (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
   5 *      (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com>
   6 *      Copyright 2001 Andi Kleen, SuSE Labs.
   7 *
   8 *      Much of the core SMP work is based on previous work by Thomas Radke, to
   9 *      whom a great many thanks are extended.
  10 *
  11 *      Thanks to Intel for making available several different Pentium,
  12 *      Pentium Pro and Pentium-II/Xeon MP machines.
  13 *      Original development of Linux SMP code supported by Caldera.
  14 *
  15 *      This code is released under the GNU General Public License version 2 or
  16 *      later.
  17 *
  18 *      Fixes
  19 *              Felix Koop      :       NR_CPUS used properly
  20 *              Jose Renau      :       Handle single CPU case.
  21 *              Alan Cox        :       By repeated request 8) - Total BogoMIPS report.
  22 *              Greg Wright     :       Fix for kernel stacks panic.
  23 *              Erich Boleyn    :       MP v1.4 and additional changes.
  24 *      Matthias Sattler        :       Changes for 2.1 kernel map.
  25 *      Michel Lespinasse       :       Changes for 2.1 kernel map.
  26 *      Michael Chastain        :       Change trampoline.S to gnu as.
  27 *              Alan Cox        :       Dumb bug: 'B' step PPro's are fine
  28 *              Ingo Molnar     :       Added APIC timers, based on code
  29 *                                      from Jose Renau
  30 *              Ingo Molnar     :       various cleanups and rewrites
  31 *              Tigran Aivazian :       fixed "0.00 in /proc/uptime on SMP" bug.
  32 *      Maciej W. Rozycki       :       Bits for genuine 82489DX APICs
  33 *      Andi Kleen              :       Changed for SMP boot into long mode.
  34 *              Martin J. Bligh :       Added support for multi-quad systems
  35 *              Dave Jones      :       Report invalid combinations of Athlon CPUs.
  36 *              Rusty Russell   :       Hacked into shape for new "hotplug" boot process.
  37 *      Andi Kleen              :       Converted to new state machine.
  38 *      Ashok Raj               :       CPU hotplug support
  39 *      Glauber Costa           :       i386 and x86_64 integration
  40 */
  41
  42#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  43
  44#include <linux/init.h>
  45#include <linux/smp.h>
  46#include <linux/export.h>
  47#include <linux/sched.h>
  48#include <linux/percpu.h>
  49#include <linux/bootmem.h>
  50#include <linux/err.h>
  51#include <linux/nmi.h>
  52#include <linux/tboot.h>
  53#include <linux/stackprotector.h>
  54#include <linux/gfp.h>
  55#include <linux/cpuidle.h>
  56
  57#include <asm/acpi.h>
  58#include <asm/desc.h>
  59#include <asm/nmi.h>
  60#include <asm/irq.h>
  61#include <asm/idle.h>
  62#include <asm/realmode.h>
  63#include <asm/cpu.h>
  64#include <asm/numa.h>
  65#include <asm/pgtable.h>
  66#include <asm/tlbflush.h>
  67#include <asm/mtrr.h>
  68#include <asm/mwait.h>
  69#include <asm/apic.h>
  70#include <asm/io_apic.h>
  71#include <asm/fpu/internal.h>
  72#include <asm/setup.h>
  73#include <asm/uv/uv.h>
  74#include <linux/mc146818rtc.h>
  75#include <asm/i8259.h>
  76#include <asm/realmode.h>
  77#include <asm/misc.h>
  78
  79/* Number of siblings per CPU package */
  80int smp_num_siblings = 1;
  81EXPORT_SYMBOL(smp_num_siblings);
  82
  83/* Last level cache ID of each logical CPU */
  84DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
  85
  86/* representing HT siblings of each logical CPU */
  87DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
  88EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
  89
  90/* representing HT and core siblings of each logical CPU */
  91DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
  92EXPORT_PER_CPU_SYMBOL(cpu_core_map);
  93
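/* representing CPUs sharing the last level cache with each logical CPU */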
  94DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
  95
  96/* Per CPU bogomips and other parameters */
  97DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
  98EXPORT_PER_CPU_SYMBOL(cpu_info);
  99
 100/* Logical package management. We might want to allocate that dynamically */
 101static int *physical_to_logical_pkg __read_mostly;
static unsigned long *physical_package_map __read_mostly;
 103static unsigned int max_physical_pkg_id __read_mostly;
 104unsigned int __max_logical_packages __read_mostly;
 105EXPORT_SYMBOL(__max_logical_packages);
 106static unsigned int logical_packages __read_mostly;
 107static bool logical_packages_frozen __read_mostly;
 108
 109/* Maximum number of SMT threads on any online core */
 110int __max_smt_threads __read_mostly;
 111
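/*
 * The BIOS "warm reset vector" is the real-mode far pointer stored at
 * 40:67h (physical 0x467/0x469) in the BIOS data area.  Writing shutdown
 * code 0x0a into CMOS register 0x0f asks the BIOS to jump through that
 * vector, rather than run a full POST, when a CPU comes out of reset.
 * The helpers below point the vector at the SMP trampoline before an AP
 * is kicked and restore the defaults afterwards.
 */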
 112static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 113{
 114        unsigned long flags;
 115
 116        spin_lock_irqsave(&rtc_lock, flags);
 117        CMOS_WRITE(0xa, 0xf);
 118        spin_unlock_irqrestore(&rtc_lock, flags);
 119        local_flush_tlb();
 120        pr_debug("1.\n");
 121        *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
 122                                                        start_eip >> 4;
 123        pr_debug("2.\n");
 124        *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
 125                                                        start_eip & 0xf;
 126        pr_debug("3.\n");
 127}
 128
 129static inline void smpboot_restore_warm_reset_vector(void)
 130{
 131        unsigned long flags;
 132
 133        /*
 134         * Install writable page 0 entry to set BIOS data area.
 135         */
 136        local_flush_tlb();
 137
 138        /*
 139         * Paranoid:  Set warm reset code and vector here back
 140         * to default values.
 141         */
 142        spin_lock_irqsave(&rtc_lock, flags);
 143        CMOS_WRITE(0, 0xf);
 144        spin_unlock_irqrestore(&rtc_lock, flags);
 145
 146        *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 147}
 148
 149/*
 150 * Report back to the Boot Processor during boot time or to the caller processor
 151 * during CPU online.
 152 */
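/*
 * This is the AP half of the boot handshake: do_boot_cpu() on the BSP
 * waits for this CPU to show up in cpu_initialized_mask, releases it by
 * setting its bit in cpu_callout_mask, and then spins until the bit set
 * at the end of this function appears in cpu_callin_mask.
 */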
 153static void smp_callin(void)
 154{
 155        int cpuid, phys_id;
 156
        /*
         * If woken up by an INIT in an 82489DX configuration,
         * cpu_callout_mask guarantees we don't get here before
         * an INIT_deassert IPI reaches our local APIC, so it is
         * now safe to touch our local APIC.
         */
 163        cpuid = smp_processor_id();
 164
 165        /*
 166         * (This works even if the APIC is not enabled.)
 167         */
 168        phys_id = read_apic_id();
 169
        /*
         * The boot CPU has finished the init stage and is spinning
         * on cpu_callin_mask until we finish. We are free to set up this
         * CPU, first the APIC. (this is probably redundant on most
         * boards)
         */
 176        apic_ap_setup();
 177
 178        /*
 179         * Save our processor parameters. Note: this information
 180         * is needed for clock calibration.
 181         */
 182        smp_store_cpu_info(cpuid);
 183
 184        /*
 185         * Get our bogomips.
 186         * Update loops_per_jiffy in cpu_data. Previous call to
 187         * smp_store_cpu_info() stored a value that is close but not as
 188         * accurate as the value just calculated.
 189         */
 190        calibrate_delay();
 191        cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
 192        pr_debug("Stack at about %p\n", &cpuid);
 193
 194        /*
 195         * This must be done before setting cpu_online_mask
 196         * or calling notify_cpu_starting.
 197         */
 198        set_cpu_sibling_map(raw_smp_processor_id());
 199        wmb();
 200
 201        notify_cpu_starting(cpuid);
 202
 203        /*
 204         * Allow the master to continue.
 205         */
 206        cpumask_set_cpu(cpuid, cpu_callin_mask);
 207}
 208
 209static int cpu0_logical_apicid;
 210static int enable_start_cpu0;
 211/*
 212 * Activate a secondary processor.
 213 */
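/*
 * This is the first C code an AP runs after the real-mode trampoline and
 * the early assembly entry.  The ordering below matters: cpu_init() and
 * smp_callin() complete the handshake with the BSP, the CPU is marked
 * online under vector_lock so that vector allocation never observes a
 * half-initialized CPU, and only then are interrupts enabled and the
 * idle loop entered.
 */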
 214static void notrace start_secondary(void *unused)
 215{
        /*
         * Don't put *anything* before cpu_init(); SMP booting is so
         * fragile that we want to limit the things done here to the
         * bare minimum.
         */
 221        cpu_init();
 222        x86_cpuinit.early_percpu_clock_init();
 223        preempt_disable();
 224        smp_callin();
 225
 226        enable_start_cpu0 = 0;
 227
 228#ifdef CONFIG_X86_32
 229        /* switch away from the initial page table */
 230        load_cr3(swapper_pg_dir);
 231        __flush_tlb_all();
 232#endif
 233
 234        /* otherwise gcc will move up smp_processor_id before the cpu_init */
 235        barrier();
 236        /*
 237         * Check TSC synchronization with the BP:
 238         */
 239        check_tsc_sync_target();
 240
 241        /*
 242         * Lock vector_lock and initialize the vectors on this cpu
 243         * before setting the cpu online. We must set it online with
 244         * vector_lock held to prevent a concurrent setup/teardown
 245         * from seeing a half valid vector space.
 246         */
 247        lock_vector_lock();
 248        setup_vector_irq(smp_processor_id());
 249        set_cpu_online(smp_processor_id(), true);
 250        unlock_vector_lock();
 251        cpu_set_state_online(smp_processor_id());
 252        x86_platform.nmi_init();
 253
 254        /* enable local interrupts */
 255        local_irq_enable();
 256
 257        /* to prevent fake stack check failure in clock setup */
 258        boot_init_stack_canary();
 259
 260        x86_cpuinit.setup_percpu_clockev();
 261
 262        wmb();
 263        cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 264}
 265
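/*
 * The physical package number is derived from the APIC ID (the bits above
 * x86_coreid_bits).  Logical package ids are handed out in first-come
 * order: the first package seen becomes logical package 0, the next one 1,
 * and so on; the result is cached in cpu_data(cpu).logical_proc_id.
 */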
 266int topology_update_package_map(unsigned int apicid, unsigned int cpu)
 267{
 268        unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits;
 269
 270        /* Called from early boot ? */
 271        if (!physical_package_map)
 272                return 0;
 273
 274        if (pkg >= max_physical_pkg_id)
 275                return -EINVAL;
 276
 277        /* Set the logical package id */
 278        if (test_and_set_bit(pkg, physical_package_map))
 279                goto found;
 280
 281        if (logical_packages_frozen) {
 282                physical_to_logical_pkg[pkg] = -1;
 283                pr_warn("APIC(%x) Package %u exceeds logical package max\n",
 284                        apicid, pkg);
 285                return -ENOSPC;
 286        }
 287
 288        new = logical_packages++;
 289        pr_info("APIC(%x) Converting physical %u to logical package %u\n",
 290                apicid, pkg, new);
 291        physical_to_logical_pkg[pkg] = new;
 292
 293found:
 294        cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg];
 295        return 0;
 296}
 297
/**
 * topology_phys_to_logical_pkg - Map a physical package id to a logical one
 * @phys_pkg: The physical package id to map
 *
 * Returns logical package id or -1 if not found
 */
 303int topology_phys_to_logical_pkg(unsigned int phys_pkg)
 304{
 305        if (phys_pkg >= max_physical_pkg_id)
 306                return -1;
 307        return physical_to_logical_pkg[phys_pkg];
 308}
 309EXPORT_SYMBOL(topology_phys_to_logical_pkg);
 310
 311static void __init smp_init_package_map(void)
 312{
 313        unsigned int ncpus, cpu;
 314        size_t size;
 315
 316        /*
         * Today neither Intel nor AMD support heterogeneous systems. That
 318         * might change in the future....
 319         *
 320         * While ideally we'd want '* smp_num_siblings' in the below @ncpus
 321         * computation, this won't actually work since some Intel BIOSes
 322         * report inconsistent HT data when they disable HT.
 323         *
 324         * In particular, they reduce the APIC-IDs to only include the cores,
 325         * but leave the CPUID topology to say there are (2) siblings.
 326         * This means we don't know how many threads there will be until
 327         * after the APIC enumeration.
 328         *
 329         * By not including this we'll sometimes over-estimate the number of
 330         * logical packages by the amount of !present siblings, but this is
 331         * still better than MAX_LOCAL_APIC.
 332         *
 333         * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited
 334         * on the command line leading to a similar issue as the HT disable
 335         * problem because the hyperthreads are usually enumerated after the
 336         * primary cores.
 337         */
 338        ncpus = boot_cpu_data.x86_max_cores;
 339        if (!ncpus) {
                pr_warn("x86_max_cores == zero !?!?\n");
 341                ncpus = 1;
 342        }
 343
 344        __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
 345        logical_packages = 0;
 346
 347        /*
 348         * Possibly larger than what we need as the number of apic ids per
 349         * package can be smaller than the actual used apic ids.
 350         */
 351        max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus);
 352        size = max_physical_pkg_id * sizeof(unsigned int);
 353        physical_to_logical_pkg = kmalloc(size, GFP_KERNEL);
 354        memset(physical_to_logical_pkg, 0xff, size);
 355        size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
 356        physical_package_map = kzalloc(size, GFP_KERNEL);
 357
 358        for_each_present_cpu(cpu) {
 359                unsigned int apicid = apic->cpu_present_to_apicid(cpu);
 360
 361                if (apicid == BAD_APICID || !apic->apic_id_valid(apicid))
 362                        continue;
 363                if (!topology_update_package_map(apicid, cpu))
 364                        continue;
 365                pr_warn("CPU %u APICId %x disabled\n", cpu, apicid);
 366                per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID;
 367                set_cpu_possible(cpu, false);
 368                set_cpu_present(cpu, false);
 369        }
 370
 371        if (logical_packages > __max_logical_packages) {
                pr_warn("Detected more packages (%u) than computed by BIOS data (%u).\n",
 373                        logical_packages, __max_logical_packages);
 374                logical_packages_frozen = true;
 375                __max_logical_packages  = logical_packages;
 376        }
 377
 378        pr_info("Max logical packages: %u\n", __max_logical_packages);
 379}
 380
 381void __init smp_store_boot_cpu_info(void)
 382{
 383        int id = 0; /* CPU 0 */
 384        struct cpuinfo_x86 *c = &cpu_data(id);
 385
 386        *c = boot_cpu_data;
 387        c->cpu_index = id;
 388        smp_init_package_map();
 389}
 390
 391/*
 392 * The bootstrap kernel entry code has set these up. Save them for
 393 * a given CPU
 394 */
 395void smp_store_cpu_info(int id)
 396{
 397        struct cpuinfo_x86 *c = &cpu_data(id);
 398
 399        *c = boot_cpu_data;
 400        c->cpu_index = id;
 401        /*
         * During boot time, CPU0 has this setup already. Save the info when
         * bringing up an AP or the previously offlined CPU0.
 404         */
 405        identify_secondary_cpu(c);
 406}
 407
 408static bool
 409topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 410{
 411        int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 412
 413        return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
 414}
 415
 416static bool
 417topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 418{
 419        int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 420
 421        return !WARN_ONCE(!topology_same_node(c, o),
 422                "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
 423                "[node: %d != %d]. Ignoring dependency.\n",
 424                cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
 425}
 426
 427#define link_mask(mfunc, c1, c2)                                        \
 428do {                                                                    \
 429        cpumask_set_cpu((c1), mfunc(c2));                               \
 430        cpumask_set_cpu((c2), mfunc(c1));                               \
 431} while (0)
 432
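/*
 * With the AMD topology extensions (TOPOEXT), two threads are SMT siblings
 * only if they are in the same package, share the same last level cache and
 * report the same core id; without TOPOEXT, matching package and core id is
 * sufficient.
 */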
 433static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 434{
 435        if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
 436                int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 437
 438                if (c->phys_proc_id == o->phys_proc_id &&
 439                    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
 440                    c->cpu_core_id == o->cpu_core_id)
 441                        return topology_sane(c, o, "smt");
 442
 443        } else if (c->phys_proc_id == o->phys_proc_id &&
 444                   c->cpu_core_id == o->cpu_core_id) {
 445                return topology_sane(c, o, "smt");
 446        }
 447
 448        return false;
 449}
 450
 451static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 452{
 453        int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 454
 455        if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
 456            per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
 457                return topology_sane(c, o, "llc");
 458
 459        return false;
 460}
 461
 462/*
 463 * Unlike the other levels, we do not enforce keeping a
 464 * multicore group inside a NUMA node.  If this happens, we will
 465 * discard the MC level of the topology later.
 466 */
 467static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 468{
 469        if (c->phys_proc_id == o->phys_proc_id)
 470                return true;
 471        return false;
 472}
 473
 474static struct sched_domain_topology_level numa_inside_package_topology[] = {
 475#ifdef CONFIG_SCHED_SMT
 476        { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
 477#endif
 478#ifdef CONFIG_SCHED_MC
 479        { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 480#endif
 481        { NULL, },
 482};
 483/*
 484 * set_sched_topology() sets the topology internal to a CPU.  The
 485 * NUMA topologies are layered on top of it to build the full
 486 * system topology.
 487 *
 488 * If NUMA nodes are observed to occur within a CPU package, this
 489 * function should be called.  It forces the sched domain code to
 490 * only use the SMT level for the CPU portion of the topology.
 491 * This essentially falls back to relying on NUMA information
 492 * from the SRAT table to describe the entire system topology
 493 * (except for hyperthreads).
 494 */
 495static void primarily_use_numa_for_topology(void)
 496{
 497        set_sched_topology(numa_inside_package_topology);
 498}
 499
 500void set_cpu_sibling_map(int cpu)
 501{
 502        bool has_smt = smp_num_siblings > 1;
 503        bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
 504        struct cpuinfo_x86 *c = &cpu_data(cpu);
 505        struct cpuinfo_x86 *o;
 506        int i, threads;
 507
 508        cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 509
 510        if (!has_mp) {
 511                cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 512                cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
 513                cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
 514                c->booted_cores = 1;
 515                return;
 516        }
 517
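        /*
         * First pass: link this CPU with every already set-up CPU that is
         * an SMT sibling of it and/or shares its last level cache.  Core
         * (package) links are made in a second pass below, once all the
         * sibling links exist.
         */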
 518        for_each_cpu(i, cpu_sibling_setup_mask) {
 519                o = &cpu_data(i);
 520
 521                if ((i == cpu) || (has_smt && match_smt(c, o)))
 522                        link_mask(topology_sibling_cpumask, cpu, i);
 523
 524                if ((i == cpu) || (has_mp && match_llc(c, o)))
 525                        link_mask(cpu_llc_shared_mask, cpu, i);
 526
 527        }
 528
 529        /*
 530         * This needs a separate iteration over the cpus because we rely on all
 531         * topology_sibling_cpumask links to be set-up.
 532         */
 533        for_each_cpu(i, cpu_sibling_setup_mask) {
 534                o = &cpu_data(i);
 535
 536                if ((i == cpu) || (has_mp && match_die(c, o))) {
 537                        link_mask(topology_core_cpumask, cpu, i);
 538
                        /*
                         * Does this new CPU bring up a new core?
                         */
 542                        if (cpumask_weight(
 543                            topology_sibling_cpumask(cpu)) == 1) {
 544                                /*
 545                                 * for each core in package, increment
 546                                 * the booted_cores for this new cpu
 547                                 */
 548                                if (cpumask_first(
 549                                    topology_sibling_cpumask(i)) == i)
 550                                        c->booted_cores++;
 551                                /*
 552                                 * increment the core count for all
 553                                 * the other cpus in this package
 554                                 */
 555                                if (i != cpu)
 556                                        cpu_data(i).booted_cores++;
 557                        } else if (i != cpu && !c->booted_cores)
 558                                c->booted_cores = cpu_data(i).booted_cores;
 559                }
 560                if (match_die(c, o) && !topology_same_node(c, o))
 561                        primarily_use_numa_for_topology();
 562        }
 563
 564        threads = cpumask_weight(topology_sibling_cpumask(cpu));
 565        if (threads > __max_smt_threads)
 566                __max_smt_threads = threads;
 567}
 568
 569/* maps the cpu to the sched domain representing multi-core */
 570const struct cpumask *cpu_coregroup_mask(int cpu)
 571{
 572        return cpu_llc_shared_mask(cpu);
 573}
 574
 575static void impress_friends(void)
 576{
 577        int cpu;
 578        unsigned long bogosum = 0;
 579        /*
 580         * Allow the user to impress friends.
 581         */
 582        pr_debug("Before bogomips\n");
 583        for_each_possible_cpu(cpu)
 584                if (cpumask_test_cpu(cpu, cpu_callout_mask))
 585                        bogosum += cpu_data(cpu).loops_per_jiffy;
 586        pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
 587                num_online_cpus(),
 588                bogosum/(500000/HZ),
 589                (bogosum/(5000/HZ))%100);
 590
 591        pr_debug("Before bogocount - setting activated=1\n");
 592}
 593
 594void __inquire_remote_apic(int apicid)
 595{
 596        unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
 597        const char * const names[] = { "ID", "VERSION", "SPIV" };
 598        int timeout;
 599        u32 status;
 600
 601        pr_info("Inquiring remote APIC 0x%x...\n", apicid);
 602
 603        for (i = 0; i < ARRAY_SIZE(regs); i++) {
 604                pr_info("... APIC 0x%x %s: ", apicid, names[i]);
 605
 606                /*
 607                 * Wait for idle.
 608                 */
 609                status = safe_apic_wait_icr_idle();
 610                if (status)
 611                        pr_cont("a previous APIC delivery may have failed\n");
 612
 613                apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
 614
 615                timeout = 0;
 616                do {
 617                        udelay(100);
 618                        status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
 619                } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
 620
 621                switch (status) {
 622                case APIC_ICR_RR_VALID:
 623                        status = apic_read(APIC_RRR);
 624                        pr_cont("%08x\n", status);
 625                        break;
 626                default:
 627                        pr_cont("failed\n");
 628                }
 629        }
 630}
 631
 632/*
 633 * The Multiprocessor Specification 1.4 (1997) example code suggests
 634 * that there should be a 10ms delay between the BSP asserting INIT
 635 * and de-asserting INIT, when starting a remote processor.
 636 * But that slows boot and resume on modern processors, which include
 637 * many cores and don't require that delay.
 638 *
 * Cmdline "cpu_init_udelay=" is available to override this delay.
 640 * Modern processor families are quirked to remove the delay entirely.
 641 */
 642#define UDELAY_10MS_DEFAULT 10000
 643
 644static unsigned int init_udelay = UINT_MAX;
 645
 646static int __init cpu_init_udelay(char *str)
 647{
 648        get_option(&str, &init_udelay);
 649
 650        return 0;
 651}
 652early_param("cpu_init_udelay", cpu_init_udelay);
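/*
 * For example, booting with "cpu_init_udelay=10000" restores the historical
 * 10 ms INIT deassert delay even on a quirked CPU, while "cpu_init_udelay=0"
 * removes the delay on parts that would otherwise use the legacy default.
 */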
 653
 654static void __init smp_quirk_init_udelay(void)
 655{
 656        /* if cmdline changed it from default, leave it alone */
 657        if (init_udelay != UINT_MAX)
 658                return;
 659
 660        /* if modern processor, use no delay */
 661        if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
 662            ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
 663                init_udelay = 0;
 664                return;
 665        }
 666        /* else, use legacy delay */
 667        init_udelay = UDELAY_10MS_DEFAULT;
 668}
 669
 670/*
 671 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
 672 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
 673 * won't ... remember to clear down the APIC, etc later.
 674 */
 675int
 676wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
 677{
 678        unsigned long send_status, accept_status = 0;
 679        int maxlvt;
 680
 681        /* Target chip */
 682        /* Boot on the stack */
 683        /* Kick the second */
 684        apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
 685
 686        pr_debug("Waiting for send to finish...\n");
 687        send_status = safe_apic_wait_icr_idle();
 688
 689        /*
 690         * Give the other CPU some time to accept the IPI.
 691         */
 692        udelay(200);
 693        if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
 694                maxlvt = lapic_get_maxlvt();
 695                if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
 696                        apic_write(APIC_ESR, 0);
 697                accept_status = (apic_read(APIC_ESR) & 0xEF);
 698        }
 699        pr_debug("NMI sent\n");
 700
 701        if (send_status)
 702                pr_err("APIC never delivered???\n");
 703        if (accept_status)
 704                pr_err("APIC delivery error (%lx)\n", accept_status);
 705
 706        return (send_status | accept_status);
 707}
 708
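/*
 * Standard MP-spec wakeup: assert a level-triggered INIT IPI, wait,
 * de-assert it, and then (only for integrated APICs) send up to two
 * STARTUP IPIs whose vector field encodes the page number of the
 * real-mode trampoline.
 */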
 709static int
 710wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 711{
 712        unsigned long send_status = 0, accept_status = 0;
 713        int maxlvt, num_starts, j;
 714
 715        maxlvt = lapic_get_maxlvt();
 716
 717        /*
 718         * Be paranoid about clearing APIC errors.
 719         */
 720        if (APIC_INTEGRATED(apic_version[phys_apicid])) {
 721                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
 722                        apic_write(APIC_ESR, 0);
 723                apic_read(APIC_ESR);
 724        }
 725
 726        pr_debug("Asserting INIT\n");
 727
 728        /*
 729         * Turn INIT on target chip
 730         */
 731        /*
 732         * Send IPI
 733         */
 734        apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
 735                       phys_apicid);
 736
 737        pr_debug("Waiting for send to finish...\n");
 738        send_status = safe_apic_wait_icr_idle();
 739
 740        udelay(init_udelay);
 741
 742        pr_debug("Deasserting INIT\n");
 743
 744        /* Target chip */
 745        /* Send IPI */
 746        apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
 747
 748        pr_debug("Waiting for send to finish...\n");
 749        send_status = safe_apic_wait_icr_idle();
 750
 751        mb();
 752
 753        /*
 754         * Should we send STARTUP IPIs ?
 755         *
 756         * Determine this based on the APIC version.
 757         * If we don't have an integrated APIC, don't send the STARTUP IPIs.
 758         */
 759        if (APIC_INTEGRATED(apic_version[phys_apicid]))
 760                num_starts = 2;
 761        else
 762                num_starts = 0;
 763
 764        /*
 765         * Run STARTUP IPI loop.
 766         */
 767        pr_debug("#startup loops: %d\n", num_starts);
 768
 769        for (j = 1; j <= num_starts; j++) {
 770                pr_debug("Sending STARTUP #%d\n", j);
 771                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
 772                        apic_write(APIC_ESR, 0);
 773                apic_read(APIC_ESR);
 774                pr_debug("After apic_write\n");
 775
 776                /*
 777                 * STARTUP IPI
 778                 */
 779
 780                /* Target chip */
 781                /* Boot on the stack */
 782                /* Kick the second */
 783                apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
 784                               phys_apicid);
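                /*
                 * The SIPI vector field is the 4 KB page number of start_eip,
                 * which is why the trampoline must be page-aligned and below
                 * 1 MB.
                 */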
 785
 786                /*
 787                 * Give the other CPU some time to accept the IPI.
 788                 */
 789                if (init_udelay == 0)
 790                        udelay(10);
 791                else
 792                        udelay(300);
 793
 794                pr_debug("Startup point 1\n");
 795
 796                pr_debug("Waiting for send to finish...\n");
 797                send_status = safe_apic_wait_icr_idle();
 798
 799                /*
 800                 * Give the other CPU some time to accept the IPI.
 801                 */
 802                if (init_udelay == 0)
 803                        udelay(10);
 804                else
 805                        udelay(200);
 806
 807                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
 808                        apic_write(APIC_ESR, 0);
 809                accept_status = (apic_read(APIC_ESR) & 0xEF);
 810                if (send_status || accept_status)
 811                        break;
 812        }
 813        pr_debug("After Startup\n");
 814
 815        if (send_status)
 816                pr_err("APIC never delivered???\n");
 817        if (accept_status)
 818                pr_err("APIC delivery error (%lx)\n", accept_status);
 819
 820        return (send_status | accept_status);
 821}
 822
 823void smp_announce(void)
 824{
 825        int num_nodes = num_online_nodes();
 826
 827        printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
 828               num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
 829}
 830
 831/* reduce the number of lines printed when booting a large cpu count system */
 832static void announce_cpu(int cpu, int apicid)
 833{
 834        static int current_node = -1;
 835        int node = early_cpu_to_node(cpu);
 836        static int width, node_width;
 837
 838        if (!width)
 839                width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
 840
 841        if (!node_width)
 842                node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
 843
 844        if (cpu == 1)
 845                printk(KERN_INFO "x86: Booting SMP configuration:\n");
 846
 847        if (system_state == SYSTEM_BOOTING) {
 848                if (node != current_node) {
 849                        if (current_node > (-1))
 850                                pr_cont("\n");
 851                        current_node = node;
 852
 853                        printk(KERN_INFO ".... node %*s#%d, CPUs:  ",
 854                               node_width - num_digits(node), " ", node);
 855                }
 856
 857                /* Add padding for the BSP */
 858                if (cpu == 1)
 859                        pr_cont("%*s", width + 1, " ");
 860
 861                pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
 862
 863        } else
 864                pr_info("Booting Node %d Processor %d APIC 0x%x\n",
 865                        node, cpu, apicid);
 866}
 867
 868static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
 869{
 870        int cpu;
 871
 872        cpu = smp_processor_id();
 873        if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
 874                return NMI_HANDLED;
 875
 876        return NMI_DONE;
 877}
 878
 879/*
 880 * Wake up AP by INIT, INIT, STARTUP sequence.
 881 *
 * Instead of waiting for STARTUP after INITs, the BSP would execute the BIOS
 * boot-strap code, which is not the desired behavior for waking up the BSP.
 * To avoid the boot-strap code, wake up CPU0 by NMI instead.
 885 *
 886 * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
 887 * (i.e. physically hot removed and then hot added), NMI won't wake it up.
 888 * We'll change this code in the future to wake up hard offlined CPU0 if
 889 * real platform and request are available.
 890 */
 891static int
 892wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
 893               int *cpu0_nmi_registered)
 894{
 895        int id;
 896        int boot_error;
 897
 898        preempt_disable();
 899
 900        /*
 901         * Wake up AP by INIT, INIT, STARTUP sequence.
 902         */
 903        if (cpu) {
 904                boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
 905                goto out;
 906        }
 907
 908        /*
 909         * Wake up BSP by nmi.
 910         *
 911         * Register a NMI handler to help wake up CPU0.
 912         */
 913        boot_error = register_nmi_handler(NMI_LOCAL,
 914                                          wakeup_cpu0_nmi, 0, "wake_cpu0");
 915
 916        if (!boot_error) {
 917                enable_start_cpu0 = 1;
 918                *cpu0_nmi_registered = 1;
 919                if (apic->dest_logical == APIC_DEST_LOGICAL)
 920                        id = cpu0_logical_apicid;
 921                else
 922                        id = apicid;
 923                boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
 924        }
 925
 926out:
 927        preempt_enable();
 928
 929        return boot_error;
 930}
 931
 932void common_cpu_up(unsigned int cpu, struct task_struct *idle)
 933{
 934        /* Just in case we booted with a single CPU. */
 935        alternatives_enable_smp();
 936
 937        per_cpu(current_task, cpu) = idle;
 938
 939#ifdef CONFIG_X86_32
 940        /* Stack for startup_32 can be just as for start_secondary onwards */
 941        irq_ctx_init(cpu);
 942        per_cpu(cpu_current_top_of_stack, cpu) =
 943                (unsigned long)task_stack_page(idle) + THREAD_SIZE;
 944#else
 945        clear_tsk_thread_flag(idle, TIF_FORK);
 946        initial_gs = per_cpu_offset(cpu);
 947#endif
 948}
 949
 950/*
 951 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
 952 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
 953 * Returns zero if CPU booted OK, else error code from
 954 * ->wakeup_secondary_cpu.
 955 */
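/*
 * Bring one AP up: publish its stack, GDT and C entry point for the
 * trampoline, kick it with INIT/SIPI (or NMI for CPU0), wait up to ten
 * seconds for it to appear in cpu_initialized_mask, release it via
 * cpu_callout_mask and finally wait for it to check in on cpu_callin_mask.
 */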
 956static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 957{
 958        volatile u32 *trampoline_status =
 959                (volatile u32 *) __va(real_mode_header->trampoline_status);
 960        /* start_ip had better be page-aligned! */
 961        unsigned long start_ip = real_mode_header->trampoline_start;
 962
 963        unsigned long boot_error = 0;
 964        int cpu0_nmi_registered = 0;
 965        unsigned long timeout;
 966
 967        idle->thread.sp = (unsigned long) (((struct pt_regs *)
 968                          (THREAD_SIZE +  task_stack_page(idle))) - 1);
 969
 970        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 971        initial_code = (unsigned long)start_secondary;
 972        stack_start  = idle->thread.sp;
 973
        /*
         * Enable the espfix hack for this CPU
         */
 977#ifdef CONFIG_X86_ESPFIX64
 978        init_espfix_ap(cpu);
 979#endif
 980
 981        /* So we see what's up */
 982        announce_cpu(cpu, apicid);
 983
 984        /*
 985         * This grunge runs the startup process for
 986         * the targeted processor.
 987         */
 988
 989        if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
 990
 991                pr_debug("Setting warm reset code and vector.\n");
 992
 993                smpboot_setup_warm_reset_vector(start_ip);
                /*
                 * Be paranoid about clearing APIC errors.
                 */
 997                if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
 998                        apic_write(APIC_ESR, 0);
 999                        apic_read(APIC_ESR);
1000                }
1001        }
1002
1003        /*
1004         * AP might wait on cpu_callout_mask in cpu_init() with
1005         * cpu_initialized_mask set if previous attempt to online
1006         * it timed-out. Clear cpu_initialized_mask so that after
1007         * INIT/SIPI it could start with a clean state.
1008         */
1009        cpumask_clear_cpu(cpu, cpu_initialized_mask);
1010        smp_mb();
1011
        /*
         * Wake up a CPU in two different ways:
         * - Use the method in the APIC driver if it's defined;
         * otherwise,
         * - Use an INIT boot APIC message for APs, or NMI for the BSP.
         */
1018        if (apic->wakeup_secondary_cpu)
1019                boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
1020        else
1021                boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid,
1022                                                     &cpu0_nmi_registered);
1023
1024        if (!boot_error) {
1025                /*
1026                 * Wait 10s total for first sign of life from AP
1027                 */
1028                boot_error = -1;
1029                timeout = jiffies + 10*HZ;
1030                while (time_before(jiffies, timeout)) {
1031                        if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
1032                                /*
1033                                 * Tell AP to proceed with initialization
1034                                 */
1035                                cpumask_set_cpu(cpu, cpu_callout_mask);
1036                                boot_error = 0;
1037                                break;
1038                        }
1039                        schedule();
1040                }
1041        }
1042
1043        if (!boot_error) {
1044                /*
1045                 * Wait till AP completes initial initialization
1046                 */
1047                while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
1048                        /*
1049                         * Allow other tasks to run while we wait for the
1050                         * AP to come online. This also gives a chance
                         * for the MTRR work (triggered by the AP coming online)
1052                         * to be completed in the stop machine context.
1053                         */
1054                        schedule();
1055                }
1056        }
1057
1058        /* mark "stuck" area as not stuck */
1059        *trampoline_status = 0;
1060
1061        if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
1062                /*
1063                 * Cleanup possible dangling ends...
1064                 */
1065                smpboot_restore_warm_reset_vector();
1066        }
1067        /*
1068         * Clean up the nmi handler. Do this after the callin and callout sync
1069         * to avoid impact of possible long unregister time.
1070         */
1071        if (cpu0_nmi_registered)
1072                unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
1073
1074        return boot_error;
1075}
1076
1077int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
1078{
1079        int apicid = apic->cpu_present_to_apicid(cpu);
1080        unsigned long flags;
1081        int err;
1082
1083        WARN_ON(irqs_disabled());
1084
1085        pr_debug("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
1086
1087        if (apicid == BAD_APICID ||
1088            !physid_isset(apicid, phys_cpu_present_map) ||
1089            !apic->apic_id_valid(apicid)) {
1090                pr_err("%s: bad cpu %d\n", __func__, cpu);
1091                return -EINVAL;
1092        }
1093
1094        /*
1095         * Already booted CPU?
1096         */
1097        if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
1098                pr_debug("do_boot_cpu %d Already started\n", cpu);
1099                return -ENOSYS;
1100        }
1101
1102        /*
1103         * Save current MTRR state in case it was changed since early boot
1104         * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
1105         */
1106        mtrr_save_state();
1107
1108        /* x86 CPUs take themselves offline, so delayed offline is OK. */
1109        err = cpu_check_up_prepare(cpu);
1110        if (err && err != -EBUSY)
1111                return err;
1112
1113        /* the FPU context is blank, nobody can own it */
1114        __cpu_disable_lazy_restore(cpu);
1115
1116        common_cpu_up(cpu, tidle);
1117
1118        /*
1119         * We have to walk the irq descriptors to setup the vector
1120         * space for the cpu which comes online.  Prevent irq
1121         * alloc/free across the bringup.
1122         */
1123        irq_lock_sparse();
1124
1125        err = do_boot_cpu(apicid, cpu, tidle);
1126
1127        if (err) {
1128                irq_unlock_sparse();
1129                pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
1130                return -EIO;
1131        }
1132
1133        /*
1134         * Check TSC synchronization with the AP (keep irqs disabled
1135         * while doing so):
1136         */
1137        local_irq_save(flags);
1138        check_tsc_sync_source(cpu);
1139        local_irq_restore(flags);
1140
1141        while (!cpu_online(cpu)) {
1142                cpu_relax();
1143                touch_nmi_watchdog();
1144        }
1145
1146        irq_unlock_sparse();
1147
1148        return 0;
1149}
1150
1151/**
1152 * arch_disable_smp_support() - disables SMP support for x86 at runtime
1153 */
1154void arch_disable_smp_support(void)
1155{
1156        disable_ioapic_support();
1157}
1158
1159/*
1160 * Fall back to non SMP mode after errors.
1161 *
1162 * RED-PEN audit/test this more. I bet there is more state messed up here.
1163 */
1164static __init void disable_smp(void)
1165{
1166        pr_info("SMP disabled\n");
1167
1168        disable_ioapic_support();
1169
1170        init_cpu_present(cpumask_of(0));
1171        init_cpu_possible(cpumask_of(0));
1172
1173        if (smp_found_config)
1174                physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1175        else
1176                physid_set_mask_of_physid(0, &phys_cpu_present_map);
1177        cpumask_set_cpu(0, topology_sibling_cpumask(0));
1178        cpumask_set_cpu(0, topology_core_cpumask(0));
1179}
1180
1181enum {
1182        SMP_OK,
1183        SMP_NO_CONFIG,
1184        SMP_NO_APIC,
1185        SMP_FORCE_UP,
1186};
1187
1188/*
1189 * Various sanity checks.
1190 */
1191static int __init smp_sanity_check(unsigned max_cpus)
1192{
1193        preempt_disable();
1194
1195#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32)
1196        if (def_to_bigsmp && nr_cpu_ids > 8) {
1197                unsigned int cpu;
1198                unsigned nr;
1199
1200                pr_warn("More than 8 CPUs detected - skipping them\n"
1201                        "Use CONFIG_X86_BIGSMP\n");
1202
1203                nr = 0;
1204                for_each_present_cpu(cpu) {
1205                        if (nr >= 8)
1206                                set_cpu_present(cpu, false);
1207                        nr++;
1208                }
1209
1210                nr = 0;
1211                for_each_possible_cpu(cpu) {
1212                        if (nr >= 8)
1213                                set_cpu_possible(cpu, false);
1214                        nr++;
1215                }
1216
1217                nr_cpu_ids = 8;
1218        }
1219#endif
1220
1221        if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1222                pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
1223                        hard_smp_processor_id());
1224
1225                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1226        }
1227
1228        /*
1229         * If we couldn't find an SMP configuration at boot time,
1230         * get out of here now!
1231         */
1232        if (!smp_found_config && !acpi_lapic) {
1233                preempt_enable();
1234                pr_notice("SMP motherboard not detected\n");
1235                return SMP_NO_CONFIG;
1236        }
1237
1238        /*
1239         * Should not be necessary because the MP table should list the boot
1240         * CPU too, but we do it for the sake of robustness anyway.
1241         */
1242        if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
1243                pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
1244                          boot_cpu_physical_apicid);
1245                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1246        }
1247        preempt_enable();
1248
1249        /*
1250         * If we couldn't find a local APIC, then get out of here now!
1251         */
1252        if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
1253            !boot_cpu_has(X86_FEATURE_APIC)) {
1254                if (!disable_apic) {
1255                        pr_err("BIOS bug, local APIC #%d not detected!...\n",
1256                                boot_cpu_physical_apicid);
1257                        pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
1258                }
1259                return SMP_NO_APIC;
1260        }
1261
1262        /*
1263         * If SMP should be disabled, then really disable it!
1264         */
1265        if (!max_cpus) {
1266                pr_info("SMP mode deactivated\n");
1267                return SMP_FORCE_UP;
1268        }
1269
1270        return SMP_OK;
1271}
1272
1273static void __init smp_cpu_index_default(void)
1274{
1275        int i;
1276        struct cpuinfo_x86 *c;
1277
1278        for_each_possible_cpu(i) {
1279                c = &cpu_data(i);
1280                /* mark all to hotplug */
1281                c->cpu_index = nr_cpu_ids;
1282        }
1283}
1284
1285/*
1286 * Prepare for SMP bootup.  The MP table or ACPI has been read
1287 * earlier.  Just do some sanity checking here and enable APIC mode.
1288 */
1289void __init native_smp_prepare_cpus(unsigned int max_cpus)
1290{
1291        unsigned int i;
1292
1293        smp_cpu_index_default();
1294
1295        /*
1296         * Setup boot CPU information
1297         */
1298        smp_store_boot_cpu_info(); /* Final full version of the data */
1299        cpumask_copy(cpu_callin_mask, cpumask_of(0));
1300        mb();
1301
1302        for_each_possible_cpu(i) {
1303                zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1304                zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1305                zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1306        }
1307        set_cpu_sibling_map(0);
1308
1309        switch (smp_sanity_check(max_cpus)) {
1310        case SMP_NO_CONFIG:
1311                disable_smp();
1312                if (APIC_init_uniprocessor())
1313                        pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
1314                return;
1315        case SMP_NO_APIC:
1316                disable_smp();
1317                return;
1318        case SMP_FORCE_UP:
1319                disable_smp();
1320                apic_bsp_setup(false);
1321                return;
1322        case SMP_OK:
1323                break;
1324        }
1325
1326        default_setup_apic_routing();
1327
1328        if (read_apic_id() != boot_cpu_physical_apicid) {
1329                panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1330                     read_apic_id(), boot_cpu_physical_apicid);
1331                /* Or can we switch back to PIC here? */
1332        }
1333
1334        cpu0_logical_apicid = apic_bsp_setup(false);
1335
1336        pr_info("CPU%d: ", 0);
1337        print_cpu_info(&cpu_data(0));
1338
1339        if (is_uv_system())
1340                uv_system_init();
1341
1342        set_mtrr_aps_delayed_init();
1343
1344        smp_quirk_init_udelay();
1345}
1346
1347void arch_enable_nonboot_cpus_begin(void)
1348{
1349        set_mtrr_aps_delayed_init();
1350}
1351
1352void arch_enable_nonboot_cpus_end(void)
1353{
1354        mtrr_aps_init();
1355}
1356
1357/*
1358 * Early setup to make printk work.
1359 */
1360void __init native_smp_prepare_boot_cpu(void)
1361{
1362        int me = smp_processor_id();
1363        switch_to_new_gdt(me);
1364        /* already set me in cpu_online_mask in boot_cpu_init() */
1365        cpumask_set_cpu(me, cpu_callout_mask);
1366        cpu_set_state_online(me);
1367}
1368
1369void __init native_smp_cpus_done(unsigned int max_cpus)
1370{
1371        pr_debug("Boot done\n");
1372
1373        nmi_selftest();
1374        impress_friends();
1375        setup_ioapic_dest();
1376        mtrr_aps_init();
1377}
1378
1379static int __initdata setup_possible_cpus = -1;
1380static int __init _setup_possible_cpus(char *str)
1381{
1382        get_option(&str, &setup_possible_cpus);
1383        return 0;
1384}
1385early_param("possible_cpus", _setup_possible_cpus);
1386
1387
/*
 * cpu_possible_mask should be static: it cannot change as CPUs
 * are onlined or offlined. The reason is that per-cpu data structures
 * are allocated by some modules at init time, and they don't expect to
 * do this dynamically on CPU arrival/departure.
 * cpu_present_mask on the other hand can change dynamically.
 * If CPU hotplug is not compiled in, we fall back to the current
 * behaviour, which is cpu_possible == cpu_present.
 * - Ashok Raj
 *
 * Three ways to find out the number of additional hotplug CPUs:
 * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
 * - The user can overwrite it with possible_cpus=NUM
 * - Otherwise don't reserve additional CPUs.
 * We do this because additional CPUs waste a lot of memory.
 * -AK
 */
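/*
 * In the common (hotplug, no "possible_cpus=") case this ends up as
 * num_processors + disabled_cpus, clamped by nr_cpus=/NR_CPUS and by
 * maxcpus=; every CPU id below that count is marked possible, the rest
 * are not, and nr_cpu_ids is shrunk to match.
 */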
1405__init void prefill_possible_map(void)
1406{
1407        int i, possible;
1408
1409        /* no processor from mptable or madt */
1410        if (!num_processors)
1411                num_processors = 1;
1412
1413        i = setup_max_cpus ?: 1;
1414        if (setup_possible_cpus == -1) {
1415                possible = num_processors;
1416#ifdef CONFIG_HOTPLUG_CPU
1417                if (setup_max_cpus)
1418                        possible += disabled_cpus;
1419#else
1420                if (possible > i)
1421                        possible = i;
1422#endif
1423        } else
1424                possible = setup_possible_cpus;
1425
1426        total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1427
1428        /* nr_cpu_ids could be reduced via nr_cpus= */
1429        if (possible > nr_cpu_ids) {
                pr_warn("%d processors exceed NR_CPUS limit of %d\n",
1431                        possible, nr_cpu_ids);
1432                possible = nr_cpu_ids;
1433        }
1434
1435#ifdef CONFIG_HOTPLUG_CPU
1436        if (!setup_max_cpus)
1437#endif
1438        if (possible > i) {
                pr_warn("%d processors exceed max_cpus limit of %u\n",
1440                        possible, setup_max_cpus);
1441                possible = i;
1442        }
1443
1444        pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
1445                possible, max_t(int, possible - num_processors, 0));
1446
1447        for (i = 0; i < possible; i++)
1448                set_cpu_possible(i, true);
1449        for (; i < NR_CPUS; i++)
1450                set_cpu_possible(i, false);
1451
1452        nr_cpu_ids = possible;
1453}
1454
1455#ifdef CONFIG_HOTPLUG_CPU
1456
/* Recompute SMT state for all CPUs when one goes offline */
1458static void recompute_smt_state(void)
1459{
1460        int max_threads, cpu;
1461
1462        max_threads = 0;
1463        for_each_online_cpu (cpu) {
1464                int threads = cpumask_weight(topology_sibling_cpumask(cpu));
1465
1466                if (threads > max_threads)
1467                        max_threads = threads;
1468        }
1469        __max_smt_threads = max_threads;
1470}
1471
1472static void remove_siblinginfo(int cpu)
1473{
1474        int sibling;
1475        struct cpuinfo_x86 *c = &cpu_data(cpu);
1476
1477        for_each_cpu(sibling, topology_core_cpumask(cpu)) {
1478                cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
                /*
                 * Last thread sibling in this CPU core going down.
                 */
1482                if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
1483                        cpu_data(sibling).booted_cores--;
1484        }
1485
1486        for_each_cpu(sibling, topology_sibling_cpumask(cpu))
1487                cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
1488        for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
1489                cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
1490        cpumask_clear(cpu_llc_shared_mask(cpu));
1491        cpumask_clear(topology_sibling_cpumask(cpu));
1492        cpumask_clear(topology_core_cpumask(cpu));
1493        c->phys_proc_id = 0;
1494        c->cpu_core_id = 0;
1495        cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
1496        recompute_smt_state();
1497}
1498
1499static void remove_cpu_from_maps(int cpu)
1500{
1501        set_cpu_online(cpu, false);
1502        cpumask_clear_cpu(cpu, cpu_callout_mask);
1503        cpumask_clear_cpu(cpu, cpu_callin_mask);
1504        /* was set by cpu_init() */
1505        cpumask_clear_cpu(cpu, cpu_initialized_mask);
1506        numa_remove_cpu(cpu);
1507}
1508
1509void cpu_disable_common(void)
1510{
1511        int cpu = smp_processor_id();
1512
1513        remove_siblinginfo(cpu);
1514
1515        /* It's now safe to remove this processor from the online map */
1516        lock_vector_lock();
1517        remove_cpu_from_maps(cpu);
1518        unlock_vector_lock();
1519        fixup_irqs();
1520}
1521
1522int native_cpu_disable(void)
1523{
1524        int ret;
1525
1526        ret = check_irq_vectors_for_cpu_disable();
1527        if (ret)
1528                return ret;
1529
1530        clear_local_APIC();
1531        cpu_disable_common();
1532
1533        return 0;
1534}
1535
1536int common_cpu_die(unsigned int cpu)
1537{
1538        int ret = 0;
1539
1540        /* We don't do anything here: idle task is faking death itself. */
1541
1542        /* They ack this in play_dead() by setting CPU_DEAD */
1543        if (cpu_wait_death(cpu, 5)) {
1544                if (system_state == SYSTEM_RUNNING)
1545                        pr_info("CPU %u is now offline\n", cpu);
1546        } else {
1547                pr_err("CPU %u didn't die...\n", cpu);
1548                ret = -1;
1549        }
1550
1551        return ret;
1552}
1553
1554void native_cpu_die(unsigned int cpu)
1555{
1556        common_cpu_die(cpu);
1557}
1558
1559void play_dead_common(void)
1560{
1561        idle_task_exit();
1562        reset_lazy_tlbstate();
1563        amd_e400_remove_cpu(raw_smp_processor_id());
1564
1565        /* Ack it */
1566        (void)cpu_report_death();
1567
1568        /*
1569         * With physical CPU hotplug, we should halt the cpu
1570         */
1571        local_irq_disable();
1572}
1573
1574static bool wakeup_cpu0(void)
1575{
1576        if (smp_processor_id() == 0 && enable_start_cpu0)
1577                return true;
1578
1579        return false;
1580}
1581
1582/*
1583 * We need to flush the caches before going to sleep, lest we have
1584 * dirty data in our caches when we come back up.
1585 */
1586static inline void mwait_play_dead(void)
1587{
1588        unsigned int eax, ebx, ecx, edx;
1589        unsigned int highest_cstate = 0;
1590        unsigned int highest_subcstate = 0;
1591        void *mwait_ptr;
1592        int i;
1593
1594        if (!this_cpu_has(X86_FEATURE_MWAIT))
1595                return;
1596        if (!this_cpu_has(X86_FEATURE_CLFLUSH))
1597                return;
1598        if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1599                return;
1600
1601        eax = CPUID_MWAIT_LEAF;
1602        ecx = 0;
1603        native_cpuid(&eax, &ebx, &ecx, &edx);
1604
1605        /*
1606         * eax will be 0 if EDX enumeration is not valid.
1607         * Initialized below to cstate, sub_cstate value when EDX is valid.
1608         */
1609        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1610                eax = 0;
1611        } else {
1612                edx >>= MWAIT_SUBSTATE_SIZE;
1613                for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1614                        if (edx & MWAIT_SUBSTATE_MASK) {
1615                                highest_cstate = i;
1616                                highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1617                        }
1618                }
1619                eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1620                        (highest_subcstate - 1);
1621        }
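        /*
         * eax now holds the MWAIT hint: bits 7:4 select the deepest
         * supported C-state and bits 3:0 its deepest sub-state.
         */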
1622
1623        /*
1624         * This should be a memory location in a cache line which is
1625         * unlikely to be touched by other processors.  The actual
1626         * content is immaterial as it is not actually modified in any way.
1627         */
1628        mwait_ptr = &current_thread_info()->flags;
1629
1630        wbinvd();
1631
1632        while (1) {
1633                /*
1634                 * The CLFLUSH is a workaround for erratum AAI65 for
1635                 * the Xeon 7400 series.  It's not clear it is actually
1636                 * needed, but it should be harmless in either case.
1637                 * The WBINVD is insufficient due to the spurious-wakeup
1638                 * case where we return around the loop.
1639                 */
1640                mb();
1641                clflush(mwait_ptr);
1642                mb();
1643                __monitor(mwait_ptr, 0, 0);
1644                mb();
1645                __mwait(eax, 0);
1646                /*
1647                 * If NMI wants to wake up CPU0, start CPU0.
1648                 */
1649                if (wakeup_cpu0())
1650                        start_cpu0();
1651        }
1652}
1653
1654void hlt_play_dead(void)
1655{
1656        if (__this_cpu_read(cpu_info.x86) >= 4)
1657                wbinvd();
1658
1659        while (1) {
1660                native_halt();
1661                /*
1662                 * If NMI wants to wake up CPU0, start CPU0.
1663                 */
1664                if (wakeup_cpu0())
1665                        start_cpu0();
1666        }
1667}
1668
1669void native_play_dead(void)
1670{
1671        play_dead_common();
1672        tboot_shutdown(TB_SHUTDOWN_WFS);
1673
1674        mwait_play_dead();      /* Only returns on failure */
1675        if (cpuidle_play_dead())
1676                hlt_play_dead();
1677}
1678
1679#else /* ... !CONFIG_HOTPLUG_CPU */
1680int native_cpu_disable(void)
1681{
1682        return -ENOSYS;
1683}
1684
1685void native_cpu_die(unsigned int cpu)
1686{
1687        /* We said "no" in __cpu_disable */
1688        BUG();
1689}
1690
1691void native_play_dead(void)
1692{
1693        BUG();
1694}
1695
1696#endif
1697