linux/arch/x86/mm/srat_64.c
<<
>>
Prefs
   1/*
   2 * ACPI 3.0 based NUMA setup
   3 * Copyright 2004 Andi Kleen, SuSE Labs.
   4 *
   5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
   6 *
   7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
   8 * Assumes all memory regions belonging to a single proximity domain
   9 * are in one chunk. Holes between them will be included in the node.
  10 */
  11
  12#include <linux/kernel.h>
  13#include <linux/acpi.h>
  14#include <linux/mmzone.h>
  15#include <linux/bitmap.h>
  16#include <linux/module.h>
  17#include <linux/topology.h>
  18#include <linux/bootmem.h>
  19#include <linux/mm.h>
  20#include <asm/proto.h>
  21#include <asm/numa.h>
  22#include <asm/e820.h>
  23#include <asm/apic.h>
  24#include <asm/uv/uv.h>
  25
/*
 * SRAT parse state: set to 1 once a CPU affinity entry has been accepted,
 * set to -1 by bad_srat() when the table is rejected.
 */
int acpi_numa __initdata;

/* Copy of the SLIT saved by acpi_numa_slit_init(); stays NULL without one. */
static struct acpi_table_slit *acpi_slit;

/* Nodes that had a memory affinity entry accepted. */
static nodemask_t nodes_parsed __initdata;
/* Nodes that had a CPU (LAPIC / x2APIC) affinity entry accepted. */
static nodemask_t cpu_nodes_parsed __initdata;
/* Per-node memory span accumulated from the memory affinity entries. */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
/* Per-node hot-add span; not __initdata, memory_add_physaddr_to_nid() reads it. */
static struct bootnode nodes_add[MAX_NUMNODES];

/* Every accepted memory affinity range, with the owning node in memblk_nodeid[]. */
static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
  38
  39static __init int setup_node(int pxm)
  40{
  41        return acpi_map_pxm_to_node(pxm);
  42}
  43
  44static __init int conflicting_memblks(unsigned long start, unsigned long end)
  45{
  46        int i;
  47        for (i = 0; i < num_node_memblks; i++) {
  48                struct bootnode *nd = &node_memblk_range[i];
  49                if (nd->start == nd->end)
  50                        continue;
  51                if (nd->end > start && nd->start < end)
  52                        return memblk_nodeid[i];
  53                if (nd->end == end && nd->start == start)
  54                        return memblk_nodeid[i];
  55        }
  56        return -1;
  57}
  58
  59static __init void cutoff_node(int i, unsigned long start, unsigned long end)
  60{
  61        struct bootnode *nd = &nodes[i];
  62
  63        if (nd->start < start) {
  64                nd->start = start;
  65                if (nd->end < nd->start)
  66                        nd->start = nd->end;
  67        }
  68        if (nd->end > end) {
  69                nd->end = end;
  70                if (nd->start > nd->end)
  71                        nd->start = nd->end;
  72        }
  73}
  74
  75static __init void bad_srat(void)
  76{
  77        int i;
  78        printk(KERN_ERR "SRAT: SRAT not used.\n");
  79        acpi_numa = -1;
  80        for (i = 0; i < MAX_LOCAL_APIC; i++)
  81                apicid_to_node[i] = NUMA_NO_NODE;
  82        for (i = 0; i < MAX_NUMNODES; i++) {
  83                nodes[i].start = nodes[i].end = 0;
  84                nodes_add[i].start = nodes_add[i].end = 0;
  85        }
  86        remove_all_active_ranges();
  87}
  88
  89static __init inline int srat_disabled(void)
  90{
  91        return numa_off || acpi_numa < 0;
  92}
  93
  94/* Callback for SLIT parsing */
  95void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
  96{
  97        unsigned length;
  98        unsigned long phys;
  99
 100        length = slit->header.length;
 101        phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
 102                 PAGE_SIZE);
 103
 104        if (phys == -1L)
 105                panic(" Can not save slit!\n");
 106
 107        acpi_slit = __va(phys);
 108        memcpy(acpi_slit, slit, length);
 109        reserve_early(phys, phys + length, "ACPI SLIT");
 110}
 111
 112/* Callback for Proximity Domain -> x2APIC mapping */
 113void __init
 114acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 115{
 116        int pxm, node;
 117        int apic_id;
 118
 119        if (srat_disabled())
 120                return;
 121        if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
 122                bad_srat();
 123                return;
 124        }
 125        if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 126                return;
 127        pxm = pa->proximity_domain;
 128        node = setup_node(pxm);
 129        if (node < 0) {
 130                printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
 131                bad_srat();
 132                return;
 133        }
 134
 135        apic_id = pa->apic_id;
 136        apicid_to_node[apic_id] = node;
 137        node_set(node, cpu_nodes_parsed);
 138        acpi_numa = 1;
 139        printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
 140               pxm, apic_id, node);
 141}
 142
 143/* Callback for Proximity Domain -> LAPIC mapping */
 144void __init
 145acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 146{
 147        int pxm, node;
 148        int apic_id;
 149
 150        if (srat_disabled())
 151                return;
 152        if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
 153                bad_srat();
 154                return;
 155        }
 156        if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 157                return;
 158        pxm = pa->proximity_domain_lo;
 159        node = setup_node(pxm);
 160        if (node < 0) {
 161                printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
 162                bad_srat();
 163                return;
 164        }
 165
 166        if (get_uv_system_type() >= UV_X2APIC)
 167                apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
 168        else
 169                apic_id = pa->apic_id;
 170        apicid_to_node[apic_id] = node;
 171        node_set(node, cpu_nodes_parsed);
 172        acpi_numa = 1;
 173        printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
 174               pxm, apic_id, node);
 175}
 176
 177#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 178static inline int save_add_info(void) {return 1;}
 179#else
 180static inline int save_add_info(void) {return 0;}
 181#endif
 182/*
 183 * Update nodes_add[]
 184 * This code supports one contiguous hot add area per node
 185 */
 186static void __init
 187update_nodes_add(int node, unsigned long start, unsigned long end)
 188{
 189        unsigned long s_pfn = start >> PAGE_SHIFT;
 190        unsigned long e_pfn = end >> PAGE_SHIFT;
 191        int changed = 0;
 192        struct bootnode *nd = &nodes_add[node];
 193
 194        /* I had some trouble with strange memory hotadd regions breaking
 195           the boot. Be very strict here and reject anything unexpected.
 196           If you want working memory hotadd write correct SRATs.
 197
 198           The node size check is a basic sanity check to guard against
 199           mistakes */
 200        if ((signed long)(end - start) < NODE_MIN_SIZE) {
 201                printk(KERN_ERR "SRAT: Hotplug area too small\n");
 202                return;
 203        }
 204
 205        /* This check might be a bit too strict, but I'm keeping it for now. */
 206        if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
 207                printk(KERN_ERR
 208                        "SRAT: Hotplug area %lu -> %lu has existing memory\n",
 209                        s_pfn, e_pfn);
 210                return;
 211        }
 212
 213        /* Looks good */
 214
 215        if (nd->start == nd->end) {
 216                nd->start = start;
 217                nd->end = end;
 218                changed = 1;
 219        } else {
 220                if (nd->start == end) {
 221                        nd->start = start;
 222                        changed = 1;
 223                }
 224                if (nd->end == start) {
 225                        nd->end = end;
 226                        changed = 1;
 227                }
 228                if (!changed)
 229                        printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
 230        }
 231
 232        if (changed)
 233                printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
 234                                 nd->start, nd->end);
 235}
 236
 237/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 238void __init
 239acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 240{
 241        struct bootnode *nd, oldnode;
 242        unsigned long start, end;
 243        int node, pxm;
 244        int i;
 245
 246        if (srat_disabled())
 247                return;
 248        if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
 249                bad_srat();
 250                return;
 251        }
 252        if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
 253                return;
 254
 255        if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
 256                return;
 257        start = ma->base_address;
 258        end = start + ma->length;
 259        pxm = ma->proximity_domain;
 260        node = setup_node(pxm);
 261        if (node < 0) {
 262                printk(KERN_ERR "SRAT: Too many proximity domains.\n");
 263                bad_srat();
 264                return;
 265        }
 266        i = conflicting_memblks(start, end);
 267        if (i == node) {
 268                printk(KERN_WARNING
 269                "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
 270                        pxm, start, end, nodes[i].start, nodes[i].end);
 271        } else if (i >= 0) {
 272                printk(KERN_ERR
 273                       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
 274                       pxm, start, end, node_to_pxm(i),
 275                        nodes[i].start, nodes[i].end);
 276                bad_srat();
 277                return;
 278        }
 279        nd = &nodes[node];
 280        oldnode = *nd;
 281        if (!node_test_and_set(node, nodes_parsed)) {
 282                nd->start = start;
 283                nd->end = end;
 284        } else {
 285                if (start < nd->start)
 286                        nd->start = start;
 287                if (nd->end < end)
 288                        nd->end = end;
 289        }
 290
 291        printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 292               start, end);
 293        e820_register_active_regions(node, start >> PAGE_SHIFT,
 294                                     end >> PAGE_SHIFT);
 295
 296        if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
 297                update_nodes_add(node, start, end);
 298                /* restore nodes[node] */
 299                *nd = oldnode;
 300                if ((nd->start | nd->end) == 0)
 301                        node_clear(node, nodes_parsed);
 302        }
 303
 304        node_memblk_range[num_node_memblks].start = start;
 305        node_memblk_range[num_node_memblks].end = end;
 306        memblk_nodeid[num_node_memblks] = node;
 307        num_node_memblks++;
 308}
 309
 310/* Sanity check to catch more bad SRATs (they are amazingly common).
 311   Make sure the PXMs cover all memory. */
 312static int __init nodes_cover_memory(const struct bootnode *nodes)
 313{
 314        int i;
 315        unsigned long pxmram, e820ram;
 316
 317        pxmram = 0;
 318        for_each_node_mask(i, nodes_parsed) {
 319                unsigned long s = nodes[i].start >> PAGE_SHIFT;
 320                unsigned long e = nodes[i].end >> PAGE_SHIFT;
 321                pxmram += e - s;
 322                pxmram -= absent_pages_in_range(s, e);
 323                if ((long)pxmram < 0)
 324                        pxmram = 0;
 325        }
 326
 327        e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
 328        /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
 329        if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
 330                printk(KERN_ERR
 331        "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
 332                        (pxmram << PAGE_SHIFT) >> 20,
 333                        (e820ram << PAGE_SHIFT) >> 20);
 334                return 0;
 335        }
 336        return 1;
 337}
 338
/* Architecture fixup hook; intentionally empty here, nothing needs fixing up. */
void __init acpi_numa_arch_fixup(void) {}
 340
 341/* Use the information discovered above to actually set up the nodes. */
 342int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 343{
 344        int i;
 345
 346        if (acpi_numa <= 0)
 347                return -1;
 348
 349        /* First clean up the node list */
 350        for (i = 0; i < MAX_NUMNODES; i++)
 351                cutoff_node(i, start, end);
 352
 353        if (!nodes_cover_memory(nodes)) {
 354                bad_srat();
 355                return -1;
 356        }
 357
 358        memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
 359                                           memblk_nodeid);
 360        if (memnode_shift < 0) {
 361                printk(KERN_ERR
 362                     "SRAT: No NUMA node hash function found. Contact maintainer\n");
 363                bad_srat();
 364                return -1;
 365        }
 366
 367        /* Account for nodes with cpus and no memory */
 368        nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
 369
 370        /* Finally register nodes */
 371        for_each_node_mask(i, node_possible_map)
 372                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 373        /* Try again in case setup_node_bootmem missed one due
 374           to missing bootmem */
 375        for_each_node_mask(i, node_possible_map)
 376                if (!node_online(i))
 377                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 378
 379        for (i = 0; i < nr_cpu_ids; i++) {
 380                int node = early_cpu_to_node(i);
 381
 382                if (node == NUMA_NO_NODE)
 383                        continue;
 384                if (!node_online(node))
 385                        numa_clear_node(i);
 386        }
 387        numa_init_array();
 388        return 0;
 389}
 390
 391#ifdef CONFIG_NUMA_EMU
/* Emulated node id -> proximity domain; PXM_INVAL until acpi_fake_nodes() fills it. */
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
        [0 ... MAX_NUMNODES-1] = PXM_INVAL
};
/* APIC id -> emulated node id; copied over apicid_to_node[] by acpi_fake_nodes(). */
static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
 398static int __init find_node_by_addr(unsigned long addr)
 399{
 400        int ret = NUMA_NO_NODE;
 401        int i;
 402
 403        for_each_node_mask(i, nodes_parsed) {
 404                /*
 405                 * Find the real node that this emulated node appears on.  For
 406                 * the sake of simplicity, we only use a real node's starting
 407                 * address to determine which emulated node it appears on.
 408                 */
 409                if (addr >= nodes[i].start && addr < nodes[i].end) {
 410                        ret = i;
 411                        break;
 412                }
 413        }
 414        return ret;
 415}
 416
 417/*
 418 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 419 * mappings that respect the real ACPI topology but reflect our emulated
 420 * environment.  For each emulated node, we find which real node it appears on
 421 * and create PXM to NID mappings for those fake nodes which mirror that
 422 * locality.  SLIT will now represent the correct distances between emulated
 423 * nodes as a result of the real topology.
 424 */
 425void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 426{
 427        int i, j;
 428
 429        printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
 430                         "topology.\n");
 431        for (i = 0; i < num_nodes; i++) {
 432                int nid, pxm;
 433
 434                nid = find_node_by_addr(fake_nodes[i].start);
 435                if (nid == NUMA_NO_NODE)
 436                        continue;
 437                pxm = node_to_pxm(nid);
 438                if (pxm == PXM_INVAL)
 439                        continue;
 440                fake_node_to_pxm_map[i] = pxm;
 441                /*
 442                 * For each apicid_to_node mapping that exists for this real
 443                 * node, it must now point to the fake node ID.
 444                 */
 445                for (j = 0; j < MAX_LOCAL_APIC; j++)
 446                        if (apicid_to_node[j] == nid)
 447                                fake_apicid_to_node[j] = i;
 448        }
 449        for (i = 0; i < num_nodes; i++)
 450                __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
 451        memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
 452
 453        nodes_clear(nodes_parsed);
 454        for (i = 0; i < num_nodes; i++)
 455                if (fake_nodes[i].start != fake_nodes[i].end)
 456                        node_set(i, nodes_parsed);
 457        WARN_ON(!nodes_cover_memory(fake_nodes));
 458}
 459
 460static int null_slit_node_compare(int a, int b)
 461{
 462        return node_to_pxm(a) == node_to_pxm(b);
 463}
 464#else
 465static int null_slit_node_compare(int a, int b)
 466{
 467        return a == b;
 468}
 469#endif /* CONFIG_NUMA_EMU */
 470
 471int __node_distance(int a, int b)
 472{
 473        int index;
 474
 475        if (!acpi_slit)
 476                return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
 477                                                      REMOTE_DISTANCE;
 478        index = acpi_slit->locality_count * node_to_pxm(a);
 479        return acpi_slit->entry[index + node_to_pxm(b)];
 480}
 481
 482EXPORT_SYMBOL(__node_distance);
 483
 484#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
 485int memory_add_physaddr_to_nid(u64 start)
 486{
 487        int i, ret = 0;
 488
 489        for_each_node(i)
 490                if (nodes_add[i].start <= start && nodes_add[i].end > start)
 491                        ret = i;
 492
 493        return ret;
 494}
 495EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 496#endif
 497