linux/arch/x86/mm/srat_64.c
/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>

int acpi_numa __initdata;

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;
/* Nodes that are too small confuse the VM badly; they usually result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

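/* Map a proximity domain to a logical node id, allocating one if needed. */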
static __init int setup_node(int pxm)
{
        return acpi_map_pxm_to_node(pxm);
}

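/*
 * Check whether [start, end) overlaps a node that was already parsed.
 * Returns the conflicting node id, or -1 if there is no overlap.
 * Empty nodes (start == end) are skipped.
 */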
static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
        int i;
        for_each_node_mask(i, nodes_parsed) {
                struct bootnode *nd = &nodes[i];
                if (nd->start == nd->end)
                        continue;
                if (nd->end > start && nd->start < end)
                        return i;
                if (nd->end == end && nd->start == start)
                        return i;
        }
        return -1;
}

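/*
 * Clamp node i's range to [start, end); a node that ends up inverted is
 * collapsed to an empty range.  Skipped once a hot-add area was found,
 * since hot-add ranges may lie beyond the current end of memory.
 */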
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
        struct bootnode *nd = &nodes[i];

        if (found_add_area)
                return;

        if (nd->start < start) {
                nd->start = start;
                if (nd->end < nd->start)
                        nd->start = nd->end;
        }
        if (nd->end > end) {
                nd->end = end;
                if (nd->start > nd->end)
                        nd->start = nd->end;
        }
}

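/* Throw away all SRAT state after an inconsistent table was detected. */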
static __init void bad_srat(void)
{
        int i;
        printk(KERN_ERR "SRAT: SRAT not used.\n");
        acpi_numa = -1;
        found_add_area = 0;
        for (i = 0; i < MAX_LOCAL_APIC; i++)
                apicid_to_node[i] = NUMA_NO_NODE;
        for (i = 0; i < MAX_NUMNODES; i++) {
                nodes[i].start = nodes[i].end = 0;
                nodes_add[i].start = nodes_add[i].end = 0;
        }
        remove_all_active_ranges();
}

static __init inline int srat_disabled(void)
{
        return numa_off || acpi_numa < 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
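/*
 * For example, a sane SLIT for two nodes looks like:
 *
 *              to 0    to 1
 *      from 0    10      20
 *      from 1    20      10
 *
 * i.e. LOCAL_DISTANCE on the diagonal and something strictly larger
 * everywhere else.
 */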
static __init int slit_valid(struct acpi_table_slit *slit)
{
        int i, j;
        int d = slit->locality_count;
        for (i = 0; i < d; i++) {
                for (j = 0; j < d; j++) {
                        u8 val = slit->entry[d*i + j];
                        if (i == j) {
                                if (val != LOCAL_DISTANCE)
                                        return 0;
                        } else if (val <= LOCAL_DISTANCE)
                                return 0;
                }
        }
        return 1;
}

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
        if (!slit_valid(slit)) {
                printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
                return;
        }
        acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
        int pxm, node;
        if (srat_disabled())
                return;
        if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
                bad_srat();
                return;
        }
        if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
                return;
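        /* Only the low 8 bits of the proximity domain are used here. */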
        pxm = pa->proximity_domain_lo;
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
                bad_srat();
                return;
        }
        apicid_to_node[pa->apic_id] = node;
        acpi_numa = 1;
        printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
               pxm, pa->apic_id, node);
}

#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
        static unsigned long allocated;
        static unsigned long last_area_end;
        unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
        long mem = pages * sizeof(struct page);
        unsigned long addr;
        unsigned long allowed;
        unsigned long oldpages = pages;

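        /*
         * mem is the space needed for this area's struct page array; a
         * negative value means the multiplication overflowed.
         */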
        if (mem < 0)
                return 0;
        allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
        allowed = (allowed / 100) * hotadd_percent;
        if (allocated + mem > allowed) {
                unsigned long range;
                /* Give them at least part of their hotadd memory, up to
                   hotadd_percent.  It would be better to spread the limit
                   out over multiple hotplug areas, but that is too
                   complicated right now. */
                if (allocated >= allowed)
                        return 0;
                range = allowed - allocated;
                pages = (range / PAGE_SIZE);
                mem = pages * sizeof(struct page);
                nd->end = nd->start + range;
        }
        /* Not completely foolproof, but a good sanity check */
        addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
        if (addr == -1UL)
                return 0;
        if (pages != oldpages)
                printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
                        pages << PAGE_SHIFT);
        last_area_end = addr + mem;
        allocated += mem;
        return 1;
}

static int update_end_of_memory(unsigned long end)
{
        found_add_area = 1;
        if ((end >> PAGE_SHIFT) > end_pfn)
                end_pfn = end >> PAGE_SHIFT;
        return 1;
}

static inline int save_add_info(void)
{
        return hotadd_percent > 0;
}
#else
int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
#endif
/*
 * Update nodes_add and decide whether to include the hot-add area in
 * the zone.  Both SPARSE and RESERVE need the nodes_add information.
 * This code supports one contiguous hot add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
        unsigned long s_pfn = start >> PAGE_SHIFT;
        unsigned long e_pfn = end >> PAGE_SHIFT;
        int ret = 0, changed = 0;
        struct bootnode *nd = &nodes_add[node];

        /* I had some trouble with strange memory hotadd regions breaking
           the boot. Be very strict here and reject anything unexpected.
           If you want working memory hotadd, write correct SRATs.

           The node size check is a basic sanity check to guard against
           mistakes. */
        if ((signed long)(end - start) < NODE_MIN_SIZE) {
                printk(KERN_ERR "SRAT: Hotplug area too small\n");
                return -1;
        }

        /* This check might be a bit too strict, but I'm keeping it for now. */
        if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
                printk(KERN_ERR
                        "SRAT: Hotplug area %lu -> %lu has existing memory\n",
                        s_pfn, e_pfn);
                return -1;
        }

        if (!hotadd_enough_memory(&nodes_add[node])) {
                printk(KERN_ERR "SRAT: Hotplug area too large\n");
                return -1;
        }

        /* Looks good */

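        /* Grow the existing hot-add range only when the new range abuts it. */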
        if (nd->start == nd->end) {
                nd->start = start;
                nd->end = end;
                changed = 1;
        } else {
                if (nd->start == end) {
                        nd->start = start;
                        changed = 1;
                }
                if (nd->end == start) {
                        nd->end = end;
                        changed = 1;
                }
                if (!changed)
                        printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n");
        }

        ret = update_end_of_memory(nd->end);

        if (changed)
                printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
        return ret;
}

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
        struct bootnode *nd, oldnode;
        unsigned long start, end;
        int node, pxm;
        int i;

        if (srat_disabled())
                return;
        if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
                bad_srat();
                return;
        }
        if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
                return;

        if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
                return;
        start = ma->base_address;
        end = start + ma->length;
        pxm = ma->proximity_domain;
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains.\n");
                bad_srat();
                return;
        }
        i = conflicting_nodes(start, end);
        if (i == node) {
                printk(KERN_WARNING
                "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
                        pxm, start, end, nodes[i].start, nodes[i].end);
        } else if (i >= 0) {
                printk(KERN_ERR
                       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
                       pxm, start, end, node_to_pxm(i),
                        nodes[i].start, nodes[i].end);
                bad_srat();
                return;
        }
        nd = &nodes[node];
        oldnode = *nd;
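        /* The first range for a node sets it up; later ranges extend it. */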
        if (!node_test_and_set(node, nodes_parsed)) {
                nd->start = start;
                nd->end = end;
        } else {
                if (start < nd->start)
                        nd->start = start;
                if (nd->end < end)
                        nd->end = end;
        }

        printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
               nd->start, nd->end);
        e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
                                                nd->end >> PAGE_SHIFT);
        push_node_boundaries(node, nd->start >> PAGE_SHIFT,
                                                nd->end >> PAGE_SHIFT);

        if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
            (reserve_hotadd(node, start, end) < 0)) {
                /* Ignore hotadd region. Undo damage */
                printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
                *nd = oldnode;
                if ((nd->start | nd->end) == 0)
                        node_clear(node, nodes_parsed);
        }
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int __init nodes_cover_memory(const struct bootnode *nodes)
{
        int i;
        unsigned long pxmram, e820ram;

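        /* Count the present pages covered by the parsed PXMs. */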
        pxmram = 0;
        for_each_node_mask(i, nodes_parsed) {
                unsigned long s = nodes[i].start >> PAGE_SHIFT;
                unsigned long e = nodes[i].end >> PAGE_SHIFT;
                pxmram += e - s;
                pxmram -= absent_pages_in_range(s, e);
                if ((long)pxmram < 0)
                        pxmram = 0;
        }

        e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
        /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
        if ((long)(e820ram - pxmram) >= 1*1024*1024) {
                printk(KERN_ERR
        "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
                        (pxmram << PAGE_SHIFT) >> 20,
                        (e820ram << PAGE_SHIFT) >> 20);
                return 0;
        }
        return 1;
}

static void unparse_node(int node)
{
        int i;
        node_clear(node, nodes_parsed);
        for (i = 0; i < MAX_LOCAL_APIC; i++) {
                if (apicid_to_node[i] == node)
                        apicid_to_node[i] = NUMA_NO_NODE;
        }
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
        int i;

        if (acpi_numa <= 0)
                return -1;

        /* First clean up the node list */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cutoff_node(i, start, end);
                if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
                        unparse_node(i);
                        node_set_offline(i);
                }
        }

        if (!nodes_cover_memory(nodes)) {
                bad_srat();
                return -1;
        }

        memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
        if (memnode_shift < 0) {
                printk(KERN_ERR
                     "SRAT: No NUMA node hash function found. Contact maintainer\n");
                bad_srat();
                return -1;
        }

        node_possible_map = nodes_parsed;

        /* Finally register nodes */
        for_each_node_mask(i, node_possible_map)
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        /* Try again in case setup_node_bootmem missed one due
           to missing bootmem */
        for_each_node_mask(i, node_possible_map)
                if (!node_online(i))
                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);

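        /* Drop CPU->node mappings that point at nodes we just discarded. */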
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node(i) == NUMA_NO_NODE)
                        continue;
                if (!node_isset(cpu_to_node(i), node_possible_map))
                        numa_set_node(i, NUMA_NO_NODE);
        }
        numa_init_array();
        return 0;
}

#ifdef CONFIG_NUMA_EMU
static int __init find_node_by_addr(unsigned long addr)
{
        int ret = NUMA_NO_NODE;
        int i;

        for_each_node_mask(i, nodes_parsed) {
                /*
                 * Find the real node that this emulated node appears on.  For
                 * the sake of simplicity, we only use the emulated node's
                 * starting address to determine which real node it appears on.
                 */
                if (addr >= nodes[i].start && addr < nodes[i].end) {
                        ret = i;
                        break;
                }
        }
        return ret;
}

/*
 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
 * mappings that respect the real ACPI topology but reflect our emulated
 * environment.  For each emulated node, we find which real node it appears on
 * and create PXM to NID mappings for those fake nodes which mirror that
 * locality.  SLIT will now represent the correct distances between emulated
 * nodes as a result of the real topology.
 */
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
        int i, j;
        int fake_node_to_pxm_map[MAX_NUMNODES] = {
                [0 ... MAX_NUMNODES-1] = PXM_INVAL
        };
        unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
                [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
        };

        printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
                         "topology.\n");
        for (i = 0; i < num_nodes; i++) {
                int nid, pxm;

                nid = find_node_by_addr(fake_nodes[i].start);
                if (nid == NUMA_NO_NODE)
                        continue;
                pxm = node_to_pxm(nid);
                if (pxm == PXM_INVAL)
                        continue;
                fake_node_to_pxm_map[i] = pxm;
                /*
                 * For each apicid_to_node mapping that exists for this real
                 * node, it must now point to the fake node ID.
                 */
                for (j = 0; j < MAX_LOCAL_APIC; j++)
                        if (apicid_to_node[j] == nid)
                                fake_apicid_to_node[j] = i;
        }
        for (i = 0; i < num_nodes; i++)
                __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
        memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));

        nodes_clear(nodes_parsed);
        for (i = 0; i < num_nodes; i++)
                if (fake_nodes[i].start != fake_nodes[i].end)
                        node_set(i, nodes_parsed);
        WARN_ON(!nodes_cover_memory(fake_nodes));
}

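/*
 * Without a SLIT, nodes that map to the same PXM (which can happen with
 * NUMA emulation) are treated as local to each other.
 */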
static int null_slit_node_compare(int a, int b)
{
        return node_to_pxm(a) == node_to_pxm(b);
}
#else
static int null_slit_node_compare(int a, int b)
{
        return a == b;
}
#endif /* CONFIG_NUMA_EMU */

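/*
 * Reserve a node's hot-add window in bootmem so that nothing else is
 * allocated there before the memory is actually hot-added.
 */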
void __init srat_reserve_add_area(int nodeid)
{
        if (found_add_area && nodes_add[nodeid].end) {
                u64 total_mb;

                printk(KERN_INFO "SRAT: Reserving hot-add memory space "
                                "for node %d at %Lx-%Lx\n",
                        nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
                total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
                                        >> PAGE_SHIFT;
                total_mb *= sizeof(struct page);
                total_mb >>= 20;
                printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
                                "pre-allocated memory.\n", (unsigned long long)total_mb);
                reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
                               nodes_add[nodeid].end - nodes_add[nodeid].start);
        }
}

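/*
 * Report the SLIT distance between two nodes.  Without a SLIT, fall
 * back to LOCAL_DISTANCE for (what looks like) the same node and
 * REMOTE_DISTANCE otherwise.
 */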
int __node_distance(int a, int b)
{
        int index;

        if (!acpi_slit)
                return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
                                                      REMOTE_DISTANCE;
        index = acpi_slit->locality_count * node_to_pxm(a);
        return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);

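/*
 * Map a hot-added physical address to the node whose hot-add window
 * contains it; falls back to node 0 when no window matches.
 */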
int memory_add_physaddr_to_nid(u64 start)
{
        int i, ret = 0;

        for_each_node(i)
                if (nodes_add[i].start <= start && nodes_add[i].end > start)
                        ret = i;

        return ret;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);