linux/arch/powerpc/mm/numa.c
   1/*
   2 * pSeries NUMA support
   3 *
   4 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
   5 *
   6 * This program is free software; you can redistribute it and/or
   7 * modify it under the terms of the GNU General Public License
   8 * as published by the Free Software Foundation; either version
   9 * 2 of the License, or (at your option) any later version.
  10 */
  11#include <linux/threads.h>
  12#include <linux/bootmem.h>
  13#include <linux/init.h>
  14#include <linux/mm.h>
  15#include <linux/mmzone.h>
  16#include <linux/export.h>
  17#include <linux/nodemask.h>
  18#include <linux/cpu.h>
  19#include <linux/notifier.h>
  20#include <linux/memblock.h>
  21#include <linux/of.h>
  22#include <linux/pfn.h>
  23#include <linux/cpuset.h>
  24#include <linux/node.h>
  25#include <asm/sparsemem.h>
  26#include <asm/prom.h>
  27#include <asm/smp.h>
  28#include <asm/firmware.h>
  29#include <asm/paca.h>
  30#include <asm/hvcall.h>
  31#include <asm/setup.h>
  32
  33static int numa_enabled = 1;
  34
  35static char *cmdline __initdata;
  36
  37static int numa_debug;
   38#define dbg(args...) do { if (numa_debug) printk(KERN_INFO args); } while (0)
  39
  40int numa_cpu_lookup_table[NR_CPUS];
  41cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
  42struct pglist_data *node_data[MAX_NUMNODES];
  43
  44EXPORT_SYMBOL(numa_cpu_lookup_table);
  45EXPORT_SYMBOL(node_to_cpumask_map);
  46EXPORT_SYMBOL(node_data);
  47
  48static int min_common_depth;
  49static int n_mem_addr_cells, n_mem_size_cells;
  50static int form1_affinity;
  51
  52#define MAX_DISTANCE_REF_POINTS 4
  53static int distance_ref_points_depth;
  54static const unsigned int *distance_ref_points;
  55static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
  56
  57/*
  58 * Allocate node_to_cpumask_map based on number of available nodes
  59 * Requires node_possible_map to be valid.
  60 *
  61 * Note: cpumask_of_node() is not valid until after this is done.
  62 */
  63static void __init setup_node_to_cpumask_map(void)
  64{
  65        unsigned int node, num = 0;
  66
  67        /* setup nr_node_ids if not done yet */
  68        if (nr_node_ids == MAX_NUMNODES) {
  69                for_each_node_mask(node, node_possible_map)
  70                        num = node;
  71                nr_node_ids = num + 1;
  72        }
  73
  74        /* allocate the map */
  75        for (node = 0; node < nr_node_ids; node++)
  76                alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
  77
  78        /* cpumask_of_node() will now work */
  79        dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
  80}
  81
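/*
 * Fake NUMA support: the "numa=fake=" command line option (stashed in
 * 'cmdline' by early_numa() below) supplies a comma separated list of
 * memory boundaries, parsed one memparse() value at a time.  Whenever the
 * memory being registered extends past the next boundary, a new fake node
 * id is handed out through *nid.
 */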
  82static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
  83                                                unsigned int *nid)
  84{
  85        unsigned long long mem;
  86        char *p = cmdline;
  87        static unsigned int fake_nid;
  88        static unsigned long long curr_boundary;
  89
  90        /*
   91         * Modify the node id only if we have already started creating
   92         * NUMA nodes; we want to continue from where we left off last time.
  93         */
  94        if (fake_nid)
  95                *nid = fake_nid;
  96        /*
  97         * In case there are no more arguments to parse, the
  98         * node_id should be the same as the last fake node id
  99         * (we've handled this above).
 100         */
 101        if (!p)
 102                return 0;
 103
 104        mem = memparse(p, &p);
 105        if (!mem)
 106                return 0;
 107
 108        if (mem < curr_boundary)
 109                return 0;
 110
 111        curr_boundary = mem;
 112
 113        if ((end_pfn << PAGE_SHIFT) > mem) {
 114                /*
 115                 * Skip commas and spaces
 116                 */
 117                while (*p == ',' || *p == ' ' || *p == '\t')
 118                        p++;
 119
 120                cmdline = p;
 121                fake_nid++;
 122                *nid = fake_nid;
 123                dbg("created new fake_node with id %d\n", fake_nid);
 124                return 1;
 125        }
 126        return 0;
 127}
 128
 129/*
 130 * get_node_active_region - Return active region containing pfn
 131 * Active range returned is empty if none found.
 132 * @pfn: The page to return the region for
 133 * @node_ar: Returned set to the active region containing @pfn
 134 */
 135static void __init get_node_active_region(unsigned long pfn,
 136                                          struct node_active_region *node_ar)
 137{
 138        unsigned long start_pfn, end_pfn;
  139        int i, nid;

        /* Return an empty region if @pfn is not covered by any range. */
        node_ar->start_pfn = node_ar->end_pfn = 0;
  140
 141        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
 142                if (pfn >= start_pfn && pfn < end_pfn) {
 143                        node_ar->nid = nid;
 144                        node_ar->start_pfn = start_pfn;
 145                        node_ar->end_pfn = end_pfn;
 146                        break;
 147                }
 148        }
 149}
 150
 151static void map_cpu_to_node(int cpu, int node)
 152{
 153        numa_cpu_lookup_table[cpu] = node;
 154
 155        dbg("adding cpu %d to node %d\n", cpu, node);
 156
 157        if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
 158                cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
 159}
 160
 161#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
 162static void unmap_cpu_from_node(unsigned long cpu)
 163{
 164        int node = numa_cpu_lookup_table[cpu];
 165
 166        dbg("removing cpu %lu from node %d\n", cpu, node);
 167
 168        if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
 169                cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
 170        } else {
 171                printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
 172                       cpu, node);
 173        }
 174}
 175#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 176
 177/* must hold reference to node during call */
 178static const int *of_get_associativity(struct device_node *dev)
 179{
 180        return of_get_property(dev, "ibm,associativity", NULL);
 181}
 182
 183/*
 184 * Returns the property linux,drconf-usable-memory if
 185 * it exists (the property exists only in kexec/kdump kernels,
 186 * added by kexec-tools)
 187 */
 188static const u32 *of_get_usable_memory(struct device_node *memory)
 189{
 190        const u32 *prop;
 191        u32 len;
 192        prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
 193        if (!prop || len < sizeof(unsigned int))
 194                return 0;
 195        return prop;
 196}
 197
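/*
 * With form 1 affinity the distance between two nodes starts at
 * LOCAL_DISTANCE and doubles for every reference-point level at which
 * their associativity domains differ, stopping at the first level where
 * they match; for example, nodes that first match at the second level are
 * reported at 2 * LOCAL_DISTANCE.  Without form 1 affinity every node is
 * reported as local.
 */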
 198int __node_distance(int a, int b)
 199{
 200        int i;
 201        int distance = LOCAL_DISTANCE;
 202
 203        if (!form1_affinity)
 204                return distance;
 205
 206        for (i = 0; i < distance_ref_points_depth; i++) {
 207                if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
 208                        break;
 209
 210                /* Double the distance for each NUMA level */
 211                distance *= 2;
 212        }
 213
 214        return distance;
 215}
 216
 217static void initialize_distance_lookup_table(int nid,
 218                const unsigned int *associativity)
 219{
 220        int i;
 221
 222        if (!form1_affinity)
 223                return;
 224
 225        for (i = 0; i < distance_ref_points_depth; i++) {
 226                distance_lookup_table[nid][i] =
 227                        associativity[distance_ref_points[i]];
 228        }
 229}
 230
 231/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 232 * info is found.
 233 */
 234static int associativity_to_nid(const unsigned int *associativity)
 235{
 236        int nid = -1;
 237
 238        if (min_common_depth == -1)
 239                goto out;
 240
 241        if (associativity[0] >= min_common_depth)
 242                nid = associativity[min_common_depth];
 243
 244        /* POWER4 LPAR uses 0xffff as invalid node */
 245        if (nid == 0xffff || nid >= MAX_NUMNODES)
 246                nid = -1;
 247
 248        if (nid > 0 && associativity[0] >= distance_ref_points_depth)
 249                initialize_distance_lookup_table(nid, associativity);
 250
 251out:
 252        return nid;
 253}
 254
 255/* Returns the nid associated with the given device tree node,
 256 * or -1 if not found.
 257 */
 258static int of_node_to_nid_single(struct device_node *device)
 259{
 260        int nid = -1;
 261        const unsigned int *tmp;
 262
 263        tmp = of_get_associativity(device);
 264        if (tmp)
 265                nid = associativity_to_nid(tmp);
 266        return nid;
 267}
 268
 269/* Walk the device tree upwards, looking for an associativity id */
 270int of_node_to_nid(struct device_node *device)
 271{
 272        struct device_node *tmp;
 273        int nid = -1;
 274
 275        of_node_get(device);
 276        while (device) {
 277                nid = of_node_to_nid_single(device);
 278                if (nid != -1)
 279                        break;
 280
 281                tmp = device;
 282                device = of_get_parent(tmp);
 283                of_node_put(tmp);
 284        }
 285        of_node_put(device);
 286
 287        return nid;
 288}
 289EXPORT_SYMBOL_GPL(of_node_to_nid);
 290
 291static int __init find_min_common_depth(void)
 292{
 293        int depth;
 294        struct device_node *chosen;
 295        struct device_node *root;
 296        const char *vec5;
 297
 298        if (firmware_has_feature(FW_FEATURE_OPAL))
 299                root = of_find_node_by_path("/ibm,opal");
 300        else
 301                root = of_find_node_by_path("/rtas");
 302        if (!root)
 303                root = of_find_node_by_path("/");
 304
 305        /*
 306         * This property is a set of 32-bit integers, each representing
 307         * an index into the ibm,associativity nodes.
 308         *
 309         * With form 0 affinity the first integer is for an SMP configuration
 310         * (should be all 0's) and the second is for a normal NUMA
 311         * configuration. We have only one level of NUMA.
 312         *
 313         * With form 1 affinity the first integer is the most significant
 314         * NUMA boundary and the following are progressively less significant
 315         * boundaries. There can be more than one level of NUMA.
 316         */
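        /*
         * Illustration only (the real values come from firmware): with
         * form 1 affinity a property of <4 2> makes associativity level 4
         * the most significant NUMA boundary, so the depth returned (and
         * hence min_common_depth) is 4; with form 0 affinity the second
         * entry, 2, is used instead.
         */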
 317        distance_ref_points = of_get_property(root,
 318                                        "ibm,associativity-reference-points",
 319                                        &distance_ref_points_depth);
 320
 321        if (!distance_ref_points) {
 322                dbg("NUMA: ibm,associativity-reference-points not found.\n");
 323                goto err;
 324        }
 325
 326        distance_ref_points_depth /= sizeof(int);
 327
 328#define VEC5_AFFINITY_BYTE      5
 329#define VEC5_AFFINITY           0x80
 330
 331        if (firmware_has_feature(FW_FEATURE_OPAL))
 332                form1_affinity = 1;
 333        else {
 334                chosen = of_find_node_by_path("/chosen");
 335                if (chosen) {
 336                        vec5 = of_get_property(chosen,
 337                                               "ibm,architecture-vec-5", NULL);
 338                        if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
 339                                                        VEC5_AFFINITY)) {
 340                                dbg("Using form 1 affinity\n");
 341                                form1_affinity = 1;
 342                        }
 343
 344                        of_node_put(chosen);
 345                }
 346        }
 347
 348        if (form1_affinity) {
 349                depth = distance_ref_points[0];
 350        } else {
 351                if (distance_ref_points_depth < 2) {
 352                        printk(KERN_WARNING "NUMA: "
 353                                "short ibm,associativity-reference-points\n");
 354                        goto err;
 355                }
 356
 357                depth = distance_ref_points[1];
 358        }
 359
 360        /*
 361         * Warn and cap if the hardware supports more than
 362         * MAX_DISTANCE_REF_POINTS domains.
 363         */
 364        if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
 365                printk(KERN_WARNING "NUMA: distance array capped at "
 366                        "%d entries\n", MAX_DISTANCE_REF_POINTS);
 367                distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
 368        }
 369
 370        of_node_put(root);
 371        return depth;
 372
 373err:
 374        of_node_put(root);
 375        return -1;
 376}
 377
 378static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
 379{
 380        struct device_node *memory = NULL;
 381
 382        memory = of_find_node_by_type(memory, "memory");
 383        if (!memory)
 384                panic("numa.c: No memory nodes found!");
 385
 386        *n_addr_cells = of_n_addr_cells(memory);
 387        *n_size_cells = of_n_size_cells(memory);
 388        of_node_put(memory);
 389}
 390
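/*
 * Assemble an @n cell device tree quantity (most significant cell first)
 * into a single unsigned long, advancing *buf past the cells consumed.
 */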
 391static unsigned long read_n_cells(int n, const unsigned int **buf)
 392{
 393        unsigned long result = 0;
 394
 395        while (n--) {
 396                result = (result << 32) | **buf;
 397                (*buf)++;
 398        }
 399        return result;
 400}
 401
 402struct of_drconf_cell {
 403        u64     base_addr;
 404        u32     drc_index;
 405        u32     reserved;
 406        u32     aa_index;
 407        u32     flags;
 408};
 409
 410#define DRCONF_MEM_ASSIGNED     0x00000008
 411#define DRCONF_MEM_AI_INVALID   0x00000040
 412#define DRCONF_MEM_RESERVED     0x00000080
 413
 414/*
 415 * Read the next memblock list entry from the ibm,dynamic-memory property
 416 * and return the information in the provided of_drconf_cell structure.
 417 */
 418static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
 419{
 420        const u32 *cp;
 421
 422        drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
 423
 424        cp = *cellp;
 425        drmem->drc_index = cp[0];
 426        drmem->reserved = cp[1];
 427        drmem->aa_index = cp[2];
 428        drmem->flags = cp[3];
 429
 430        *cellp = cp + 4;
 431}
 432
 433/*
 434 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 435 *
  436 * The layout of the ibm,dynamic-memory property is a count N followed by
  437 * N memblock list entries.  Each memblock list entry contains information
  438 * as laid out in the of_drconf_cell struct above.
 439 */
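/*
 * Illustration only: with n_mem_addr_cells == 2, each entry occupies
 * 2 + 4 cells (a 64-bit base address followed by drc_index, reserved,
 * aa_index and flags), which is the "(n_mem_addr_cells + 4)" term in the
 * length check below.
 */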
 440static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
 441{
 442        const u32 *prop;
 443        u32 len, entries;
 444
 445        prop = of_get_property(memory, "ibm,dynamic-memory", &len);
 446        if (!prop || len < sizeof(unsigned int))
 447                return 0;
 448
 449        entries = *prop++;
 450
 451        /* Now that we know the number of entries, revalidate the size
 452         * of the property read in to ensure we have everything
 453         */
 454        if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
 455                return 0;
 456
 457        *dm = prop;
 458        return entries;
 459}
 460
 461/*
 462 * Retrieve and validate the ibm,lmb-size property for drconf memory
 463 * from the device tree.
 464 */
 465static u64 of_get_lmb_size(struct device_node *memory)
 466{
 467        const u32 *prop;
 468        u32 len;
 469
 470        prop = of_get_property(memory, "ibm,lmb-size", &len);
 471        if (!prop || len < sizeof(unsigned int))
 472                return 0;
 473
 474        return read_n_cells(n_mem_size_cells, &prop);
 475}
 476
 477struct assoc_arrays {
 478        u32     n_arrays;
 479        u32     array_sz;
 480        const u32 *arrays;
 481};
 482
 483/*
 484 * Retrieve and validate the list of associativity arrays for drconf
 485 * memory from the ibm,associativity-lookup-arrays property of the
  486 * device tree.
 487 *
 488 * The layout of the ibm,associativity-lookup-arrays property is a number N
 489 * indicating the number of associativity arrays, followed by a number M
 490 * indicating the size of each associativity array, followed by a list
 491 * of N associativity arrays.
 492 */
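/*
 * Illustration only: a property of <2 3  a0 a1 a2  b0 b1 b2> describes
 * N = 2 arrays of M = 3 entries each.  A drconf memory block with
 * aa_index 1 uses the second array (b0 b1 b2), from which
 * of_drconf_to_nid_single() picks entry min_common_depth - 1.
 */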
 493static int of_get_assoc_arrays(struct device_node *memory,
 494                               struct assoc_arrays *aa)
 495{
 496        const u32 *prop;
 497        u32 len;
 498
 499        prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
 500        if (!prop || len < 2 * sizeof(unsigned int))
 501                return -1;
 502
 503        aa->n_arrays = *prop++;
 504        aa->array_sz = *prop++;
 505
 506        /* Now that we know the number of arrays and size of each array,
 507         * revalidate the size of the property read in.
 508         */
 509        if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
 510                return -1;
 511
 512        aa->arrays = prop;
 513        return 0;
 514}
 515
 516/*
 517 * This is like of_node_to_nid_single() for memory represented in the
 518 * ibm,dynamic-reconfiguration-memory node.
 519 */
 520static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
 521                                   struct assoc_arrays *aa)
 522{
 523        int default_nid = 0;
 524        int nid = default_nid;
 525        int index;
 526
 527        if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
 528            !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
 529            drmem->aa_index < aa->n_arrays) {
 530                index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
 531                nid = aa->arrays[index];
 532
 533                if (nid == 0xffff || nid >= MAX_NUMNODES)
 534                        nid = default_nid;
 535        }
 536
 537        return nid;
 538}
 539
 540/*
 541 * Figure out to which domain a cpu belongs and stick it there.
 542 * Return the id of the domain used.
 543 */
 544static int __cpuinit numa_setup_cpu(unsigned long lcpu)
 545{
 546        int nid = 0;
 547        struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
 548
 549        if (!cpu) {
 550                WARN_ON(1);
 551                goto out;
 552        }
 553
 554        nid = of_node_to_nid_single(cpu);
 555
 556        if (nid < 0 || !node_online(nid))
 557                nid = first_online_node;
 558out:
 559        map_cpu_to_node(lcpu, nid);
 560
 561        of_node_put(cpu);
 562
 563        return nid;
 564}
 565
 566static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
 567                             unsigned long action,
 568                             void *hcpu)
 569{
 570        unsigned long lcpu = (unsigned long)hcpu;
 571        int ret = NOTIFY_DONE;
 572
 573        switch (action) {
 574        case CPU_UP_PREPARE:
 575        case CPU_UP_PREPARE_FROZEN:
 576                numa_setup_cpu(lcpu);
 577                ret = NOTIFY_OK;
 578                break;
 579#ifdef CONFIG_HOTPLUG_CPU
 580        case CPU_DEAD:
 581        case CPU_DEAD_FROZEN:
 582        case CPU_UP_CANCELED:
 583        case CPU_UP_CANCELED_FROZEN:
 584                unmap_cpu_from_node(lcpu);
  585                ret = NOTIFY_OK;
  586                break;
 587#endif
 588        }
 589        return ret;
 590}
 591
 592/*
 593 * Check and possibly modify a memory region to enforce the memory limit.
 594 *
 595 * Returns the size the region should have to enforce the memory limit.
 596 * This will either be the original value of size, a truncated value,
 597 * or zero. If the returned value of size is 0 the region should be
 598 * discarded as it lies wholly above the memory limit.
 599 */
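/*
 * For instance, with memblock_end_of_DRAM() at 4GB a region starting at
 * 3GB with a size of 2GB is trimmed to 1GB, while a region starting at
 * 5GB is dropped entirely (a size of 0 is returned).
 */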
 600static unsigned long __init numa_enforce_memory_limit(unsigned long start,
 601                                                      unsigned long size)
 602{
 603        /*
 604         * We use memblock_end_of_DRAM() in here instead of memory_limit because
 605         * we've already adjusted it for the limit and it takes care of
 606         * having memory holes below the limit.  Also, in the case of
 607         * iommu_is_off, memory_limit is not set but is implicitly enforced.
 608         */
 609
 610        if (start + size <= memblock_end_of_DRAM())
 611                return size;
 612
 613        if (start >= memblock_end_of_DRAM())
 614                return 0;
 615
 616        return memblock_end_of_DRAM() - start;
 617}
 618
 619/*
 620 * Reads the counter for a given entry in
 621 * linux,drconf-usable-memory property
 622 */
 623static inline int __init read_usm_ranges(const u32 **usm)
 624{
  625        /*
  626         * For each lmb in ibm,dynamic-memory, the corresponding entry in
  627         * the linux,drconf-usable-memory property contains a counter
  628         * followed by that many (base, size) pairs.  Read the counter
  629         * from linux,drconf-usable-memory.
  630         */
 631        return read_n_cells(n_mem_size_cells, usm);
 632}
 633
 634/*
 635 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 636 * node.  This assumes n_mem_{addr,size}_cells have been set.
 637 */
 638static void __init parse_drconf_memory(struct device_node *memory)
 639{
 640        const u32 *uninitialized_var(dm), *usm;
 641        unsigned int n, rc, ranges, is_kexec_kdump = 0;
 642        unsigned long lmb_size, base, size, sz;
 643        int nid;
 644        struct assoc_arrays aa = { .arrays = NULL };
 645
 646        n = of_get_drconf_memory(memory, &dm);
 647        if (!n)
 648                return;
 649
 650        lmb_size = of_get_lmb_size(memory);
 651        if (!lmb_size)
 652                return;
 653
 654        rc = of_get_assoc_arrays(memory, &aa);
 655        if (rc)
 656                return;
 657
 658        /* check if this is a kexec/kdump kernel */
 659        usm = of_get_usable_memory(memory);
 660        if (usm != NULL)
 661                is_kexec_kdump = 1;
 662
 663        for (; n != 0; --n) {
 664                struct of_drconf_cell drmem;
 665
 666                read_drconf_cell(&drmem, &dm);
 667
 668                /* skip this block if the reserved bit is set in flags (0x80)
 669                   or if the block is not assigned to this partition (0x8) */
 670                if ((drmem.flags & DRCONF_MEM_RESERVED)
 671                    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
 672                        continue;
 673
 674                base = drmem.base_addr;
 675                size = lmb_size;
 676                ranges = 1;
 677
 678                if (is_kexec_kdump) {
 679                        ranges = read_usm_ranges(&usm);
  680                        if (!ranges) /* there are no (base, size) pairs */
 681                                continue;
 682                }
 683                do {
 684                        if (is_kexec_kdump) {
 685                                base = read_n_cells(n_mem_addr_cells, &usm);
 686                                size = read_n_cells(n_mem_size_cells, &usm);
 687                        }
 688                        nid = of_drconf_to_nid_single(&drmem, &aa);
 689                        fake_numa_create_new_node(
 690                                ((base + size) >> PAGE_SHIFT),
 691                                           &nid);
 692                        node_set_online(nid);
 693                        sz = numa_enforce_memory_limit(base, size);
 694                        if (sz)
 695                                memblock_set_node(base, sz, nid);
 696                } while (--ranges);
 697        }
 698}
 699
 700static int __init parse_numa_properties(void)
 701{
 702        struct device_node *memory;
 703        int default_nid = 0;
 704        unsigned long i;
 705
 706        if (numa_enabled == 0) {
 707                printk(KERN_WARNING "NUMA disabled by user\n");
 708                return -1;
 709        }
 710
 711        min_common_depth = find_min_common_depth();
 712
 713        if (min_common_depth < 0)
 714                return min_common_depth;
 715
 716        dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
 717
 718        /*
 719         * Even though we connect cpus to numa domains later in SMP
 720         * init, we need to know the node ids now. This is because
 721         * each node to be onlined must have NODE_DATA etc backing it.
 722         */
 723        for_each_present_cpu(i) {
 724                struct device_node *cpu;
 725                int nid;
 726
 727                cpu = of_get_cpu_node(i, NULL);
 728                BUG_ON(!cpu);
 729                nid = of_node_to_nid_single(cpu);
 730                of_node_put(cpu);
 731
 732                /*
 733                 * Don't fall back to default_nid yet -- we will plug
 734                 * cpus into nodes once the memory scan has discovered
 735                 * the topology.
 736                 */
 737                if (nid < 0)
 738                        continue;
 739                node_set_online(nid);
 740        }
 741
 742        get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
 743
 744        for_each_node_by_type(memory, "memory") {
 745                unsigned long start;
 746                unsigned long size;
 747                int nid;
 748                int ranges;
 749                const unsigned int *memcell_buf;
 750                unsigned int len;
 751
 752                memcell_buf = of_get_property(memory,
 753                        "linux,usable-memory", &len);
 754                if (!memcell_buf || len <= 0)
 755                        memcell_buf = of_get_property(memory, "reg", &len);
 756                if (!memcell_buf || len <= 0)
 757                        continue;
 758
  759                /* number of (base, size) ranges encoded in the property */
 760                ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
 761new_range:
 762                /* these are order-sensitive, and modify the buffer pointer */
 763                start = read_n_cells(n_mem_addr_cells, &memcell_buf);
 764                size = read_n_cells(n_mem_size_cells, &memcell_buf);
 765
 766                /*
 767                 * Assumption: either all memory nodes or none will
 768                 * have associativity properties.  If none, then
 769                 * everything goes to default_nid.
 770                 */
 771                nid = of_node_to_nid_single(memory);
 772                if (nid < 0)
 773                        nid = default_nid;
 774
 775                fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
 776                node_set_online(nid);
 777
 778                if (!(size = numa_enforce_memory_limit(start, size))) {
 779                        if (--ranges)
 780                                goto new_range;
 781                        else
 782                                continue;
 783                }
 784
 785                memblock_set_node(start, size, nid);
 786
 787                if (--ranges)
 788                        goto new_range;
 789        }
 790
 791        /*
 792         * Now do the same thing for each MEMBLOCK listed in the
 793         * ibm,dynamic-memory property in the
 794         * ibm,dynamic-reconfiguration-memory node.
 795         */
 796        memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 797        if (memory)
 798                parse_drconf_memory(memory);
 799
 800        return 0;
 801}
 802
 803static void __init setup_nonnuma(void)
 804{
 805        unsigned long top_of_ram = memblock_end_of_DRAM();
 806        unsigned long total_ram = memblock_phys_mem_size();
 807        unsigned long start_pfn, end_pfn;
 808        unsigned int nid = 0;
 809        struct memblock_region *reg;
 810
 811        printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 812               top_of_ram, total_ram);
 813        printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 814               (top_of_ram - total_ram) >> 20);
 815
 816        for_each_memblock(memory, reg) {
 817                start_pfn = memblock_region_memory_base_pfn(reg);
 818                end_pfn = memblock_region_memory_end_pfn(reg);
 819
 820                fake_numa_create_new_node(end_pfn, &nid);
 821                memblock_set_node(PFN_PHYS(start_pfn),
 822                                  PFN_PHYS(end_pfn - start_pfn), nid);
 823                node_set_online(nid);
 824        }
 825}
 826
 827void __init dump_numa_cpu_topology(void)
 828{
 829        unsigned int node;
 830        unsigned int cpu, count;
 831
 832        if (min_common_depth == -1 || !numa_enabled)
 833                return;
 834
 835        for_each_online_node(node) {
 836                printk(KERN_DEBUG "Node %d CPUs:", node);
 837
 838                count = 0;
 839                /*
 840                 * If we used a CPU iterator here we would miss printing
 841                 * the holes in the cpumap.
 842                 */
 843                for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
 844                        if (cpumask_test_cpu(cpu,
 845                                        node_to_cpumask_map[node])) {
 846                                if (count == 0)
 847                                        printk(" %u", cpu);
 848                                ++count;
 849                        } else {
 850                                if (count > 1)
 851                                        printk("-%u", cpu - 1);
 852                                count = 0;
 853                        }
 854                }
 855
 856                if (count > 1)
 857                        printk("-%u", nr_cpu_ids - 1);
 858                printk("\n");
 859        }
 860}
 861
 862static void __init dump_numa_memory_topology(void)
 863{
 864        unsigned int node;
 865        unsigned int count;
 866
 867        if (min_common_depth == -1 || !numa_enabled)
 868                return;
 869
 870        for_each_online_node(node) {
 871                unsigned long i;
 872
 873                printk(KERN_DEBUG "Node %d Memory:", node);
 874
 875                count = 0;
 876
 877                for (i = 0; i < memblock_end_of_DRAM();
 878                     i += (1 << SECTION_SIZE_BITS)) {
 879                        if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
 880                                if (count == 0)
 881                                        printk(" 0x%lx", i);
 882                                ++count;
 883                        } else {
 884                                if (count > 0)
 885                                        printk("-0x%lx", i);
 886                                count = 0;
 887                        }
 888                }
 889
 890                if (count > 0)
 891                        printk("-0x%lx", i);
 892                printk("\n");
 893        }
 894}
 895
 896/*
 897 * Allocate some memory, satisfying the memblock or bootmem allocator where
  898 * required. nid is the preferred node and end_pfn is the page frame number
  899 * just past the highest address in the node (it bounds the allocation).
 900 *
 901 * Returns the virtual address of the memory.
 902 */
 903static void __init *careful_zallocation(int nid, unsigned long size,
 904                                       unsigned long align,
 905                                       unsigned long end_pfn)
 906{
 907        void *ret;
 908        int new_nid;
 909        unsigned long ret_paddr;
 910
 911        ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
 912
 913        /* retry over all memory */
 914        if (!ret_paddr)
 915                ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
 916
 917        if (!ret_paddr)
 918                panic("numa.c: cannot allocate %lu bytes for node %d",
 919                      size, nid);
 920
 921        ret = __va(ret_paddr);
 922
 923        /*
 924         * We initialize the nodes in numeric order: 0, 1, 2...
 925         * and hand over control from the MEMBLOCK allocator to the
 926         * bootmem allocator.  If this function is called for
 927         * node 5, then we know that all nodes <5 are using the
 928         * bootmem allocator instead of the MEMBLOCK allocator.
 929         *
 930         * So, check the nid from which this allocation came
 931         * and double check to see if we need to use bootmem
 932         * instead of the MEMBLOCK.  We don't free the MEMBLOCK memory
 933         * since it would be useless.
 934         */
 935        new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
 936        if (new_nid < nid) {
 937                ret = __alloc_bootmem_node(NODE_DATA(new_nid),
 938                                size, align, 0);
 939
 940                dbg("alloc_bootmem %p %lx\n", ret, size);
 941        }
 942
 943        memset(ret, 0, size);
 944        return ret;
 945}
 946
 947static struct notifier_block __cpuinitdata ppc64_numa_nb = {
 948        .notifier_call = cpu_numa_callback,
 949        .priority = 1 /* Must run before sched domains notifier. */
 950};
 951
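/*
 * Walk memblock.reserved and re-reserve, with the bootmem allocator, every
 * piece of each reserved region that overlaps @nid's memory.  Reserved
 * regions spanning several active ranges are split and handled one active
 * range at a time.
 */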
 952static void __init mark_reserved_regions_for_nid(int nid)
 953{
 954        struct pglist_data *node = NODE_DATA(nid);
 955        struct memblock_region *reg;
 956
 957        for_each_memblock(reserved, reg) {
 958                unsigned long physbase = reg->base;
 959                unsigned long size = reg->size;
 960                unsigned long start_pfn = physbase >> PAGE_SHIFT;
 961                unsigned long end_pfn = PFN_UP(physbase + size);
 962                struct node_active_region node_ar;
 963                unsigned long node_end_pfn = node->node_start_pfn +
 964                                             node->node_spanned_pages;
 965
 966                /*
 967                 * Check to make sure that this memblock.reserved area is
 968                 * within the bounds of the node that we care about.
 969                 * Checking the nid of the start and end points is not
 970                 * sufficient because the reserved area could span the
 971                 * entire node.
 972                 */
 973                if (end_pfn <= node->node_start_pfn ||
 974                    start_pfn >= node_end_pfn)
 975                        continue;
 976
 977                get_node_active_region(start_pfn, &node_ar);
 978                while (start_pfn < end_pfn &&
 979                        node_ar.start_pfn < node_ar.end_pfn) {
 980                        unsigned long reserve_size = size;
 981                        /*
 982                         * if reserved region extends past active region
 983                         * then trim size to active region
 984                         */
 985                        if (end_pfn > node_ar.end_pfn)
 986                                reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
 987                                        - physbase;
 988                        /*
 989                         * Only worry about *this* node, others may not
 990                         * yet have valid NODE_DATA().
 991                         */
 992                        if (node_ar.nid == nid) {
 993                                dbg("reserve_bootmem %lx %lx nid=%d\n",
 994                                        physbase, reserve_size, node_ar.nid);
 995                                reserve_bootmem_node(NODE_DATA(node_ar.nid),
 996                                                physbase, reserve_size,
 997                                                BOOTMEM_DEFAULT);
 998                        }
 999                        /*
1000                         * if reserved region is contained in the active region
1001                         * then done.
1002                         */
1003                        if (end_pfn <= node_ar.end_pfn)
1004                                break;
1005
1006                        /*
1007                         * reserved region extends past the active region
1008                         *   get next active region that contains this
1009                         *   reserved region
1010                         */
1011                        start_pfn = node_ar.end_pfn;
1012                        physbase = start_pfn << PAGE_SHIFT;
1013                        size = size - reserve_size;
1014                        get_node_active_region(start_pfn, &node_ar);
1015                }
1016        }
1017}
1018
1019
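/*
 * Bring up bootmem with NUMA awareness: discover the node layout (or fall
 * back to a single node), then for each online node allocate its
 * pglist_data and bootmem bitmap as node-locally as possible, hand the
 * node's memory and reserved regions over to bootmem, and finally set up
 * the node-to-cpumask map and the cpu hotplug notifier.
 */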
1020void __init do_init_bootmem(void)
1021{
1022        int nid;
1023
1024        min_low_pfn = 0;
1025        max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
1026        max_pfn = max_low_pfn;
1027
1028        if (parse_numa_properties())
1029                setup_nonnuma();
1030        else
1031                dump_numa_memory_topology();
1032
1033        for_each_online_node(nid) {
1034                unsigned long start_pfn, end_pfn;
1035                void *bootmem_vaddr;
1036                unsigned long bootmap_pages;
1037
1038                get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
1039
1040                /*
1041                 * Allocate the node structure node local if possible
1042                 *
1043                 * Be careful moving this around, as it relies on all
1044                 * previous nodes' bootmem to be initialized and have
1045                 * all reserved areas marked.
1046                 */
1047                NODE_DATA(nid) = careful_zallocation(nid,
1048                                        sizeof(struct pglist_data),
1049                                        SMP_CACHE_BYTES, end_pfn);
1050
1051                dbg("node %d\n", nid);
1052                dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
1053
1054                NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
1055                NODE_DATA(nid)->node_start_pfn = start_pfn;
1056                NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
1057
1058                if (NODE_DATA(nid)->node_spanned_pages == 0)
1059                        continue;
1060
1061                dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
1062                dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
1063
1064                bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
1065                bootmem_vaddr = careful_zallocation(nid,
1066                                        bootmap_pages << PAGE_SHIFT,
1067                                        PAGE_SIZE, end_pfn);
1068
1069                dbg("bootmap_vaddr = %p\n", bootmem_vaddr);
1070
1071                init_bootmem_node(NODE_DATA(nid),
1072                                  __pa(bootmem_vaddr) >> PAGE_SHIFT,
1073                                  start_pfn, end_pfn);
1074
1075                free_bootmem_with_active_regions(nid, end_pfn);
1076                /*
1077                 * Be very careful about moving this around.  Future
1078                 * calls to careful_zallocation() depend on this getting
1079                 * done correctly.
1080                 */
1081                mark_reserved_regions_for_nid(nid);
1082                sparse_memory_present_with_active_regions(nid);
1083        }
1084
1085        init_bootmem_done = 1;
1086
1087        /*
 1088         * Now that bootmem is initialised we can create the node-to-cpumask
 1089         * lookup tables and set up the cpu callback to populate them.
1090         */
1091        setup_node_to_cpumask_map();
1092
1093        register_cpu_notifier(&ppc64_numa_nb);
1094        cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
1095                          (void *)(unsigned long)boot_cpuid);
1096}
1097
1098void __init paging_init(void)
1099{
1100        unsigned long max_zone_pfns[MAX_NR_ZONES];
1101        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
1102        max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
1103        free_area_init_nodes(max_zone_pfns);
1104}
1105
1106static int __init early_numa(char *p)
1107{
1108        if (!p)
1109                return 0;
1110
1111        if (strstr(p, "off"))
1112                numa_enabled = 0;
1113
1114        if (strstr(p, "debug"))
1115                numa_debug = 1;
1116
1117        p = strstr(p, "fake=");
1118        if (p)
1119                cmdline = p + strlen("fake=");
1120
1121        return 0;
1122}
1123early_param("numa", early_numa);
1124
1125#ifdef CONFIG_MEMORY_HOTPLUG
1126/*
1127 * Find the node associated with a hot added memory section for
1128 * memory represented in the device tree by the property
1129 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
1130 */
1131static int hot_add_drconf_scn_to_nid(struct device_node *memory,
1132                                     unsigned long scn_addr)
1133{
1134        const u32 *dm;
1135        unsigned int drconf_cell_cnt, rc;
1136        unsigned long lmb_size;
1137        struct assoc_arrays aa;
1138        int nid = -1;
1139
1140        drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1141        if (!drconf_cell_cnt)
1142                return -1;
1143
1144        lmb_size = of_get_lmb_size(memory);
1145        if (!lmb_size)
1146                return -1;
1147
1148        rc = of_get_assoc_arrays(memory, &aa);
1149        if (rc)
1150                return -1;
1151
1152        for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
1153                struct of_drconf_cell drmem;
1154
1155                read_drconf_cell(&drmem, &dm);
1156
1157                /* skip this block if it is reserved or not assigned to
1158                 * this partition */
1159                if ((drmem.flags & DRCONF_MEM_RESERVED)
1160                    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
1161                        continue;
1162
1163                if ((scn_addr < drmem.base_addr)
1164                    || (scn_addr >= (drmem.base_addr + lmb_size)))
1165                        continue;
1166
1167                nid = of_drconf_to_nid_single(&drmem, &aa);
1168                break;
1169        }
1170
1171        return nid;
1172}
1173
1174/*
1175 * Find the node associated with a hot added memory section for memory
1176 * represented in the device tree as a node (i.e. memory@XXXX) for
1177 * each memblock.
1178 */
1179int hot_add_node_scn_to_nid(unsigned long scn_addr)
1180{
1181        struct device_node *memory;
1182        int nid = -1;
1183
1184        for_each_node_by_type(memory, "memory") {
1185                unsigned long start, size;
1186                int ranges;
1187                const unsigned int *memcell_buf;
1188                unsigned int len;
1189
1190                memcell_buf = of_get_property(memory, "reg", &len);
1191                if (!memcell_buf || len <= 0)
1192                        continue;
1193
 1194                /* number of (base, size) ranges encoded in the property */
1195                ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
1196
1197                while (ranges--) {
1198                        start = read_n_cells(n_mem_addr_cells, &memcell_buf);
1199                        size = read_n_cells(n_mem_size_cells, &memcell_buf);
1200
1201                        if ((scn_addr < start) || (scn_addr >= (start + size)))
1202                                continue;
1203
1204                        nid = of_node_to_nid_single(memory);
1205                        break;
1206                }
1207
1208                if (nid >= 0)
1209                        break;
1210        }
1211
1212        of_node_put(memory);
1213
1214        return nid;
1215}
1216
1217/*
1218 * Find the node associated with a hot added memory section.  Section
 1219 * corresponds to a SPARSEMEM section, not a MEMBLOCK.  It is assumed that
1220 * sections are fully contained within a single MEMBLOCK.
1221 */
1222int hot_add_scn_to_nid(unsigned long scn_addr)
1223{
1224        struct device_node *memory = NULL;
1225        int nid, found = 0;
1226
1227        if (!numa_enabled || (min_common_depth < 0))
1228                return first_online_node;
1229
1230        memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1231        if (memory) {
1232                nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
1233                of_node_put(memory);
1234        } else {
1235                nid = hot_add_node_scn_to_nid(scn_addr);
1236        }
1237
1238        if (nid < 0 || !node_online(nid))
1239                nid = first_online_node;
1240
1241        if (NODE_DATA(nid)->node_spanned_pages)
1242                return nid;
1243
1244        for_each_online_node(nid) {
1245                if (NODE_DATA(nid)->node_spanned_pages) {
1246                        found = 1;
1247                        break;
1248                }
1249        }
1250
1251        BUG_ON(!found);
1252        return nid;
1253}
1254
1255static u64 hot_add_drconf_memory_max(void)
1256{
1257        struct device_node *memory = NULL;
1258        unsigned int drconf_cell_cnt = 0;
1259        u64 lmb_size = 0;
1260        const u32 *dm = 0;
1261
1262        memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1263        if (memory) {
1264                drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1265                lmb_size = of_get_lmb_size(memory);
1266                of_node_put(memory);
1267        }
1268        return lmb_size * drconf_cell_cnt;
1269}
1270
1271/*
1272 * memory_hotplug_max - return max address of memory that may be added
1273 *
1274 * This is currently only used on systems that support drconfig memory
1275 * hotplug.
1276 */
1277u64 memory_hotplug_max(void)
1278{
1279        return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
1280}
1281#endif /* CONFIG_MEMORY_HOTPLUG */
1282
1283/* Virtual Processor Home Node (VPHN) support */
1284#ifdef CONFIG_PPC_SPLPAR
1285static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
1286static cpumask_t cpu_associativity_changes_mask;
1287static int vphn_enabled;
1288static void set_topology_timer(void);
1289
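/*
 * Overview of the VPHN polling flow below: a deferrable timer fires about
 * once a minute and compares the hypervisor's associativity change
 * counters against our cached copies.  If any cpu's counters moved, a
 * workqueue item calls rebuild_sched_domains(), which in turn lets the
 * scheduler call arch_update_cpu_topology() to remap the affected cpus to
 * their new nodes and refresh sysfs.  (Polling is currently disabled in
 * start_topology_update() below.)
 */
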
1290/*
 1291 * Snapshot the current values of the associativity change counters that
 1292 * the hypervisor maintains for each cpu.
1293 */
1294static void setup_cpu_associativity_change_counters(void)
1295{
1296        int cpu;
1297
1298        /* The VPHN feature supports a maximum of 8 reference points */
1299        BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
1300
1301        for_each_possible_cpu(cpu) {
1302                int i;
1303                u8 *counts = vphn_cpu_change_counts[cpu];
1304                volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1305
1306                for (i = 0; i < distance_ref_points_depth; i++)
1307                        counts[i] = hypervisor_counts[i];
1308        }
1309}
1310
1311/*
1312 * The hypervisor maintains a set of 8 associativity change counters in
1313 * the VPA of each cpu that correspond to the associativity levels in the
1314 * ibm,associativity-reference-points property. When an associativity
1315 * level changes, the corresponding counter is incremented.
1316 *
1317 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
1318 * node associativity levels have changed.
1319 *
1320 * Returns the number of cpus with unhandled associativity changes.
1321 */
1322static int update_cpu_associativity_changes_mask(void)
1323{
1324        int cpu, nr_cpus = 0;
1325        cpumask_t *changes = &cpu_associativity_changes_mask;
1326
1327        cpumask_clear(changes);
1328
1329        for_each_possible_cpu(cpu) {
1330                int i, changed = 0;
1331                u8 *counts = vphn_cpu_change_counts[cpu];
1332                volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1333
1334                for (i = 0; i < distance_ref_points_depth; i++) {
1335                        if (hypervisor_counts[i] != counts[i]) {
1336                                counts[i] = hypervisor_counts[i];
1337                                changed = 1;
1338                        }
1339                }
1340                if (changed) {
1341                        cpumask_set_cpu(cpu, changes);
1342                        nr_cpus++;
1343                }
1344        }
1345
1346        return nr_cpus;
1347}
1348
1349/*
1350 * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
1351 * the complete property we have to add the length in the first cell.
1352 */
1353#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)
1354
1355/*
 1356 * Convert the associativity domain numbers returned from the hypervisor
 1357 * into the order in which they appear in the ibm,associativity property.
1358 */
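/*
 * Illustration only: a 16-bit field of 0x8002 has the MSB set and unpacks
 * to the single domain number 2; a field without the MSB set is combined
 * with the following 16-bit field into one 32-bit number; 0xffff marks an
 * unused field, which is stored as-is.
 */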
1359static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
1360{
1361        int i, nr_assoc_doms = 0;
1362        const u16 *field = (const u16*) packed;
1363
1364#define VPHN_FIELD_UNUSED       (0xffff)
1365#define VPHN_FIELD_MSB          (0x8000)
1366#define VPHN_FIELD_MASK         (~VPHN_FIELD_MSB)
1367
1368        for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
1369                if (*field == VPHN_FIELD_UNUSED) {
1370                        /* All significant fields processed, and remaining
1371                         * fields contain the reserved value of all 1's.
1372                         * Just store them.
1373                         */
1374                        unpacked[i] = *((u32*)field);
1375                        field += 2;
1376                } else if (*field & VPHN_FIELD_MSB) {
1377                        /* Data is in the lower 15 bits of this field */
1378                        unpacked[i] = *field & VPHN_FIELD_MASK;
1379                        field++;
1380                        nr_assoc_doms++;
1381                } else {
1382                        /* Data is in the lower 15 bits of this field
1383                         * concatenated with the next 16 bit field
1384                         */
1385                        unpacked[i] = *((u32*)field);
1386                        field += 2;
1387                        nr_assoc_doms++;
1388                }
1389        }
1390
1391        /* The first cell contains the length of the property */
1392        unpacked[0] = nr_assoc_doms;
1393
1394        return nr_assoc_doms;
1395}
1396
1397/*
1398 * Retrieve the new associativity information for a virtual processor's
1399 * home node.
1400 */
1401static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
1402{
1403        long rc;
1404        long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
1405        u64 flags = 1;
1406        int hwcpu = get_hard_smp_processor_id(cpu);
1407
1408        rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
1409        vphn_unpack_associativity(retbuf, associativity);
1410
1411        return rc;
1412}
1413
1414static long vphn_get_associativity(unsigned long cpu,
1415                                        unsigned int *associativity)
1416{
1417        long rc;
1418
1419        rc = hcall_vphn(cpu, associativity);
1420
1421        switch (rc) {
1422        case H_FUNCTION:
1423                printk(KERN_INFO
1424                        "VPHN is not supported. Disabling polling...\n");
1425                stop_topology_update();
1426                break;
1427        case H_HARDWARE:
1428                printk(KERN_ERR
1429                        "hcall_vphn() experienced a hardware fault "
1430                        "preventing VPHN. Disabling polling...\n");
1431                stop_topology_update();
1432        }
1433
1434        return rc;
1435}
1436
1437/*
1438 * Update the node maps and sysfs entries for each cpu whose home node
1439 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
1440 */
1441int arch_update_cpu_topology(void)
1442{
1443        int cpu, nid, old_nid, changed = 0;
1444        unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
1445        struct device *dev;
1446
 1447        for_each_cpu(cpu, &cpu_associativity_changes_mask) {
1448                vphn_get_associativity(cpu, associativity);
1449                nid = associativity_to_nid(associativity);
1450
1451                if (nid < 0 || !node_online(nid))
1452                        nid = first_online_node;
1453
1454                old_nid = numa_cpu_lookup_table[cpu];
1455
1456                /* Disable hotplug while we update the cpu
1457                 * masks and sysfs.
1458                 */
1459                get_online_cpus();
1460                unregister_cpu_under_node(cpu, old_nid);
1461                unmap_cpu_from_node(cpu);
1462                map_cpu_to_node(cpu, nid);
1463                register_cpu_under_node(cpu, nid);
1464                put_online_cpus();
1465
1466                dev = get_cpu_device(cpu);
1467                if (dev)
1468                        kobject_uevent(&dev->kobj, KOBJ_CHANGE);
1469                changed = 1;
1470        }
1471
1472        return changed;
1473}
1474
1475static void topology_work_fn(struct work_struct *work)
1476{
1477        rebuild_sched_domains();
1478}
1479static DECLARE_WORK(topology_work, topology_work_fn);
1480
1481void topology_schedule_update(void)
1482{
1483        schedule_work(&topology_work);
1484}
1485
1486static void topology_timer_fn(unsigned long ignored)
1487{
1488        if (!vphn_enabled)
1489                return;
1490        if (update_cpu_associativity_changes_mask() > 0)
1491                topology_schedule_update();
1492        set_topology_timer();
1493}
1494static struct timer_list topology_timer =
1495        TIMER_INITIALIZER(topology_timer_fn, 0, 0);
1496
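/* (Re)arm the deferrable polling timer to fire roughly once a minute. */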
1497static void set_topology_timer(void)
1498{
1499        topology_timer.data = 0;
1500        topology_timer.expires = jiffies + 60 * HZ;
1501        add_timer(&topology_timer);
1502}
1503
1504/*
1505 * Start polling for VPHN associativity changes.
1506 */
1507int start_topology_update(void)
1508{
1509        int rc = 0;
1510
1511        /* Disabled until races with load balancing are fixed */
1512        if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
1513            get_lppaca()->shared_proc) {
1514                vphn_enabled = 1;
1515                setup_cpu_associativity_change_counters();
1516                init_timer_deferrable(&topology_timer);
1517                set_topology_timer();
1518                rc = 1;
1519        }
1520
1521        return rc;
1522}
1523__initcall(start_topology_update);
1524
1525/*
1526 * Disable polling for VPHN associativity changes.
1527 */
1528int stop_topology_update(void)
1529{
1530        vphn_enabled = 0;
1531        return del_timer_sync(&topology_timer);
1532}
1533#endif /* CONFIG_PPC_SPLPAR */
1534