/*
 * arch/x86/mm/numa_emulation.c - NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <asm/dma.h>

#include "numa_internal.h"

/* mapping from emulated nid to the underlying physical nid */
static int emu_nid_to_phys[MAX_NUMNODES];
/* the "numa=fake=" argument, stashed early for numa_emulation() */
static char *emu_cmdline __initdata;

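/*
 * Accepted "numa=fake=" forms (values below are illustrative):
 *
 *   numa=fake=8         split system RAM into 8 interleaved fake nodes
 *   numa=fake=128M      carve fake nodes of 128M each (an 'M' or 'G'
 *                       suffix selects fixed-size mode)
 *   numa=fake=2:10,20,...  an optional ':' tail supplies a comma-separated
 *                       list of inter-node distances, consumed row by row
 *                       when numa_emulation() rebuilds the distance table
 */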
void __init numa_emu_cmdline(char *str)
{
        emu_cmdline = str;
}

/* Find the index of the memblk belonging to @nid in @mi, -ENOENT if none. */
static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
        int i;

        for (i = 0; i < mi->nr_blks; i++)
                if (mi->blk[i].nid == nid)
                        return i;
        return -ENOENT;
}

/*
 * Size of the memory hole (absent pages) in [@start, @end), rounding
 * inward to whole pages so that partial pages are not counted.
 */
static u64 __init mem_hole_size(u64 start, u64 end)
{
        unsigned long start_pfn = PFN_UP(start);
        unsigned long end_pfn = PFN_DOWN(end);

        if (start_pfn < end_pfn)
                return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
        return 0;
}

/*
 * Sets up a fake node covering @size bytes carved from the start of
 * physical memblk @phys_blk in @pi.  The new memblk is appended to @ei
 * and the consumed range is trimmed from (or removed with) the physical
 * memblk.  The return value is -errno if something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
                                   struct numa_meminfo *pi,
                                   int nid, int phys_blk, u64 size)
{
        struct numa_memblk *eb = &ei->blk[ei->nr_blks];
        struct numa_memblk *pb = &pi->blk[phys_blk];

        if (ei->nr_blks >= NR_NODE_MEMBLKS) {
                pr_err("NUMA: Too many emulated memblks, failing emulation\n");
                return -EINVAL;
        }

        ei->nr_blks++;
        eb->start = pb->start;
        eb->end = pb->start + size;
        eb->nid = nid;

        /* remember the physical node this emulated node is carved from */
        if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
                emu_nid_to_phys[nid] = pb->nid;

        /* trim the consumed range off the physical memblk */
        pb->start += size;
        if (pb->start >= pb->end) {
                WARN_ON_ONCE(pb->start > pb->end);
                numa_remove_memblk_from(phys_blk, pi);
        }

        printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
               nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
        return 0;
}

/*
 * Sets up @nr_nodes fake nodes interleaved over the physical nodes
 * covering @addr to @max_addr.  Returns 0 on success, a negative value
 * on failure.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
                                         struct numa_meminfo *pi,
                                         u64 addr, u64 max_addr, int nr_nodes)
{
        nodemask_t physnode_mask = NODE_MASK_NONE;
        u64 size;
        int big;
        int nid = 0;
        int i, ret;

        if (nr_nodes <= 0)
                return -1;
        if (nr_nodes > MAX_NUMNODES) {
                pr_info("numa=fake=%d too large, reducing to %d\n",
                        nr_nodes, MAX_NUMNODES);
                nr_nodes = MAX_NUMNODES;
        }

        /*
         * Calculate target node size.  x86_32 freaks on __udivdi3() so do
         * the division in ulong number of pages and convert back.
         */
        size = max_addr - addr - mem_hole_size(addr, max_addr);
        size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

        /*
         * Calculate the number of big nodes that can be allocated as a result
         * of consolidating the remainder.
         */
        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
                FAKE_NODE_MIN_SIZE;

        size &= FAKE_NODE_MIN_HASH_MASK;
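        /*
         * Worked example with illustrative numbers, assuming the usual
         * 32M FAKE_NODE_MIN_SIZE: 1000M of usable memory split three
         * ways gives size = 333M.  The 13M per-node remainder sums to
         * one extra 32M unit, so big = 1: the first fake node grows by
         * 32M in the loop below while size is truncated to 320M.  Any
         * remaining slack is absorbed when a node is extended to its
         * physical limit.
         */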
        if (!size) {
                pr_err("Not enough memory for each node.  NUMA emulation disabled.\n");
                return -1;
        }

        for (i = 0; i < pi->nr_blks; i++)
                node_set(pi->blk[i].nid, physnode_mask);

        /*
         * Continue to fill physical nodes with fake nodes until there is no
         * memory left on any of them.
         */
        while (nodes_weight(physnode_mask)) {
                for_each_node_mask(i, physnode_mask) {
                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
                        u64 start, limit, end;
                        int phys_blk;

                        phys_blk = emu_find_memblk_by_nid(i, pi);
                        if (phys_blk < 0) {
                                node_clear(i, physnode_mask);
                                continue;
                        }
                        start = pi->blk[phys_blk].start;
                        limit = pi->blk[phys_blk].end;
                        end = start + size;

                        if (nid < big)
                                end += FAKE_NODE_MIN_SIZE;

                        /*
                         * Continue to add memory to this fake node if its
                         * non-reserved memory is less than the per-node size.
                         */
                        while (end - start - mem_hole_size(start, end) < size) {
                                end += FAKE_NODE_MIN_SIZE;
                                if (end > limit) {
                                        end = limit;
                                        break;
                                }
                        }

                        /*
                         * If there won't be at least FAKE_NODE_MIN_SIZE of
                         * non-reserved memory in ZONE_DMA32 for the next node,
                         * this one must extend to the boundary.
                         */
                        if (end < dma32_end && dma32_end - end -
                            mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                                end = dma32_end;

                        /*
                         * If there won't be enough non-reserved memory for the
                         * next node, this one must extend to the end of the
                         * physical node.
                         */
                        if (limit - end - mem_hole_size(end, limit) < size)
                                end = limit;

                        ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
                                               phys_blk,
                                               min(end, limit) - start);
                        if (ret < 0)
                                return ret;
                }
        }
        return 0;
}

/*
 * Returns the end address of a node so that there is at least @size amount of
 * non-reserved memory or @max_addr is reached.
 */
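/*
 * Worked example with illustrative numbers, assuming the usual 32M
 * FAKE_NODE_MIN_SIZE: for @start = 1G, @size = 320M and a 64M hole at
 * 1.2G, the loop below grows end in 32M steps until the range spans
 * 320M of present memory, i.e. it returns 1G + 384M.
 */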
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
        u64 end = start + size;

        while (end - start - mem_hole_size(start, end) < size) {
                end += FAKE_NODE_MIN_SIZE;
                if (end > max_addr) {
                        end = max_addr;
                        break;
                }
        }
        return end;
}

/*
 * Sets up fake nodes of @size interleaved over the physical nodes
 * covering @addr to @max_addr.  Returns 0 on success, a negative value
 * on failure.
 */
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
                                              struct numa_meminfo *pi,
                                              u64 addr, u64 max_addr, u64 size)
{
        nodemask_t physnode_mask = NODE_MASK_NONE;
        u64 min_size;
        int nid = 0;
        int i, ret;

        if (!size)
                return -1;
        /*
         * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
         * increased accordingly if the requested size is too small.  This
         * creates a uniform distribution of node sizes across the entire
         * machine (but not necessarily over physical nodes).
         */
        min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
        min_size = max(min_size, FAKE_NODE_MIN_SIZE);
        /* round min_size up to the next FAKE_NODE_MIN_SIZE boundary */
        if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
                min_size = (min_size + FAKE_NODE_MIN_SIZE) &
                                                FAKE_NODE_MIN_HASH_MASK;
        if (size < min_size) {
                pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
                        size >> 20, min_size >> 20);
                size = min_size;
        }
        size &= FAKE_NODE_MIN_HASH_MASK;
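        /*
         * E.g. "numa=fake=100M" on a machine where min_size works out
         * to 32M ends up with 96M fake nodes: 100M clears the min_size
         * check and the mask above truncates it to a 32M multiple
         * (illustrative values, assuming the usual 32M
         * FAKE_NODE_MIN_SIZE).
         */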

        for (i = 0; i < pi->nr_blks; i++)
                node_set(pi->blk[i].nid, physnode_mask);

        /*
         * Fill physical nodes with fake nodes of size until there is no memory
         * left on any of them.
         */
        while (nodes_weight(physnode_mask)) {
                for_each_node_mask(i, physnode_mask) {
                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
                        u64 start, limit, end;
                        int phys_blk;

                        phys_blk = emu_find_memblk_by_nid(i, pi);
                        if (phys_blk < 0) {
                                node_clear(i, physnode_mask);
                                continue;
                        }
                        start = pi->blk[phys_blk].start;
                        limit = pi->blk[phys_blk].end;

                        end = find_end_of_node(start, limit, size);
                        /*
                         * If there won't be at least FAKE_NODE_MIN_SIZE of
                         * non-reserved memory in ZONE_DMA32 for the next node,
                         * this one must extend to the boundary.
                         */
                        if (end < dma32_end && dma32_end - end -
                            mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
                                end = dma32_end;

                        /*
                         * If there won't be enough non-reserved memory for the
                         * next node, this one must extend to the end of the
                         * physical node.
                         */
                        if (limit - end - mem_hole_size(end, limit) < size)
                                end = limit;

                        ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
                                               phys_blk,
                                               min(end, limit) - start);
                        if (ret < 0)
                                return ret;
                }
        }
        return 0;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are derived from the physical distance table
 *   according to how the emulated nodes map to physical nodes, so they
 *   match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
        static struct numa_meminfo ei __initdata;
        static struct numa_meminfo pi __initdata;
        const u64 max_addr = PFN_PHYS(max_pfn);
        u8 *phys_dist = NULL;
        size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
        int max_emu_nid, dfl_phys_nid;
        int i, j, ret;

        if (!emu_cmdline)
                goto no_emu;

        memset(&ei, 0, sizeof(ei));
        pi = *numa_meminfo;

        for (i = 0; i < MAX_NUMNODES; i++)
                emu_nid_to_phys[i] = NUMA_NO_NODE;

        /*
         * If the numa=fake command-line contains a 'M' or 'G', it represents
         * the fixed node size.  Otherwise, if it is just a single number N,
         * split the system RAM into N fake nodes.
         */
        if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
                u64 size;

                size = memparse(emu_cmdline, &emu_cmdline);
                ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
        } else {
                unsigned long n;

                n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
                ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
        }
        if (*emu_cmdline == ':')
                emu_cmdline++;

        if (ret < 0)
                goto no_emu;

        if (numa_cleanup_meminfo(&ei) < 0) {
                pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
                goto no_emu;
        }

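        /*
         * The physical table is preserved as a flat numa_dist_cnt x
         * numa_dist_cnt byte array indexed [i * numa_dist_cnt + j].
         * For two physical nodes at default distances the copy would
         * read {10, 20, 20, 10}: LOCAL_DISTANCE on the diagonal,
         * REMOTE_DISTANCE elsewhere (illustrative values; the real
         * entries come from node_distance()).
         */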
        /* copy the physical distance table */
        if (numa_dist_cnt) {
                u64 phys;

                phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
                                              phys_size, PAGE_SIZE);
                if (!phys) {
                        pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
                        goto no_emu;
                }
                memblock_reserve(phys, phys_size);
                phys_dist = __va(phys);

                for (i = 0; i < numa_dist_cnt; i++)
                        for (j = 0; j < numa_dist_cnt; j++)
                                phys_dist[i * numa_dist_cnt + j] =
                                        node_distance(i, j);
        }

        /*
         * Determine the max emulated nid and the default phys nid to use
         * for unmapped nodes.
         */
        max_emu_nid = 0;
        dfl_phys_nid = NUMA_NO_NODE;
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
                if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
                        max_emu_nid = i;
                        if (dfl_phys_nid == NUMA_NO_NODE)
                                dfl_phys_nid = emu_nid_to_phys[i];
                }
        }
        if (dfl_phys_nid == NUMA_NO_NODE) {
                pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
                goto no_emu;
        }

        /* commit */
        *numa_meminfo = ei;

        /*
         * Transform __apicid_to_node table to use emulated nids by
         * reverse-mapping phys_nid.  The maps should always exist but fall
         * back to zero just in case.
         */
        for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
                if (__apicid_to_node[i] == NUMA_NO_NODE)
                        continue;
                for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
                        if (__apicid_to_node[i] == emu_nid_to_phys[j])
                                break;
                __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
        }

        /* make sure all emulated nodes are mapped to a physical node */
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
                if (emu_nid_to_phys[i] == NUMA_NO_NODE)
                        emu_nid_to_phys[i] = dfl_phys_nid;

        /* transform distance table */
        numa_reset_distance();
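        /*
         * For each (i, j) pair, prefer a user-supplied distance from
         * the ':' tail of "numa=fake=": get_option() returns 2 when it
         * parsed an integer followed by a ',', so each supplied value
         * is expected to be comma-terminated.  Otherwise fall back to
         * the copied physical table, or to LOCAL/REMOTE_DISTANCE when
         * a physical nid lies outside it.
         */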
        for (i = 0; i < max_emu_nid + 1; i++) {
                for (j = 0; j < max_emu_nid + 1; j++) {
                        int physi = emu_nid_to_phys[i];
                        int physj = emu_nid_to_phys[j];
                        int dist;

                        if (get_option(&emu_cmdline, &dist) == 2)
                                ;
                        else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
                                dist = physi == physj ?
                                        LOCAL_DISTANCE : REMOTE_DISTANCE;
                        else
                                dist = phys_dist[physi * numa_dist_cnt + physj];

                        numa_set_distance(i, j, dist);
                }
        }

        /* free the copied physical distance table */
        if (phys_dist)
                memblock_free(__pa(phys_dist), phys_size);
        return;

no_emu:
        /* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
                emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void numa_add_cpu(int cpu)
{
        int physnid, nid;

        nid = early_cpu_to_node(cpu);
        BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

        physnid = emu_nid_to_phys[nid];

        /*
         * Map the cpu to each emulated node that is allocated on the physical
         * node of the cpu's apic id.
         */
        for_each_online_node(nid)
                if (emu_nid_to_phys[nid] == physnid)
                        cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_remove_cpu(int cpu)
{
        int i;

        for_each_online_node(i)
                cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else   /* !CONFIG_DEBUG_PER_CPU_MAPS */
static void numa_set_cpumask(int cpu, bool enable)
{
        int nid, physnid;

        nid = early_cpu_to_node(cpu);
        if (nid == NUMA_NO_NODE) {
                /* early_cpu_to_node() already emits a warning and trace */
                return;
        }

        physnid = emu_nid_to_phys[nid];

        for_each_online_node(nid) {
                if (emu_nid_to_phys[nid] != physnid)
                        continue;

                debug_cpumask_set_cpu(cpu, nid, enable);
        }
}

void numa_add_cpu(int cpu)
{
        numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(int cpu)
{
        numa_set_cpumask(cpu, false);
}
#endif  /* !CONFIG_DEBUG_PER_CPU_MAPS */