linux/arch/x86/mm/numa_emulation.c
<<
>>
Prefs
   1/*
   2 * NUMA emulation
   3 */
   4#include <linux/kernel.h>
   5#include <linux/errno.h>
   6#include <linux/topology.h>
   7#include <linux/memblock.h>
   8#include <linux/bootmem.h>
   9#include <asm/dma.h>
  10
  11#include "numa_internal.h"
  12
  13static int emu_nid_to_phys[MAX_NUMNODES];
  14static char *emu_cmdline __initdata;
  15
  16void __init numa_emu_cmdline(char *str)
  17{
  18        emu_cmdline = str;
  19}
  20
  21static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
  22{
  23        int i;
  24
  25        for (i = 0; i < mi->nr_blks; i++)
  26                if (mi->blk[i].nid == nid)
  27                        return i;
  28        return -ENOENT;
  29}
  30
  31static u64 __init mem_hole_size(u64 start, u64 end)
  32{
  33        unsigned long start_pfn = PFN_UP(start);
  34        unsigned long end_pfn = PFN_DOWN(end);
  35
  36        if (start_pfn < end_pfn)
  37                return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
  38        return 0;
  39}
  40
  41/*
  42 * Sets up nid to range from @start to @end.  The return value is -errno if
  43 * something went wrong, 0 otherwise.
  44 */
  45static int __init emu_setup_memblk(struct numa_meminfo *ei,
  46                                   struct numa_meminfo *pi,
  47                                   int nid, int phys_blk, u64 size)
  48{
  49        struct numa_memblk *eb = &ei->blk[ei->nr_blks];
  50        struct numa_memblk *pb = &pi->blk[phys_blk];
  51
  52        if (ei->nr_blks >= NR_NODE_MEMBLKS) {
  53                pr_err("NUMA: Too many emulated memblks, failing emulation\n");
  54                return -EINVAL;
  55        }
  56
  57        ei->nr_blks++;
  58        eb->start = pb->start;
  59        eb->end = pb->start + size;
  60        eb->nid = nid;
  61
  62        if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
  63                emu_nid_to_phys[nid] = pb->nid;
  64
  65        pb->start += size;
  66        if (pb->start >= pb->end) {
  67                WARN_ON_ONCE(pb->start > pb->end);
  68                numa_remove_memblk_from(phys_blk, pi);
  69        }
  70
  71        printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
  72               nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
  73        return 0;
  74}
  75
  76/*
  77 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
  78 * to max_addr.
  79 *
  80 * Returns zero on success or negative on error.
  81 */
  82static int __init split_nodes_interleave(struct numa_meminfo *ei,
  83                                         struct numa_meminfo *pi,
  84                                         u64 addr, u64 max_addr, int nr_nodes)
  85{
  86        nodemask_t physnode_mask = numa_nodes_parsed;
  87        u64 size;
  88        int big;
  89        int nid = 0;
  90        int i, ret;
  91
  92        if (nr_nodes <= 0)
  93                return -1;
  94        if (nr_nodes > MAX_NUMNODES) {
  95                pr_info("numa=fake=%d too large, reducing to %d\n",
  96                        nr_nodes, MAX_NUMNODES);
  97                nr_nodes = MAX_NUMNODES;
  98        }
  99
 100        /*
 101         * Calculate target node size.  x86_32 freaks on __udivdi3() so do
 102         * the division in ulong number of pages and convert back.
 103         */
 104        size = max_addr - addr - mem_hole_size(addr, max_addr);
 105        size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
 106
 107        /*
 108         * Calculate the number of big nodes that can be allocated as a result
 109         * of consolidating the remainder.
 110         */
 111        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
 112                FAKE_NODE_MIN_SIZE;
 113
 114        size &= FAKE_NODE_MIN_HASH_MASK;
 115        if (!size) {
 116                pr_err("Not enough memory for each node.  "
 117                        "NUMA emulation disabled.\n");
 118                return -1;
 119        }
 120
 121        /*
 122         * Continue to fill physical nodes with fake nodes until there is no
 123         * memory left on any of them.
 124         */
 125        while (nodes_weight(physnode_mask)) {
 126                for_each_node_mask(i, physnode_mask) {
 127                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
 128                        u64 start, limit, end;
 129                        int phys_blk;
 130
 131                        phys_blk = emu_find_memblk_by_nid(i, pi);
 132                        if (phys_blk < 0) {
 133                                node_clear(i, physnode_mask);
 134                                continue;
 135                        }
 136                        start = pi->blk[phys_blk].start;
 137                        limit = pi->blk[phys_blk].end;
 138                        end = start + size;
 139
 140                        if (nid < big)
 141                                end += FAKE_NODE_MIN_SIZE;
 142
 143                        /*
 144                         * Continue to add memory to this fake node if its
 145                         * non-reserved memory is less than the per-node size.
 146                         */
 147                        while (end - start - mem_hole_size(start, end) < size) {
 148                                end += FAKE_NODE_MIN_SIZE;
 149                                if (end > limit) {
 150                                        end = limit;
 151                                        break;
 152                                }
 153                        }
 154
 155                        /*
 156                         * If there won't be at least FAKE_NODE_MIN_SIZE of
 157                         * non-reserved memory in ZONE_DMA32 for the next node,
 158                         * this one must extend to the boundary.
 159                         */
 160                        if (end < dma32_end && dma32_end - end -
 161                            mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
 162                                end = dma32_end;
 163
 164                        /*
 165                         * If there won't be enough non-reserved memory for the
 166                         * next node, this one must extend to the end of the
 167                         * physical node.
 168                         */
 169                        if (limit - end - mem_hole_size(end, limit) < size)
 170                                end = limit;
 171
 172                        ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
 173                                               phys_blk,
 174                                               min(end, limit) - start);
 175                        if (ret < 0)
 176                                return ret;
 177                }
 178        }
 179        return 0;
 180}
 181
 182/*
 183 * Returns the end address of a node so that there is at least `size' amount of
 184 * non-reserved memory or `max_addr' is reached.
 185 */
 186static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 187{
 188        u64 end = start + size;
 189
 190        while (end - start - mem_hole_size(start, end) < size) {
 191                end += FAKE_NODE_MIN_SIZE;
 192                if (end > max_addr) {
 193                        end = max_addr;
 194                        break;
 195                }
 196        }
 197        return end;
 198}
 199
 200static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
 201{
 202        unsigned long max_pfn = PHYS_PFN(max_addr);
 203        unsigned long base_pfn = PHYS_PFN(base);
 204        unsigned long hole_pfns = PHYS_PFN(hole);
 205
 206        return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
 207}
 208
 209/*
 210 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 211 * `addr' to `max_addr'.
 212 *
 213 * Returns zero on success or negative on error.
 214 */
 215static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
 216                                              struct numa_meminfo *pi,
 217                                              u64 addr, u64 max_addr, u64 size,
 218                                              int nr_nodes, struct numa_memblk *pblk,
 219                                              int nid)
 220{
 221        nodemask_t physnode_mask = numa_nodes_parsed;
 222        int i, ret, uniform = 0;
 223        u64 min_size;
 224
 225        if ((!size && !nr_nodes) || (nr_nodes && !pblk))
 226                return -1;
 227
 228        /*
 229         * In the 'uniform' case split the passed in physical node by
 230         * nr_nodes, in the non-uniform case, ignore the passed in
 231         * physical block and try to create nodes of at least size
 232         * @size.
 233         *
 234         * In the uniform case, split the nodes strictly by physical
 235         * capacity, i.e. ignore holes. In the non-uniform case account
 236         * for holes and treat @size as a minimum floor.
 237         */
 238        if (!nr_nodes)
 239                nr_nodes = MAX_NUMNODES;
 240        else {
 241                nodes_clear(physnode_mask);
 242                node_set(pblk->nid, physnode_mask);
 243                uniform = 1;
 244        }
 245
 246        if (uniform) {
 247                min_size = uniform_size(max_addr, addr, 0, nr_nodes);
 248                size = min_size;
 249        } else {
 250                /*
 251                 * The limit on emulated nodes is MAX_NUMNODES, so the
 252                 * size per node is increased accordingly if the
 253                 * requested size is too small.  This creates a uniform
 254                 * distribution of node sizes across the entire machine
 255                 * (but not necessarily over physical nodes).
 256                 */
 257                min_size = uniform_size(max_addr, addr,
 258                                mem_hole_size(addr, max_addr), nr_nodes);
 259        }
 260        min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
 261        if (size < min_size) {
 262                pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
 263                        size >> 20, min_size >> 20);
 264                size = min_size;
 265        }
 266        size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
 267
 268        /*
 269         * Fill physical nodes with fake nodes of size until there is no memory
 270         * left on any of them.
 271         */
 272        while (nodes_weight(physnode_mask)) {
 273                for_each_node_mask(i, physnode_mask) {
 274                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
 275                        u64 start, limit, end;
 276                        int phys_blk;
 277
 278                        phys_blk = emu_find_memblk_by_nid(i, pi);
 279                        if (phys_blk < 0) {
 280                                node_clear(i, physnode_mask);
 281                                continue;
 282                        }
 283
 284                        start = pi->blk[phys_blk].start;
 285                        limit = pi->blk[phys_blk].end;
 286
 287                        if (uniform)
 288                                end = start + size;
 289                        else
 290                                end = find_end_of_node(start, limit, size);
 291                        /*
 292                         * If there won't be at least FAKE_NODE_MIN_SIZE of
 293                         * non-reserved memory in ZONE_DMA32 for the next node,
 294                         * this one must extend to the boundary.
 295                         */
 296                        if (end < dma32_end && dma32_end - end -
 297                            mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
 298                                end = dma32_end;
 299
 300                        /*
 301                         * If there won't be enough non-reserved memory for the
 302                         * next node, this one must extend to the end of the
 303                         * physical node.
 304                         */
 305                        if ((limit - end - mem_hole_size(end, limit) < size)
 306                                        && !uniform)
 307                                end = limit;
 308
 309                        ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
 310                                               phys_blk,
 311                                               min(end, limit) - start);
 312                        if (ret < 0)
 313                                return ret;
 314                }
 315        }
 316        return nid;
 317}
 318
 319static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
 320                                              struct numa_meminfo *pi,
 321                                              u64 addr, u64 max_addr, u64 size)
 322{
 323        return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
 324                        0, NULL, NUMA_NO_NODE);
 325}
 326
 327int __init setup_emu2phys_nid(int *dfl_phys_nid)
 328{
 329        int i, max_emu_nid = 0;
 330
 331        *dfl_phys_nid = NUMA_NO_NODE;
 332        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
 333                if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
 334                        max_emu_nid = i;
 335                        if (*dfl_phys_nid == NUMA_NO_NODE)
 336                                *dfl_phys_nid = emu_nid_to_phys[i];
 337                }
 338        }
 339
 340        return max_emu_nid;
 341}
 342
 343/**
 344 * numa_emulation - Emulate NUMA nodes
 345 * @numa_meminfo: NUMA configuration to massage
 346 * @numa_dist_cnt: The size of the physical NUMA distance table
 347 *
 348 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 349 * @numa_meminfo contains the physical memory configuration and is modified
 350 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 351 * used to determine the size of the physical distance table.
 352 *
 353 * On success, the following modifications are made.
 354 *
 355 * - @numa_meminfo is updated to reflect the emulated nodes.
 356 *
 357 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 358 *   emulated nodes.
 359 *
 360 * - NUMA distance table is rebuilt to represent distances between emulated
 361 *   nodes.  The distances are determined considering how emulated nodes
 362 *   are mapped to physical nodes and match the actual distances.
 363 *
 364 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 365 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 366 *
 367 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 368 * identity mapping and no other modification is made.
 369 */
 370void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
 371{
 372        static struct numa_meminfo ei __initdata;
 373        static struct numa_meminfo pi __initdata;
 374        const u64 max_addr = PFN_PHYS(max_pfn);
 375        u8 *phys_dist = NULL;
 376        size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
 377        int max_emu_nid, dfl_phys_nid;
 378        int i, j, ret;
 379
 380        if (!emu_cmdline)
 381                goto no_emu;
 382
 383        memset(&ei, 0, sizeof(ei));
 384        pi = *numa_meminfo;
 385
 386        for (i = 0; i < MAX_NUMNODES; i++)
 387                emu_nid_to_phys[i] = NUMA_NO_NODE;
 388
 389        /*
 390         * If the numa=fake command-line contains a 'M' or 'G', it represents
 391         * the fixed node size.  Otherwise, if it is just a single number N,
 392         * split the system RAM into N fake nodes.
 393         */
 394        if (strchr(emu_cmdline, 'U')) {
 395                nodemask_t physnode_mask = numa_nodes_parsed;
 396                unsigned long n;
 397                int nid = 0;
 398
 399                n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
 400                ret = -1;
 401                for_each_node_mask(i, physnode_mask) {
 402                        /*
 403                         * The reason we pass in blk[0] is due to
 404                         * numa_remove_memblk_from() called by
 405                         * emu_setup_memblk() will delete entry 0
 406                         * and then move everything else up in the pi.blk
 407                         * array. Therefore we should always be looking
 408                         * at blk[0].
 409                         */
 410                        ret = split_nodes_size_interleave_uniform(&ei, &pi,
 411                                        pi.blk[0].start, pi.blk[0].end, 0,
 412                                        n, &pi.blk[0], nid);
 413                        if (ret < 0)
 414                                break;
 415                        if (ret < n) {
 416                                pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
 417                                                __func__, i, ret, n);
 418                                ret = -1;
 419                                break;
 420                        }
 421                        nid = ret;
 422                }
 423        } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
 424                u64 size;
 425
 426                size = memparse(emu_cmdline, &emu_cmdline);
 427                ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
 428        } else {
 429                unsigned long n;
 430
 431                n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
 432                ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
 433        }
 434        if (*emu_cmdline == ':')
 435                emu_cmdline++;
 436
 437        if (ret < 0)
 438                goto no_emu;
 439
 440        if (numa_cleanup_meminfo(&ei) < 0) {
 441                pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
 442                goto no_emu;
 443        }
 444
 445        /* copy the physical distance table */
 446        if (numa_dist_cnt) {
 447                u64 phys;
 448
 449                phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
 450                                              phys_size, PAGE_SIZE);
 451                if (!phys) {
 452                        pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
 453                        goto no_emu;
 454                }
 455                memblock_reserve(phys, phys_size);
 456                phys_dist = __va(phys);
 457
 458                for (i = 0; i < numa_dist_cnt; i++)
 459                        for (j = 0; j < numa_dist_cnt; j++)
 460                                phys_dist[i * numa_dist_cnt + j] =
 461                                        node_distance(i, j);
 462        }
 463
 464        /*
 465         * Determine the max emulated nid and the default phys nid to use
 466         * for unmapped nodes.
 467         */
 468        max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
 469
 470        /* commit */
 471        *numa_meminfo = ei;
 472
 473        /* Make sure numa_nodes_parsed only contains emulated nodes */
 474        nodes_clear(numa_nodes_parsed);
 475        for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
 476                if (ei.blk[i].start != ei.blk[i].end &&
 477                    ei.blk[i].nid != NUMA_NO_NODE)
 478                        node_set(ei.blk[i].nid, numa_nodes_parsed);
 479
 480        /*
 481         * Transform __apicid_to_node table to use emulated nids by
 482         * reverse-mapping phys_nid.  The maps should always exist but fall
 483         * back to zero just in case.
 484         */
 485        for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
 486                if (__apicid_to_node[i] == NUMA_NO_NODE)
 487                        continue;
 488                for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
 489                        if (__apicid_to_node[i] == emu_nid_to_phys[j])
 490                                break;
 491                __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
 492        }
 493
 494        /* make sure all emulated nodes are mapped to a physical node */
 495        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
 496                if (emu_nid_to_phys[i] == NUMA_NO_NODE)
 497                        emu_nid_to_phys[i] = dfl_phys_nid;
 498
 499        /* transform distance table */
 500        numa_reset_distance();
 501        for (i = 0; i < max_emu_nid + 1; i++) {
 502                for (j = 0; j < max_emu_nid + 1; j++) {
 503                        int physi = emu_nid_to_phys[i];
 504                        int physj = emu_nid_to_phys[j];
 505                        int dist;
 506
 507                        if (get_option(&emu_cmdline, &dist) == 2)
 508                                ;
 509                        else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
 510                                dist = physi == physj ?
 511                                        LOCAL_DISTANCE : REMOTE_DISTANCE;
 512                        else
 513                                dist = phys_dist[physi * numa_dist_cnt + physj];
 514
 515                        numa_set_distance(i, j, dist);
 516                }
 517        }
 518
 519        /* free the copied physical distance table */
 520        if (phys_dist)
 521                memblock_free(__pa(phys_dist), phys_size);
 522        return;
 523
 524no_emu:
 525        /* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
 526        for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
 527                emu_nid_to_phys[i] = i;
 528}
 529
 530#ifndef CONFIG_DEBUG_PER_CPU_MAPS
 531void numa_add_cpu(int cpu)
 532{
 533        int physnid, nid;
 534
 535        nid = early_cpu_to_node(cpu);
 536        BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
 537
 538        physnid = emu_nid_to_phys[nid];
 539
 540        /*
 541         * Map the cpu to each emulated node that is allocated on the physical
 542         * node of the cpu's apic id.
 543         */
 544        for_each_online_node(nid)
 545                if (emu_nid_to_phys[nid] == physnid)
 546                        cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
 547}
 548
 549void numa_remove_cpu(int cpu)
 550{
 551        int i;
 552
 553        for_each_online_node(i)
 554                cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
 555}
 556#else   /* !CONFIG_DEBUG_PER_CPU_MAPS */
 557static void numa_set_cpumask(int cpu, bool enable)
 558{
 559        int nid, physnid;
 560
 561        nid = early_cpu_to_node(cpu);
 562        if (nid == NUMA_NO_NODE) {
 563                /* early_cpu_to_node() already emits a warning and trace */
 564                return;
 565        }
 566
 567        physnid = emu_nid_to_phys[nid];
 568
 569        for_each_online_node(nid) {
 570                if (emu_nid_to_phys[nid] != physnid)
 571                        continue;
 572
 573                debug_cpumask_set_cpu(cpu, nid, enable);
 574        }
 575}
 576
 577void numa_add_cpu(int cpu)
 578{
 579        numa_set_cpumask(cpu, true);
 580}
 581
 582void numa_remove_cpu(int cpu)
 583{
 584        numa_set_cpumask(cpu, false);
 585}
 586#endif  /* !CONFIG_DEBUG_PER_CPU_MAPS */
 587