linux/arch/s390/numa/mode_emu.c
/*
 * NUMA support for s390
 *
 * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
 * without using real topology information about the physical memory of the
 * machine.
 *
 * It distributes the available CPUs to nodes while respecting the original
 * machine topology information. This is done by trying not to separate
 * CPUs that reside on the same book or even on the same MC.
 *
 * Because the current Linux scheduler code requires a stable CPU to node
 * mapping, cores are pinned to nodes when the first CPU thread is set online.
 *
 * Copyright IBM Corp. 2015
 */

#define KMSG_COMPONENT "numa_emu"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/node.h>
#include <linux/memory.h>
#include <linux/slab.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include "numa_mode.h"
#include "toptree.h"

/* Distances between the different system components */
#define DIST_EMPTY      0
#define DIST_CORE       1
#define DIST_MC         2
#define DIST_BOOK       3
#define DIST_MAX        4

/* Node distance reported to common code */
#define EMU_NODE_DIST   10

/* Node ID for free (not yet pinned) cores */
#define NODE_ID_FREE    -1

/* Different levels of toptree */
enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY};
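
/*
 * The levels form the hierarchy TOPOLOGY -> NODE -> BOOK -> MC -> CORE;
 * the helpers core_node(), core_book() and core_mc() below walk up this
 * hierarchy via the parent pointers.
 */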

/* The two toptree IDs */
enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};

/* Number of NUMA nodes */
static int emu_nodes = 1;
/* NUMA stripe size */
static unsigned long emu_size;

/*
 * Node to core pinning information updates are protected by
 * "sched_domains_mutex".
 */
static struct {
	s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */
	int total;                      /* Total number of pinned cores */
	int per_node_target;            /* Cores per node without extra cores */
	int per_node[MAX_NUMNODES];     /* Number of cores pinned to node */
} *emu_cores;
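
/*
 * For example (hypothetical values): with two emulated nodes and four
 * pinned cores, to_node_id[] could map cores 0 and 1 to node 0 and
 * cores 2 and 3 to node 1, giving per_node[0] = per_node[1] = 2 and
 * total = 4.
 */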

/*
 * Pin a core to a node
 */
static void pin_core_to_node(int core_id, int node_id)
{
	if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) {
		emu_cores->per_node[node_id]++;
		emu_cores->to_node_id[core_id] = node_id;
		emu_cores->total++;
	} else {
		WARN_ON(emu_cores->to_node_id[core_id] != node_id);
	}
}

/*
 * Number of pinned cores of a node
 */
static int cores_pinned(struct toptree *node)
{
	return emu_cores->per_node[node->id];
}

/*
 * ID of the node where the core is pinned (or NODE_ID_FREE)
 */
static int core_pinned_to_node_id(struct toptree *core)
{
	return emu_cores->to_node_id[core->id];
}

/*
 * Number of cores in the tree that are not yet pinned
 */
static int cores_free(struct toptree *tree)
{
	struct toptree *core;
	int count = 0;

	toptree_for_each(core, tree, CORE) {
		if (core_pinned_to_node_id(core) == NODE_ID_FREE)
			count++;
	}
	return count;
}

/*
 * Return node of core
 */
static struct toptree *core_node(struct toptree *core)
{
	return core->parent->parent->parent;
}

/*
 * Return book of core
 */
static struct toptree *core_book(struct toptree *core)
{
	return core->parent->parent;
}

/*
 * Return mc of core
 */
static struct toptree *core_mc(struct toptree *core)
{
	return core->parent;
}

/*
 * Distance between two cores
 */
static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
{
	if (core_book(core1)->id != core_book(core2)->id)
		return DIST_BOOK;
	if (core_mc(core1)->id != core_mc(core2)->id)
		return DIST_MC;
	/* Same core or sibling on same MC */
	return DIST_CORE;
}

/*
 * Distance of a node to a core
 */
static int dist_node_to_core(struct toptree *node, struct toptree *core)
{
	struct toptree *core_node;
	int dist_min = DIST_MAX;

	toptree_for_each(core_node, node, CORE)
		dist_min = min(dist_min, dist_core_to_core(core_node, core));
	return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
}

/*
 * toptree_unify() deletes empty nodes, therefore the nodes have to be
 * recreated afterwards.
 */
static void toptree_unify_tree(struct toptree *tree)
{
	int nid;

	toptree_unify(tree);
	for (nid = 0; nid < emu_nodes; nid++)
		toptree_get_child(tree, nid);
}

/*
 * Find the best/nearest node for a given core and ensure that no node
 * gets more than "emu_cores->per_node_target + extra" cores.
 */
static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
				     int extra)
{
	struct toptree *node, *node_best = NULL;
	int dist_cur, dist_best, cores_target;

	cores_target = emu_cores->per_node_target + extra;
	dist_best = DIST_MAX;
	node_best = NULL;
	toptree_for_each(node, numa, NODE) {
		/* Already pinned cores must use their nodes */
		if (core_pinned_to_node_id(core) == node->id) {
			node_best = node;
			break;
		}
		/* Skip nodes that already have enough cores */
		if (cores_pinned(node) >= cores_target)
			continue;
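		/*
		 * An empty node reports DIST_EMPTY (0) and therefore wins
		 * over any partially filled node when placing a free core.
		 */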
		dist_cur = dist_node_to_core(node, core);
		if (dist_cur < dist_best) {
			dist_best = dist_cur;
			node_best = node;
		}
	}
	return node_best;
}

/*
 * Find the best node for each core with respect to "extra" core count
 */
static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
				   int extra)
{
	struct toptree *node, *core, *tmp;

	toptree_for_each_safe(core, tmp, phys, CORE) {
		node = node_for_core(numa, core, extra);
		if (!node)
			return;
		toptree_move(core, node);
		pin_core_to_node(core->id, node->id);
	}
}

/*
 * Move structures of given level to specified NUMA node
 */
static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
				    enum toptree_level level, bool perfect)
{
	int cores_free, cores_target = emu_cores->per_node_target;
	struct toptree *cur, *tmp;

	toptree_for_each_safe(cur, tmp, phys, level) {
		cores_free = cores_target - toptree_count(node, CORE);
		if (perfect) {
			if (cores_free == toptree_count(cur, CORE))
				toptree_move(cur, node);
		} else {
			if (cores_free >= toptree_count(cur, CORE))
				toptree_move(cur, node);
		}
	}
}

/*
 * Move structures of a given level to NUMA nodes. If "perfect" is specified,
 * move only perfectly fitting structures. Otherwise, also move structures
 * that are smaller than needed.
 */
static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
			       enum toptree_level level, bool perfect)
{
	struct toptree *node;

	toptree_for_each(node, numa, NODE)
		move_level_to_numa_node(node, phys, level, perfect);
}

/*
 * For the first run try to move the big structures
 */
static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
{
	struct toptree *core;

	/* Always try to move perfectly fitting structures first */
	move_level_to_numa(numa, phys, BOOK, true);
	move_level_to_numa(numa, phys, BOOK, false);
	move_level_to_numa(numa, phys, MC, true);
	move_level_to_numa(numa, phys, MC, false);
	/* Now pin all the moved cores */
	toptree_for_each(core, numa, CORE)
		pin_core_to_node(core->id, core_node(core)->id);
}

/*
 * Allocate new topology and create required nodes
 */
static struct toptree *toptree_new(int id, int nodes)
{
	struct toptree *tree;
	int nid;

	tree = toptree_alloc(TOPOLOGY, id);
	if (!tree)
		goto fail;
	for (nid = 0; nid < nodes; nid++) {
		if (!toptree_get_child(tree, nid))
			goto fail;
	}
	return tree;
fail:
	panic("NUMA emulation could not allocate topology");
}

/*
 * Allocate and initialize core to node mapping
 */
static void create_core_to_node_map(void)
{
	int i;

	emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL);
	if (emu_cores == NULL)
		panic("Could not allocate cores to node memory");
	for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++)
		emu_cores->to_node_id[i] = NODE_ID_FREE;
}

/*
 * Move cores from physical topology into NUMA target topology
 * and try to keep as much of the physical topology as possible.
 */
static struct toptree *toptree_to_numa(struct toptree *phys)
{
	static int first = 1;
	struct toptree *numa;
	int cores_total;

	cores_total = emu_cores->total + cores_free(phys);
	emu_cores->per_node_target = cores_total / emu_nodes;
	numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
	if (first) {
		toptree_to_numa_first(numa, phys);
		first = 0;
	}
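	/*
	 * Two passes over the remaining free cores: the first pass fills
	 * each node up to the per-node target, the second pass places the
	 * division remainder, allowing at most one extra core per node.
	 */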
	toptree_to_numa_single(numa, phys, 0);
	toptree_to_numa_single(numa, phys, 1);
	toptree_unify_tree(numa);

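	/* Every core must have been moved to the NUMA topology by now */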
	WARN_ON(cpumask_weight(&phys->mask));
	return numa;
}

/*
 * Create a toptree out of the physical topology that we got from the hypervisor
 */
static struct toptree *toptree_from_topology(void)
{
	struct toptree *phys, *node, *book, *mc, *core;
	struct cpu_topology_s390 *top;
	int cpu;

	phys = toptree_new(TOPTREE_ID_PHYS, 1);

	for_each_online_cpu(cpu) {
		top = &per_cpu(cpu_topology, cpu);
		node = toptree_get_child(phys, 0);
		book = toptree_get_child(node, top->book_id);
		mc = toptree_get_child(book, top->socket_id);
		core = toptree_get_child(mc, top->core_id);
		if (!book || !mc || !core)
			panic("NUMA emulation could not allocate memory");
		cpumask_set_cpu(cpu, &core->mask);
		toptree_update_mask(mc);
	}
	return phys;
}

/*
 * Add toptree core to topology and create correct CPU masks
 */
static void topology_add_core(struct toptree *core)
{
	struct cpu_topology_s390 *top;
	int cpu;

	for_each_cpu(cpu, &core->mask) {
		top = &per_cpu(cpu_topology, cpu);
		cpumask_copy(&top->thread_mask, &core->mask);
		cpumask_copy(&top->core_mask, &core_mc(core)->mask);
		cpumask_copy(&top->book_mask, &core_book(core)->mask);
		cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]);
		top->node_id = core_node(core)->id;
	}
}

/*
 * Apply toptree to topology and create CPU masks
 */
static void toptree_to_topology(struct toptree *numa)
{
	struct toptree *core;
	int i;

	/* Clear all node masks */
	for (i = 0; i < MAX_NUMNODES; i++)
		cpumask_clear(&node_to_cpumask_map[i]);

	/* Rebuild all masks */
	toptree_for_each(core, numa, CORE)
		topology_add_core(core);
}

/*
 * Show the node to core mapping
 */
static void print_node_to_core_map(void)
{
	int nid, cid;

	if (!numa_debug_enabled)
		return;
	printk(KERN_DEBUG "NUMA node to core mapping\n");
	for (nid = 0; nid < emu_nodes; nid++) {
		printk(KERN_DEBUG "  node %3d: ", nid);
		for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) {
			if (emu_cores->to_node_id[cid] == nid)
				printk(KERN_CONT "%d ", cid);
		}
		printk(KERN_CONT "\n");
	}
}

/*
 * Transfer physical topology into a NUMA topology and modify CPU masks
 * according to the NUMA topology.
 *
 * Must be called with "sched_domains_mutex" lock held.
 */
static void emu_update_cpu_topology(void)
{
	struct toptree *phys, *numa;

	if (emu_cores == NULL)
		create_core_to_node_map();
	phys = toptree_from_topology();
	numa = toptree_to_numa(phys);
	toptree_free(phys);
	toptree_to_topology(numa);
	toptree_free(numa);
	print_node_to_core_map();
}

/*
 * If emu_size is not set, use CONFIG_EMU_SIZE. Then round it up to the
 * minimum alignment given by the memory block size (needed for memory
 * hotplug).
 */
static unsigned long emu_setup_size_adjust(unsigned long size)
{
	unsigned long size_new;

	size = size ? : CONFIG_EMU_SIZE;
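	/*
	 * For example (hypothetical sizes): a requested stripe of 200 MB is
	 * rounded up to 256 MB if the memory block size is 256 MB.
	 */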
	size_new = roundup(size, memory_block_size_bytes());
	if (size_new == size)
		return size;
	pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n",
		size >> 20, size_new >> 20);
	return size_new;
}

/*
 * If there is not enough memory for the specified number of nodes,
 * reduce the node count.
 */
static int emu_setup_nodes_adjust(int nodes)
{
	int nodes_max;

	nodes_max = memblock.memory.total_size / emu_size;
	nodes_max = max(nodes_max, 1);
	if (nodes_max >= nodes)
		return nodes;
	pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
	return nodes_max;
}

/*
 * Early emu setup
 */
static void emu_setup(void)
{
	emu_size = emu_setup_size_adjust(emu_size);
	emu_nodes = emu_setup_nodes_adjust(emu_nodes);
	pr_info("Creating %d nodes with memory stripe size %ld MB\n",
		emu_nodes, emu_size >> 20);
}

/*
 * Return node id for given page number
 */
static int emu_pfn_to_nid(unsigned long pfn)
{
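	/*
	 * Memory is striped round-robin across the emulated nodes. For
	 * example (hypothetical values): with a 512 MB stripe, 4 KB pages
	 * and four nodes, each stripe spans 131072 page frames, so pfn
	 * 393216 (the frame at 1.5 GB) lies in stripe 3 and maps to
	 * node 3 % 4 = 3.
	 */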
	return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
}

/*
 * Return stripe size
 */
static unsigned long emu_align(void)
{
	return emu_size;
}

/*
 * Return distance between two nodes
 */
static int emu_distance(int node1, int node2)
{
	return (node1 != node2) * EMU_NODE_DIST;
}

/*
 * Define callbacks for generic s390 NUMA infrastructure
 */
const struct numa_mode numa_mode_emu = {
	.name = "emu",
	.setup = emu_setup,
	.update_cpu_topology = emu_update_cpu_topology,
	.__pfn_to_nid = emu_pfn_to_nid,
	.align = emu_align,
	.distance = emu_distance,
};

/*
 * Kernel parameter: emu_nodes=<n>
 */
static int __init early_parse_emu_nodes(char *p)
{
	int count;

	if (kstrtoint(p, 0, &count) != 0 || count <= 0)
		return 0;
	emu_nodes = min(count, MAX_NUMNODES);
	return 0;
}
early_param("emu_nodes", early_parse_emu_nodes);

/*
 * Kernel parameter: emu_size=[<n>[k|M|G|T]]
 */
static int __init early_parse_emu_size(char *p)
{
	emu_size = memparse(p, NULL);
	return 0;
}
early_param("emu_size", early_parse_emu_size);
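
/*
 * Example usage (assuming the generic s390 NUMA code selects this mode via
 * the "numa=" early parameter):
 *
 *   numa=emu emu_nodes=4 emu_size=512M
 *
 * creates four emulated nodes with 512 MB memory stripes.
 */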