linux/arch/s390/numa/mode_emu.c
// SPDX-License-Identifier: GPL-2.0
/*
 * NUMA support for s390
 *
 * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
 * without using real topology information about the physical memory of the
 * machine.
 *
 * It distributes the available CPUs to nodes while respecting the original
 * machine topology information. This is done by trying to avoid separating
 * CPUs which reside on the same book or even on the same MC.
 *
 * Because the current Linux scheduler code requires a stable CPU to node
 * mapping, cores are pinned to nodes when the first CPU thread is set online.
 *
 * Copyright IBM Corp. 2015
 */

#define KMSG_COMPONENT "numa_emu"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/node.h>
#include <linux/memory.h>
#include <linux/slab.h>
#include <asm/smp.h>
#include <asm/topology.h>
#include "numa_mode.h"
#include "toptree.h"

/* Distances between the different system components */
#define DIST_EMPTY      0
#define DIST_CORE       1
#define DIST_MC         2
#define DIST_BOOK       3
#define DIST_DRAWER     4
#define DIST_MAX        5

/* Node distance reported to common code */
#define EMU_NODE_DIST   10

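/*
 * Note: the DIST_* values above are only used internally to rank candidate
 * nodes when cores are placed; the distance exposed to common code via
 * emu_distance() is always either 0 or EMU_NODE_DIST.
 */
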
/* Node ID for free (not yet pinned) cores */
#define NODE_ID_FREE    -1

/* Different levels of toptree */
enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY};

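/*
 * The physical toptree built in toptree_from_topology() has this shape
 * (root to leaf): TOPOLOGY -> NODE -> DRAWER -> BOOK -> MC -> CORE.
 * The core_node()/core_drawer()/core_book()/core_mc() helpers below walk
 * the ->parent pointers of a core according to this hierarchy.
 */
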
  51/* The two toptree IDs */
  52enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};
  53
  54/* Number of NUMA nodes */
  55static int emu_nodes = 1;
  56/* NUMA stripe size */
  57static unsigned long emu_size;
  58
  59/*
  60 * Node to core pinning information updates are protected by
  61 * "sched_domains_mutex".
  62 */
  63static struct {
  64        s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */
  65        int total;                      /* Total number of pinned cores */
  66        int per_node_target;            /* Cores per node without extra cores */
  67        int per_node[MAX_NUMNODES];     /* Number of cores pinned to node */
  68} *emu_cores;
  69
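/*
 * Note: per_node_target is recomputed on each topology update in
 * toptree_to_numa() as the total number of cores divided by the number of
 * emulated nodes; the remainder is distributed by allowing one "extra" core
 * per node in a second placement pass.
 */
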
/*
 * Pin a core to a node
 */
static void pin_core_to_node(int core_id, int node_id)
{
        if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) {
                emu_cores->per_node[node_id]++;
                emu_cores->to_node_id[core_id] = node_id;
                emu_cores->total++;
        } else {
                WARN_ON(emu_cores->to_node_id[core_id] != node_id);
        }
}

/*
 * Number of pinned cores of a node
 */
static int cores_pinned(struct toptree *node)
{
        return emu_cores->per_node[node->id];
}

/*
 * ID of the node where the core is pinned (or NODE_ID_FREE)
 */
static int core_pinned_to_node_id(struct toptree *core)
{
        return emu_cores->to_node_id[core->id];
}

/*
 * Number of cores in the tree that are not yet pinned
 */
static int cores_free(struct toptree *tree)
{
        struct toptree *core;
        int count = 0;

        toptree_for_each(core, tree, CORE) {
                if (core_pinned_to_node_id(core) == NODE_ID_FREE)
                        count++;
        }
        return count;
}

/*
 * Return node of core
 */
static struct toptree *core_node(struct toptree *core)
{
        return core->parent->parent->parent->parent;
}

/*
 * Return drawer of core
 */
static struct toptree *core_drawer(struct toptree *core)
{
        return core->parent->parent->parent;
}

/*
 * Return book of core
 */
static struct toptree *core_book(struct toptree *core)
{
        return core->parent->parent;
}

/*
 * Return mc of core
 */
static struct toptree *core_mc(struct toptree *core)
{
        return core->parent;
}

/*
 * Distance between two cores
 */
static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
{
        if (core_drawer(core1)->id != core_drawer(core2)->id)
                return DIST_DRAWER;
        if (core_book(core1)->id != core_book(core2)->id)
                return DIST_BOOK;
        if (core_mc(core1)->id != core_mc(core2)->id)
                return DIST_MC;
        /* Same core or sibling on same MC */
        return DIST_CORE;
}

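/*
 * Example: two cores that sit in the same book but on different MCs are
 * DIST_MC apart; cores in different drawers are always DIST_DRAWER apart,
 * regardless of the lower levels.
 */
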
/*
 * Distance of a node to a core
 */
static int dist_node_to_core(struct toptree *node, struct toptree *core)
{
        struct toptree *core_node;
        int dist_min = DIST_MAX;

        toptree_for_each(core_node, node, CORE)
                dist_min = min(dist_min, dist_core_to_core(core_node, core));
        return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
}

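/*
 * Note: a node without any cores yields DIST_EMPTY (0), so empty nodes are
 * preferred over all non-empty nodes when the nearest node is searched in
 * node_for_core().
 */
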
/*
 * Unify will delete empty nodes, therefore recreate nodes.
 */
static void toptree_unify_tree(struct toptree *tree)
{
        int nid;

        toptree_unify(tree);
        for (nid = 0; nid < emu_nodes; nid++)
                toptree_get_child(tree, nid);
}

/*
 * Find the best/nearest node for a given core and ensure that no node
 * gets more than "emu_cores->per_node_target + extra" cores.
 */
static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
                                     int extra)
{
        struct toptree *node, *node_best = NULL;
        int dist_cur, dist_best, cores_target;

        cores_target = emu_cores->per_node_target + extra;
        dist_best = DIST_MAX;
        node_best = NULL;
        toptree_for_each(node, numa, NODE) {
                /* Already pinned cores must use their nodes */
                if (core_pinned_to_node_id(core) == node->id) {
                        node_best = node;
                        break;
                }
                /* Skip nodes that already have enough cores */
                if (cores_pinned(node) >= cores_target)
                        continue;
                dist_cur = dist_node_to_core(node, core);
                if (dist_cur < dist_best) {
                        dist_best = dist_cur;
                        node_best = node;
                }
        }
        return node_best;
}

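/*
 * Note: a core that was pinned during an earlier topology update is always
 * placed back on its node by node_for_core(), even if that node already has
 * more than "cores_target" cores. This keeps the CPU to node mapping stable
 * as required by the scheduler (see the comment at the top of this file).
 */
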
/*
 * Find the best node for each core with respect to "extra" core count
 */
static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
                                   int extra)
{
        struct toptree *node, *core, *tmp;

        toptree_for_each_safe(core, tmp, phys, CORE) {
                node = node_for_core(numa, core, extra);
                if (!node)
                        return;
                toptree_move(core, node);
                pin_core_to_node(core->id, node->id);
        }
}

/*
 * Move structures of given level to specified NUMA node
 */
static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
                                    enum toptree_level level, bool perfect)
{
        int cores_free, cores_target = emu_cores->per_node_target;
        struct toptree *cur, *tmp;

        toptree_for_each_safe(cur, tmp, phys, level) {
                cores_free = cores_target - toptree_count(node, CORE);
                if (perfect) {
                        if (cores_free == toptree_count(cur, CORE))
                                toptree_move(cur, node);
                } else {
                        if (cores_free >= toptree_count(cur, CORE))
                                toptree_move(cur, node);
                }
        }
}

/*
 * Move structures of a given level to NUMA nodes. If "perfect" is specified,
 * move only perfectly fitting structures. Otherwise also move structures
 * that are smaller than needed.
 */
static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
                               enum toptree_level level, bool perfect)
{
        struct toptree *node;

        toptree_for_each(node, numa, NODE)
                move_level_to_numa_node(node, phys, level, perfect);
}

/*
 * For the first run try to move the big structures
 */
static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
{
        struct toptree *core;

        /* Always try to move perfectly fitting structures first */
        move_level_to_numa(numa, phys, DRAWER, true);
        move_level_to_numa(numa, phys, DRAWER, false);
        move_level_to_numa(numa, phys, BOOK, true);
        move_level_to_numa(numa, phys, BOOK, false);
        move_level_to_numa(numa, phys, MC, true);
        move_level_to_numa(numa, phys, MC, false);
        /* Now pin all the moved cores */
        toptree_for_each(core, numa, CORE)
                pin_core_to_node(core->id, core_node(core)->id);
}

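/*
 * Note: the first pass above moves whole drawers, then books, then MCs, and
 * for each level tries perfectly fitting structures before partially filling
 * nodes, so that large physical units stay together on one node whenever
 * possible.
 */
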
/*
 * Allocate new topology and create required nodes
 */
static struct toptree *toptree_new(int id, int nodes)
{
        struct toptree *tree;
        int nid;

        tree = toptree_alloc(TOPOLOGY, id);
        if (!tree)
                goto fail;
        for (nid = 0; nid < nodes; nid++) {
                if (!toptree_get_child(tree, nid))
                        goto fail;
        }
        return tree;
fail:
        panic("NUMA emulation could not allocate topology");
}

/*
 * Allocate and initialize core to node mapping
 */
static void __ref create_core_to_node_map(void)
{
        int i;

        emu_cores = memblock_virt_alloc(sizeof(*emu_cores), 8);
        for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++)
                emu_cores->to_node_id[i] = NODE_ID_FREE;
}

/*
 * Move cores from physical topology into NUMA target topology
 * and try to keep as much of the physical topology as possible.
 */
static struct toptree *toptree_to_numa(struct toptree *phys)
{
        static int first = 1;
        struct toptree *numa;
        int cores_total;

        cores_total = emu_cores->total + cores_free(phys);
        emu_cores->per_node_target = cores_total / emu_nodes;
        numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
        if (first) {
                toptree_to_numa_first(numa, phys);
                first = 0;
        }
        toptree_to_numa_single(numa, phys, 0);
        toptree_to_numa_single(numa, phys, 1);
        toptree_unify_tree(numa);

        WARN_ON(cpumask_weight(&phys->mask));
        return numa;
}

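/*
 * Example: with 10 cores and emu_nodes = 4, per_node_target is 2. The
 * extra = 0 pass pins at most two cores per node (8 cores), and the
 * extra = 1 pass places the remaining two cores on the nearest nodes that
 * still have room for a third core.
 */
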
/*
 * Create a toptree out of the physical topology that we got from the hypervisor
 */
static struct toptree *toptree_from_topology(void)
{
        struct toptree *phys, *node, *drawer, *book, *mc, *core;
        struct cpu_topology_s390 *top;
        int cpu;

        phys = toptree_new(TOPTREE_ID_PHYS, 1);

        for_each_cpu(cpu, &cpus_with_topology) {
                top = &cpu_topology[cpu];
                node = toptree_get_child(phys, 0);
                drawer = toptree_get_child(node, top->drawer_id);
                book = toptree_get_child(drawer, top->book_id);
                mc = toptree_get_child(book, top->socket_id);
                core = toptree_get_child(mc, smp_get_base_cpu(cpu));
                if (!drawer || !book || !mc || !core)
                        panic("NUMA emulation could not allocate memory");
                cpumask_set_cpu(cpu, &core->mask);
                toptree_update_mask(mc);
        }
        return phys;
}

/*
 * Add toptree core to topology and create correct CPU masks
 */
static void topology_add_core(struct toptree *core)
{
        struct cpu_topology_s390 *top;
        int cpu;

        for_each_cpu(cpu, &core->mask) {
                top = &cpu_topology[cpu];
                cpumask_copy(&top->thread_mask, &core->mask);
                cpumask_copy(&top->core_mask, &core_mc(core)->mask);
                cpumask_copy(&top->book_mask, &core_book(core)->mask);
                cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask);
                cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]);
                top->node_id = core_node(core)->id;
        }
}

/*
 * Apply toptree to topology and create CPU masks
 */
static void toptree_to_topology(struct toptree *numa)
{
        struct toptree *core;
        int i;

        /* Clear all node masks */
        for (i = 0; i < MAX_NUMNODES; i++)
                cpumask_clear(&node_to_cpumask_map[i]);

        /* Rebuild all masks */
        toptree_for_each(core, numa, CORE)
                topology_add_core(core);
}

/*
 * Show the node to core mapping
 */
static void print_node_to_core_map(void)
{
        int nid, cid;

        if (!numa_debug_enabled)
                return;
        printk(KERN_DEBUG "NUMA node to core mapping\n");
        for (nid = 0; nid < emu_nodes; nid++) {
                printk(KERN_DEBUG "  node %3d: ", nid);
                for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) {
                        if (emu_cores->to_node_id[cid] == nid)
                                printk(KERN_CONT "%d ", cid);
                }
                printk(KERN_CONT "\n");
        }
}

static void pin_all_possible_cpus(void)
{
        int core_id, node_id, cpu;
        static int initialized;

        if (initialized)
                return;
        print_node_to_core_map();
        node_id = 0;
        for_each_possible_cpu(cpu) {
                core_id = smp_get_base_cpu(cpu);
                if (emu_cores->to_node_id[core_id] != NODE_ID_FREE)
                        continue;
                pin_core_to_node(core_id, node_id);
                cpu_topology[cpu].node_id = node_id;
                node_id = (node_id + 1) % emu_nodes;
        }
        print_node_to_core_map();
        initialized = 1;
}

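/*
 * Note: any core that is still unpinned after the toptree placement
 * (typically because none of its CPUs has provided topology information yet)
 * is assigned to the emulated nodes round-robin above, so that every
 * possible CPU has a node before it is brought online. This runs only once
 * (see "initialized").
 */
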
/*
 * Transfer physical topology into a NUMA topology and modify CPU masks
 * according to the NUMA topology.
 *
 * Must be called with "sched_domains_mutex" lock held.
 */
static void emu_update_cpu_topology(void)
{
        struct toptree *phys, *numa;

        if (emu_cores == NULL)
                create_core_to_node_map();
        phys = toptree_from_topology();
        numa = toptree_to_numa(phys);
        toptree_free(phys);
        toptree_to_topology(numa);
        toptree_free(numa);
        pin_all_possible_cpus();
}

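/*
 * Note: the physical toptree is rebuilt from the current hardware
 * information on every call, while the pinning data in emu_cores persists
 * across calls, so previously pinned cores keep their nodes.
 */
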
/*
 * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum
 * alignment (needed for memory hotplug).
 */
static unsigned long emu_setup_size_adjust(unsigned long size)
{
        unsigned long size_new;

        size = size ? : CONFIG_EMU_SIZE;
        size_new = roundup(size, memory_block_size_bytes());
        if (size_new == size)
                return size;
        pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n",
                size >> 20, size_new >> 20);
        return size_new;
}

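/*
 * Example (assuming memory_block_size_bytes() returns 256 MB): a requested
 * stripe size of 200 MB is rounded up to 256 MB and a warning is printed;
 * a requested size of 512 MB is already aligned and returned unchanged.
 */
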
/*
 * If we do not have enough memory for the specified nodes, reduce the node
 * count.
 */
static int emu_setup_nodes_adjust(int nodes)
{
        int nodes_max;

        nodes_max = memblock.memory.total_size / emu_size;
        nodes_max = max(nodes_max, 1);
        if (nodes_max >= nodes)
                return nodes;
        pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
        return nodes_max;
}

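/*
 * Example: with 8 GB of memory and a 1 GB stripe size, at most 8 nodes can
 * be created; a request for 16 nodes is reduced to 8 with a warning.
 */
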
/*
 * Early emu setup
 */
static void emu_setup(void)
{
        int nid;

        emu_size = emu_setup_size_adjust(emu_size);
        emu_nodes = emu_setup_nodes_adjust(emu_nodes);
        for (nid = 0; nid < emu_nodes; nid++)
                node_set(nid, node_possible_map);
        pr_info("Creating %d nodes with memory stripe size %ld MB\n",
                emu_nodes, emu_size >> 20);
}

/*
 * Return node id for given page number
 */
static int emu_pfn_to_nid(unsigned long pfn)
{
        return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
}

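/*
 * Example: with emu_size = 512 MB and emu_nodes = 4, the memory stripes map
 * round-robin to nodes: [0, 512 MB) -> node 0, [512 MB, 1 GB) -> node 1,
 * ..., [2 GB, 2.5 GB) -> node 0 again.
 */
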
/*
 * Return stripe size
 */
static unsigned long emu_align(void)
{
        return emu_size;
}

/*
 * Return distance between two nodes
 */
static int emu_distance(int node1, int node2)
{
        return (node1 != node2) * EMU_NODE_DIST;
}

/*
 * Define callbacks for generic s390 NUMA infrastructure
 */
const struct numa_mode numa_mode_emu = {
        .name = "emu",
        .setup = emu_setup,
        .update_cpu_topology = emu_update_cpu_topology,
        .__pfn_to_nid = emu_pfn_to_nid,
        .align = emu_align,
        .distance = emu_distance,
};

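/*
 * Usage sketch (example values, not taken from this file): on kernels that
 * provide this mode, booting with "numa=emu emu_nodes=4 emu_size=1G" is
 * intended to select NUMA emulation and create four nodes with 1 GB memory
 * stripes; the "numa=" parameter itself is handled by the generic s390 NUMA
 * setup code, not here.
 */
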
/*
 * Kernel parameter: emu_nodes=<n>
 */
static int __init early_parse_emu_nodes(char *p)
{
        int count;

        if (kstrtoint(p, 0, &count) != 0 || count <= 0)
                return 0;
        emu_nodes = min(count, MAX_NUMNODES);
        return 0;
}
early_param("emu_nodes", early_parse_emu_nodes);

/*
 * Kernel parameter: emu_size=[<n>[k|M|G|T]]
 */
static int __init early_parse_emu_size(char *p)
{
        emu_size = memparse(p, NULL);
        return 0;
}
early_param("emu_size", early_parse_emu_size);
