qemu/hw/core/numa.c
<<
>>
Prefs
   1/*
   2 * NUMA parameter parsing routines
   3 *
   4 * Copyright (c) 2014 Fujitsu Ltd.
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "sysemu/numa.h"
  27#include "exec/cpu-common.h"
  28#include "exec/ramlist.h"
  29#include "qemu/bitmap.h"
  30#include "qemu/error-report.h"
  31#include "qapi/error.h"
  32#include "qapi/opts-visitor.h"
  33#include "qapi/qapi-visit-machine.h"
  34#include "sysemu/qtest.h"
  35#include "hw/mem/pc-dimm.h"
  36#include "hw/mem/memory-device.h"
  37#include "qemu/option.h"
  38#include "qemu/config-file.h"
  39#include "qemu/cutils.h"
  40
/* Option list named "numa"; "type" is the implied option name.  No
 * per-option descriptors are listed here because the options are
 * validated with OptsVisitor instead.
 */
QemuOptsList qemu_numa_opts = {
    .name = "numa",
    .implied_opt_name = "type",
    .head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
    .desc = { { 0 } } /* validated with OptsVisitor */
};
  47
/* Non-zero once any parsed node used memdev= (set in parse_numa_node). */
static int have_memdevs;
/* Non-zero once any parsed node used mem= (set in parse_numa_node). */
static int have_mem;
static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
                             * For all nodes, nodeid < max_numa_nodeid
                             */
/* Number of NUMA nodes successfully parsed so far. */
int nb_numa_nodes;
/* Set to true by parse_numa_distance() once a distance was accepted. */
bool have_numa_distance;
/* Per-node configuration, indexed by node ID. */
NodeInfo numa_info[MAX_NODES];
  56
  57
  58static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
  59                            Error **errp)
  60{
  61    Error *err = NULL;
  62    uint16_t nodenr;
  63    uint16List *cpus = NULL;
  64    MachineClass *mc = MACHINE_GET_CLASS(ms);
  65    unsigned int max_cpus = ms->smp.max_cpus;
  66
  67    if (node->has_nodeid) {
  68        nodenr = node->nodeid;
  69    } else {
  70        nodenr = nb_numa_nodes;
  71    }
  72
  73    if (nodenr >= MAX_NODES) {
  74        error_setg(errp, "Max number of NUMA nodes reached: %"
  75                   PRIu16 "", nodenr);
  76        return;
  77    }
  78
  79    if (numa_info[nodenr].present) {
  80        error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr);
  81        return;
  82    }
  83
  84    if (!mc->cpu_index_to_instance_props || !mc->get_default_cpu_node_id) {
  85        error_setg(errp, "NUMA is not supported by this machine-type");
  86        return;
  87    }
  88    for (cpus = node->cpus; cpus; cpus = cpus->next) {
  89        CpuInstanceProperties props;
  90        if (cpus->value >= max_cpus) {
  91            error_setg(errp,
  92                       "CPU index (%" PRIu16 ")"
  93                       " should be smaller than maxcpus (%d)",
  94                       cpus->value, max_cpus);
  95            return;
  96        }
  97        props = mc->cpu_index_to_instance_props(ms, cpus->value);
  98        props.node_id = nodenr;
  99        props.has_node_id = true;
 100        machine_set_cpu_numa_node(ms, &props, &err);
 101        if (err) {
 102            error_propagate(errp, err);
 103            return;
 104        }
 105    }
 106
 107    have_memdevs = have_memdevs ? : node->has_memdev;
 108    have_mem = have_mem ? : node->has_mem;
 109    if ((node->has_mem && have_memdevs) || (node->has_memdev && have_mem)) {
 110        error_setg(errp, "numa configuration should use either mem= or memdev=,"
 111                   "mixing both is not allowed");
 112        return;
 113    }
 114
 115    if (node->has_mem) {
 116        numa_info[nodenr].node_mem = node->mem;
 117        if (!qtest_enabled()) {
 118            warn_report("Parameter -numa node,mem is deprecated,"
 119                        " use -numa node,memdev instead");
 120        }
 121    }
 122    if (node->has_memdev) {
 123        Object *o;
 124        o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL);
 125        if (!o) {
 126            error_setg(errp, "memdev=%s is ambiguous", node->memdev);
 127            return;
 128        }
 129
 130        object_ref(o);
 131        numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
 132        numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
 133    }
 134    numa_info[nodenr].present = true;
 135    max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
 136    nb_numa_nodes++;
 137}
 138
 139static void parse_numa_distance(NumaDistOptions *dist, Error **errp)
 140{
 141    uint16_t src = dist->src;
 142    uint16_t dst = dist->dst;
 143    uint8_t val = dist->val;
 144
 145    if (src >= MAX_NODES || dst >= MAX_NODES) {
 146        error_setg(errp, "Parameter '%s' expects an integer between 0 and %d",
 147                   src >= MAX_NODES ? "src" : "dst", MAX_NODES - 1);
 148        return;
 149    }
 150
 151    if (!numa_info[src].present || !numa_info[dst].present) {
 152        error_setg(errp, "Source/Destination NUMA node is missing. "
 153                   "Please use '-numa node' option to declare it first.");
 154        return;
 155    }
 156
 157    if (val < NUMA_DISTANCE_MIN) {
 158        error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, "
 159                   "it shouldn't be less than %d.",
 160                   val, NUMA_DISTANCE_MIN);
 161        return;
 162    }
 163
 164    if (src == dst && val != NUMA_DISTANCE_MIN) {
 165        error_setg(errp, "Local distance of node %d should be %d.",
 166                   src, NUMA_DISTANCE_MIN);
 167        return;
 168    }
 169
 170    numa_info[src].distance[dst] = val;
 171    have_numa_distance = true;
 172}
 173
 174void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
 175{
 176    Error *err = NULL;
 177
 178    switch (object->type) {
 179    case NUMA_OPTIONS_TYPE_NODE:
 180        parse_numa_node(ms, &object->u.node, &err);
 181        if (err) {
 182            goto end;
 183        }
 184        break;
 185    case NUMA_OPTIONS_TYPE_DIST:
 186        parse_numa_distance(&object->u.dist, &err);
 187        if (err) {
 188            goto end;
 189        }
 190        break;
 191    case NUMA_OPTIONS_TYPE_CPU:
 192        if (!object->u.cpu.has_node_id) {
 193            error_setg(&err, "Missing mandatory node-id property");
 194            goto end;
 195        }
 196        if (!numa_info[object->u.cpu.node_id].present) {
 197            error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be "
 198                "defined with -numa node,nodeid=ID before it's used with "
 199                "-numa cpu,node-id=ID", object->u.cpu.node_id);
 200            goto end;
 201        }
 202
 203        machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
 204                                  &err);
 205        break;
 206    default:
 207        abort();
 208    }
 209
 210end:
 211    error_propagate(errp, err);
 212}
 213
 214static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
 215{
 216    NumaOptions *object = NULL;
 217    MachineState *ms = MACHINE(opaque);
 218    Error *err = NULL;
 219    Visitor *v = opts_visitor_new(opts);
 220
 221    visit_type_NumaOptions(v, NULL, &object, &err);
 222    visit_free(v);
 223    if (err) {
 224        goto end;
 225    }
 226
 227    /* Fix up legacy suffix-less format */
 228    if ((object->type == NUMA_OPTIONS_TYPE_NODE) && object->u.node.has_mem) {
 229        const char *mem_str = qemu_opt_get(opts, "mem");
 230        qemu_strtosz_MiB(mem_str, NULL, &object->u.node.mem);
 231    }
 232
 233    set_numa_options(ms, object, &err);
 234
 235end:
 236    qapi_free_NumaOptions(object);
 237    if (err) {
 238        error_propagate(errp, err);
 239        return -1;
 240    }
 241
 242    return 0;
 243}
 244
 245/* If all node pair distances are symmetric, then only distances
 246 * in one direction are enough. If there is even one asymmetric
 247 * pair, though, then all distances must be provided. The
 248 * distance from a node to itself is always NUMA_DISTANCE_MIN,
 249 * so providing it is never necessary.
 250 */
 251static void validate_numa_distance(void)
 252{
 253    int src, dst;
 254    bool is_asymmetrical = false;
 255
 256    for (src = 0; src < nb_numa_nodes; src++) {
 257        for (dst = src; dst < nb_numa_nodes; dst++) {
 258            if (numa_info[src].distance[dst] == 0 &&
 259                numa_info[dst].distance[src] == 0) {
 260                if (src != dst) {
 261                    error_report("The distance between node %d and %d is "
 262                                 "missing, at least one distance value "
 263                                 "between each nodes should be provided.",
 264                                 src, dst);
 265                    exit(EXIT_FAILURE);
 266                }
 267            }
 268
 269            if (numa_info[src].distance[dst] != 0 &&
 270                numa_info[dst].distance[src] != 0 &&
 271                numa_info[src].distance[dst] !=
 272                numa_info[dst].distance[src]) {
 273                is_asymmetrical = true;
 274            }
 275        }
 276    }
 277
 278    if (is_asymmetrical) {
 279        for (src = 0; src < nb_numa_nodes; src++) {
 280            for (dst = 0; dst < nb_numa_nodes; dst++) {
 281                if (src != dst && numa_info[src].distance[dst] == 0) {
 282                    error_report("At least one asymmetrical pair of "
 283                            "distances is given, please provide distances "
 284                            "for both directions of all node pairs.");
 285                    exit(EXIT_FAILURE);
 286                }
 287            }
 288        }
 289    }
 290}
 291
 292static void complete_init_numa_distance(void)
 293{
 294    int src, dst;
 295
 296    /* Fixup NUMA distance by symmetric policy because if it is an
 297     * asymmetric distance table, it should be a complete table and
 298     * there would not be any missing distance except local node, which
 299     * is verified by validate_numa_distance above.
 300     */
 301    for (src = 0; src < nb_numa_nodes; src++) {
 302        for (dst = 0; dst < nb_numa_nodes; dst++) {
 303            if (numa_info[src].distance[dst] == 0) {
 304                if (src == dst) {
 305                    numa_info[src].distance[dst] = NUMA_DISTANCE_MIN;
 306                } else {
 307                    numa_info[src].distance[dst] = numa_info[dst].distance[src];
 308                }
 309            }
 310        }
 311    }
 312}
 313
 314void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
 315                                 int nb_nodes, ram_addr_t size)
 316{
 317    int i;
 318    uint64_t usedmem = 0;
 319
 320    /* Align each node according to the alignment
 321     * requirements of the machine class
 322     */
 323
 324    for (i = 0; i < nb_nodes - 1; i++) {
 325        nodes[i].node_mem = (size / nb_nodes) &
 326                            ~((1 << mc->numa_mem_align_shift) - 1);
 327        usedmem += nodes[i].node_mem;
 328    }
 329    nodes[i].node_mem = size - usedmem;
 330}
 331
 332void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
 333                                  int nb_nodes, ram_addr_t size)
 334{
 335    int i;
 336    uint64_t usedmem = 0, node_mem;
 337    uint64_t granularity = size / nb_nodes;
 338    uint64_t propagate = 0;
 339
 340    for (i = 0; i < nb_nodes - 1; i++) {
 341        node_mem = (granularity + propagate) &
 342                   ~((1 << mc->numa_mem_align_shift) - 1);
 343        propagate = granularity + propagate - node_mem;
 344        nodes[i].node_mem = node_mem;
 345        usedmem += node_mem;
 346    }
 347    nodes[i].node_mem = size - usedmem;
 348}
 349
 350void numa_complete_configuration(MachineState *ms)
 351{
 352    int i;
 353    MachineClass *mc = MACHINE_GET_CLASS(ms);
 354
 355    /*
 356     * If memory hotplug is enabled (slots > 0) but without '-numa'
 357     * options explicitly on CLI, guestes will break.
 358     *
 359     *   Windows: won't enable memory hotplug without SRAT table at all
 360     *
 361     *   Linux: if QEMU is started with initial memory all below 4Gb
 362     *   and no SRAT table present, guest kernel will use nommu DMA ops,
 363     *   which breaks 32bit hw drivers when memory is hotplugged and
 364     *   guest tries to use it with that drivers.
 365     *
 366     * Enable NUMA implicitly by adding a new NUMA node automatically.
 367     */
 368    if (ms->ram_slots > 0 && nb_numa_nodes == 0 &&
 369        mc->auto_enable_numa_with_memhp) {
 370            NumaNodeOptions node = { };
 371            parse_numa_node(ms, &node, &error_abort);
 372    }
 373
 374    assert(max_numa_nodeid <= MAX_NODES);
 375
 376    /* No support for sparse NUMA node IDs yet: */
 377    for (i = max_numa_nodeid - 1; i >= 0; i--) {
 378        /* Report large node IDs first, to make mistakes easier to spot */
 379        if (!numa_info[i].present) {
 380            error_report("numa: Node ID missing: %d", i);
 381            exit(1);
 382        }
 383    }
 384
 385    /* This must be always true if all nodes are present: */
 386    assert(nb_numa_nodes == max_numa_nodeid);
 387
 388    if (nb_numa_nodes > 0) {
 389        uint64_t numa_total;
 390
 391        if (nb_numa_nodes > MAX_NODES) {
 392            nb_numa_nodes = MAX_NODES;
 393        }
 394
 395        /* If no memory size is given for any node, assume the default case
 396         * and distribute the available memory equally across all nodes
 397         */
 398        for (i = 0; i < nb_numa_nodes; i++) {
 399            if (numa_info[i].node_mem != 0) {
 400                break;
 401            }
 402        }
 403        if (i == nb_numa_nodes) {
 404            assert(mc->numa_auto_assign_ram);
 405            mc->numa_auto_assign_ram(mc, numa_info, nb_numa_nodes, ram_size);
 406            if (!qtest_enabled()) {
 407                warn_report("Default splitting of RAM between nodes is deprecated,"
 408                            " Use '-numa node,memdev' to explictly define RAM"
 409                            " allocation per node");
 410            }
 411        }
 412
 413        numa_total = 0;
 414        for (i = 0; i < nb_numa_nodes; i++) {
 415            numa_total += numa_info[i].node_mem;
 416        }
 417        if (numa_total != ram_size) {
 418            error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
 419                         " should equal RAM size (0x" RAM_ADDR_FMT ")",
 420                         numa_total, ram_size);
 421            exit(1);
 422        }
 423
 424        /* QEMU needs at least all unique node pair distances to build
 425         * the whole NUMA distance table. QEMU treats the distance table
 426         * as symmetric by default, i.e. distance A->B == distance B->A.
 427         * Thus, QEMU is able to complete the distance table
 428         * initialization even though only distance A->B is provided and
 429         * distance B->A is not. QEMU knows the distance of a node to
 430         * itself is always 10, so A->A distances may be omitted. When
 431         * the distances of two nodes of a pair differ, i.e. distance
 432         * A->B != distance B->A, then that means the distance table is
 433         * asymmetric. In this case, the distances for both directions
 434         * of all node pairs are required.
 435         */
 436        if (have_numa_distance) {
 437            /* Validate enough NUMA distance information was provided. */
 438            validate_numa_distance();
 439
 440            /* Validation succeeded, now fill in any missing distances. */
 441            complete_init_numa_distance();
 442        }
 443    }
 444}
 445
 446void parse_numa_opts(MachineState *ms)
 447{
 448    qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, &error_fatal);
 449}
 450
 451void numa_cpu_pre_plug(const CPUArchId *slot, DeviceState *dev, Error **errp)
 452{
 453    int node_id = object_property_get_int(OBJECT(dev), "node-id", &error_abort);
 454
 455    if (node_id == CPU_UNSET_NUMA_NODE_ID) {
 456        /* due to bug in libvirt, it doesn't pass node-id from props on
 457         * device_add as expected, so we have to fix it up here */
 458        if (slot->props.has_node_id) {
 459            object_property_set_int(OBJECT(dev), slot->props.node_id,
 460                                    "node-id", errp);
 461        }
 462    } else if (node_id != slot->props.node_id) {
 463        error_setg(errp, "invalid node-id, must be %"PRId64,
 464                   slot->props.node_id);
 465    }
 466}
 467
 468static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
 469                                           const char *name,
 470                                           uint64_t ram_size)
 471{
 472    if (mem_path) {
 473#ifdef __linux__
 474        Error *err = NULL;
 475        memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, 0,
 476                                         mem_path, &err);
 477        if (err) {
 478            error_report_err(err);
 479            if (mem_prealloc) {
 480                exit(1);
 481            }
 482            warn_report("falling back to regular RAM allocation");
 483            error_printf("This is deprecated. Make sure that -mem-path "
 484                         " specified path has sufficient resources to allocate"
 485                         " -m specified RAM amount");
 486            /* Legacy behavior: if allocation failed, fall back to
 487             * regular RAM allocation.
 488             */
 489            mem_path = NULL;
 490            memory_region_init_ram_nomigrate(mr, owner, name, ram_size, &error_fatal);
 491        }
 492#else
 493        fprintf(stderr, "-mem-path not supported on this host\n");
 494        exit(1);
 495#endif
 496    } else {
 497        memory_region_init_ram_nomigrate(mr, owner, name, ram_size, &error_fatal);
 498    }
 499    vmstate_register_ram_global(mr);
 500}
 501
 502void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
 503                                          const char *name,
 504                                          uint64_t ram_size)
 505{
 506    uint64_t addr = 0;
 507    int i;
 508
 509    if (nb_numa_nodes == 0 || !have_memdevs) {
 510        allocate_system_memory_nonnuma(mr, owner, name, ram_size);
 511        return;
 512    }
 513
 514    memory_region_init(mr, owner, name, ram_size);
 515    for (i = 0; i < nb_numa_nodes; i++) {
 516        uint64_t size = numa_info[i].node_mem;
 517        HostMemoryBackend *backend = numa_info[i].node_memdev;
 518        if (!backend) {
 519            continue;
 520        }
 521        MemoryRegion *seg = host_memory_backend_get_memory(backend);
 522
 523        if (memory_region_is_mapped(seg)) {
 524            char *path = object_get_canonical_path_component(OBJECT(backend));
 525            error_report("memory backend %s is used multiple times. Each "
 526                         "-numa option must use a different memdev value.",
 527                         path);
 528            g_free(path);
 529            exit(1);
 530        }
 531
 532        host_memory_backend_set_mapped(backend, true);
 533        memory_region_add_subregion(mr, addr, seg);
 534        vmstate_register_ram_global(seg);
 535        addr += size;
 536    }
 537}
 538
 539static void numa_stat_memory_devices(NumaNodeMem node_mem[])
 540{
 541    MemoryDeviceInfoList *info_list = qmp_memory_device_list();
 542    MemoryDeviceInfoList *info;
 543    PCDIMMDeviceInfo     *pcdimm_info;
 544    VirtioPMEMDeviceInfo *vpi;
 545
 546    for (info = info_list; info; info = info->next) {
 547        MemoryDeviceInfo *value = info->value;
 548
 549        if (value) {
 550            switch (value->type) {
 551            case MEMORY_DEVICE_INFO_KIND_DIMM:
 552            case MEMORY_DEVICE_INFO_KIND_NVDIMM:
 553                pcdimm_info = value->type == MEMORY_DEVICE_INFO_KIND_DIMM ?
 554                              value->u.dimm.data : value->u.nvdimm.data;
 555                node_mem[pcdimm_info->node].node_mem += pcdimm_info->size;
 556                node_mem[pcdimm_info->node].node_plugged_mem +=
 557                    pcdimm_info->size;
 558                break;
 559            case MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM:
 560                vpi = value->u.virtio_pmem.data;
 561                /* TODO: once we support numa, assign to right node */
 562                node_mem[0].node_mem += vpi->size;
 563                node_mem[0].node_plugged_mem += vpi->size;
 564                break;
 565            default:
 566                g_assert_not_reached();
 567            }
 568        }
 569    }
 570    qapi_free_MemoryDeviceInfoList(info_list);
 571}
 572
 573void query_numa_node_mem(NumaNodeMem node_mem[])
 574{
 575    int i;
 576
 577    if (nb_numa_nodes <= 0) {
 578        return;
 579    }
 580
 581    numa_stat_memory_devices(node_mem);
 582    for (i = 0; i < nb_numa_nodes; i++) {
 583        node_mem[i].node_mem += numa_info[i].node_mem;
 584    }
 585}
 586
/* Register @n by inserting it at the head of ram_list.ramblock_notifiers. */
void ram_block_notifier_add(RAMBlockNotifier *n)
{
    QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next);
}
 591
/* Unlink @n from the notifier list it is on. */
void ram_block_notifier_remove(RAMBlockNotifier *n)
{
    QLIST_REMOVE(n, next);
}
 596
 597void ram_block_notify_add(void *host, size_t size)
 598{
 599    RAMBlockNotifier *notifier;
 600
 601    QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
 602        notifier->ram_block_added(notifier, host, size);
 603    }
 604}
 605
 606void ram_block_notify_remove(void *host, size_t size)
 607{
 608    RAMBlockNotifier *notifier;
 609
 610    QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
 611        notifier->ram_block_removed(notifier, host, size);
 612    }
 613}
 614