qemu/numa.c
<<
>>
Prefs
   1/*
   2 * NUMA parameter parsing routines
   3 *
   4 * Copyright (c) 2014 Fujitsu Ltd.
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "sysemu/numa.h"
  27#include "exec/cpu-common.h"
  28#include "exec/ramlist.h"
  29#include "qemu/bitmap.h"
  30#include "qom/cpu.h"
  31#include "qemu/error-report.h"
  32#include "qapi/error.h"
  33#include "qapi/opts-visitor.h"
  34#include "qapi/qapi-commands-misc.h"
  35#include "qapi/qapi-visit-misc.h"
  36#include "hw/boards.h"
  37#include "sysemu/hostmem.h"
  38#include "hw/mem/pc-dimm.h"
  39#include "qemu/option.h"
  40#include "qemu/config-file.h"
  41#include "qemu/cutils.h"
  42
  43QemuOptsList qemu_numa_opts = {
  44    .name = "numa",
  45    .implied_opt_name = "type",
  46    .head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
  47    .desc = { { 0 } } /* validated with OptsVisitor */
  48};
  49
  50static int have_memdevs = -1;
  51static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
  52                             * For all nodes, nodeid < max_numa_nodeid
  53                             */
  54int nb_numa_nodes;
  55bool have_numa_distance;
  56NodeInfo numa_info[MAX_NODES];
  57
  58
  59static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
  60                            Error **errp)
  61{
  62    uint16_t nodenr;
  63    uint16List *cpus = NULL;
  64    MachineClass *mc = MACHINE_GET_CLASS(ms);
  65
  66    if (node->has_nodeid) {
  67        nodenr = node->nodeid;
  68    } else {
  69        nodenr = nb_numa_nodes;
  70    }
  71
  72    if (nodenr >= MAX_NODES) {
  73        error_setg(errp, "Max number of NUMA nodes reached: %"
  74                   PRIu16 "", nodenr);
  75        return;
  76    }
  77
  78    if (numa_info[nodenr].present) {
  79        error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr);
  80        return;
  81    }
  82
  83    if (!mc->cpu_index_to_instance_props || !mc->get_default_cpu_node_id) {
  84        error_report("NUMA is not supported by this machine-type");
  85        exit(1);
  86    }
  87    for (cpus = node->cpus; cpus; cpus = cpus->next) {
  88        CpuInstanceProperties props;
  89        if (cpus->value >= max_cpus) {
  90            error_setg(errp,
  91                       "CPU index (%" PRIu16 ")"
  92                       " should be smaller than maxcpus (%d)",
  93                       cpus->value, max_cpus);
  94            return;
  95        }
  96        props = mc->cpu_index_to_instance_props(ms, cpus->value);
  97        props.node_id = nodenr;
  98        props.has_node_id = true;
  99        machine_set_cpu_numa_node(ms, &props, &error_fatal);
 100    }
 101
 102    if (node->has_mem && node->has_memdev) {
 103        error_setg(errp, "cannot specify both mem= and memdev=");
 104        return;
 105    }
 106
 107    if (have_memdevs == -1) {
 108        have_memdevs = node->has_memdev;
 109    }
 110    if (node->has_memdev != have_memdevs) {
 111        error_setg(errp, "memdev option must be specified for either "
 112                   "all or no nodes");
 113        return;
 114    }
 115
 116    if (node->has_mem) {
 117        numa_info[nodenr].node_mem = node->mem;
 118    }
 119    if (node->has_memdev) {
 120        Object *o;
 121        o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL);
 122        if (!o) {
 123            error_setg(errp, "memdev=%s is ambiguous", node->memdev);
 124            return;
 125        }
 126
 127        object_ref(o);
 128        numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
 129        numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
 130    }
 131    numa_info[nodenr].present = true;
 132    max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
 133    nb_numa_nodes++;
 134}
 135
 136static void parse_numa_distance(NumaDistOptions *dist, Error **errp)
 137{
 138    uint16_t src = dist->src;
 139    uint16_t dst = dist->dst;
 140    uint8_t val = dist->val;
 141
 142    if (src >= MAX_NODES || dst >= MAX_NODES) {
 143        error_setg(errp,
 144                   "Invalid node %d, max possible could be %d",
 145                   MAX(src, dst), MAX_NODES);
 146        return;
 147    }
 148
 149    if (!numa_info[src].present || !numa_info[dst].present) {
 150        error_setg(errp, "Source/Destination NUMA node is missing. "
 151                   "Please use '-numa node' option to declare it first.");
 152        return;
 153    }
 154
 155    if (val < NUMA_DISTANCE_MIN) {
 156        error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, "
 157                   "it shouldn't be less than %d.",
 158                   val, NUMA_DISTANCE_MIN);
 159        return;
 160    }
 161
 162    if (src == dst && val != NUMA_DISTANCE_MIN) {
 163        error_setg(errp, "Local distance of node %d should be %d.",
 164                   src, NUMA_DISTANCE_MIN);
 165        return;
 166    }
 167
 168    numa_info[src].distance[dst] = val;
 169    have_numa_distance = true;
 170}
 171
 172static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
 173{
 174    NumaOptions *object = NULL;
 175    MachineState *ms = opaque;
 176    Error *err = NULL;
 177
 178    {
 179        Visitor *v = opts_visitor_new(opts);
 180        visit_type_NumaOptions(v, NULL, &object, &err);
 181        visit_free(v);
 182    }
 183
 184    if (err) {
 185        goto end;
 186    }
 187
 188    /* Fix up legacy suffix-less format */
 189    if ((object->type == NUMA_OPTIONS_TYPE_NODE) && object->u.node.has_mem) {
 190        const char *mem_str = qemu_opt_get(opts, "mem");
 191        qemu_strtosz_MiB(mem_str, NULL, &object->u.node.mem);
 192    }
 193
 194    switch (object->type) {
 195    case NUMA_OPTIONS_TYPE_NODE:
 196        parse_numa_node(ms, &object->u.node, &err);
 197        if (err) {
 198            goto end;
 199        }
 200        break;
 201    case NUMA_OPTIONS_TYPE_DIST:
 202        parse_numa_distance(&object->u.dist, &err);
 203        if (err) {
 204            goto end;
 205        }
 206        break;
 207    case NUMA_OPTIONS_TYPE_CPU:
 208        if (!object->u.cpu.has_node_id) {
 209            error_setg(&err, "Missing mandatory node-id property");
 210            goto end;
 211        }
 212        if (!numa_info[object->u.cpu.node_id].present) {
 213            error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be "
 214                "defined with -numa node,nodeid=ID before it's used with "
 215                "-numa cpu,node-id=ID", object->u.cpu.node_id);
 216            goto end;
 217        }
 218
 219        machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
 220                                  &err);
 221        break;
 222    default:
 223        abort();
 224    }
 225
 226end:
 227    qapi_free_NumaOptions(object);
 228    if (err) {
 229        error_report_err(err);
 230        return -1;
 231    }
 232
 233    return 0;
 234}
 235
 236/* If all node pair distances are symmetric, then only distances
 237 * in one direction are enough. If there is even one asymmetric
 238 * pair, though, then all distances must be provided. The
 239 * distance from a node to itself is always NUMA_DISTANCE_MIN,
 240 * so providing it is never necessary.
 241 */
 242static void validate_numa_distance(void)
 243{
 244    int src, dst;
 245    bool is_asymmetrical = false;
 246
 247    for (src = 0; src < nb_numa_nodes; src++) {
 248        for (dst = src; dst < nb_numa_nodes; dst++) {
 249            if (numa_info[src].distance[dst] == 0 &&
 250                numa_info[dst].distance[src] == 0) {
 251                if (src != dst) {
 252                    error_report("The distance between node %d and %d is "
 253                                 "missing, at least one distance value "
 254                                 "between each nodes should be provided.",
 255                                 src, dst);
 256                    exit(EXIT_FAILURE);
 257                }
 258            }
 259
 260            if (numa_info[src].distance[dst] != 0 &&
 261                numa_info[dst].distance[src] != 0 &&
 262                numa_info[src].distance[dst] !=
 263                numa_info[dst].distance[src]) {
 264                is_asymmetrical = true;
 265            }
 266        }
 267    }
 268
 269    if (is_asymmetrical) {
 270        for (src = 0; src < nb_numa_nodes; src++) {
 271            for (dst = 0; dst < nb_numa_nodes; dst++) {
 272                if (src != dst && numa_info[src].distance[dst] == 0) {
 273                    error_report("At least one asymmetrical pair of "
 274                            "distances is given, please provide distances "
 275                            "for both directions of all node pairs.");
 276                    exit(EXIT_FAILURE);
 277                }
 278            }
 279        }
 280    }
 281}
 282
 283static void complete_init_numa_distance(void)
 284{
 285    int src, dst;
 286
 287    /* Fixup NUMA distance by symmetric policy because if it is an
 288     * asymmetric distance table, it should be a complete table and
 289     * there would not be any missing distance except local node, which
 290     * is verified by validate_numa_distance above.
 291     */
 292    for (src = 0; src < nb_numa_nodes; src++) {
 293        for (dst = 0; dst < nb_numa_nodes; dst++) {
 294            if (numa_info[src].distance[dst] == 0) {
 295                if (src == dst) {
 296                    numa_info[src].distance[dst] = NUMA_DISTANCE_MIN;
 297                } else {
 298                    numa_info[src].distance[dst] = numa_info[dst].distance[src];
 299                }
 300            }
 301        }
 302    }
 303}
 304
 305void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
 306                                 int nb_nodes, ram_addr_t size)
 307{
 308    int i;
 309    uint64_t usedmem = 0;
 310
 311    /* Align each node according to the alignment
 312     * requirements of the machine class
 313     */
 314
 315    for (i = 0; i < nb_nodes - 1; i++) {
 316        nodes[i].node_mem = (size / nb_nodes) &
 317                            ~((1 << mc->numa_mem_align_shift) - 1);
 318        usedmem += nodes[i].node_mem;
 319    }
 320    nodes[i].node_mem = size - usedmem;
 321}
 322
 323void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
 324                                  int nb_nodes, ram_addr_t size)
 325{
 326    int i;
 327    uint64_t usedmem = 0, node_mem;
 328    uint64_t granularity = size / nb_nodes;
 329    uint64_t propagate = 0;
 330
 331    for (i = 0; i < nb_nodes - 1; i++) {
 332        node_mem = (granularity + propagate) &
 333                   ~((1 << mc->numa_mem_align_shift) - 1);
 334        propagate = granularity + propagate - node_mem;
 335        nodes[i].node_mem = node_mem;
 336        usedmem += node_mem;
 337    }
 338    nodes[i].node_mem = size - usedmem;
 339}
 340
 341void parse_numa_opts(MachineState *ms)
 342{
 343    int i;
 344    MachineClass *mc = MACHINE_GET_CLASS(ms);
 345
 346    if (qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, NULL)) {
 347        exit(1);
 348    }
 349
 350    /*
 351     * If memory hotplug is enabled (slots > 0) but without '-numa'
 352     * options explicitly on CLI, guestes will break.
 353     *
 354     *   Windows: won't enable memory hotplug without SRAT table at all
 355     *
 356     *   Linux: if QEMU is started with initial memory all below 4Gb
 357     *   and no SRAT table present, guest kernel will use nommu DMA ops,
 358     *   which breaks 32bit hw drivers when memory is hotplugged and
 359     *   guest tries to use it with that drivers.
 360     *
 361     * Enable NUMA implicitly by adding a new NUMA node automatically.
 362     */
 363    if (ms->ram_slots > 0 && nb_numa_nodes == 0 &&
 364        mc->auto_enable_numa_with_memhp) {
 365            NumaNodeOptions node = { };
 366            parse_numa_node(ms, &node, NULL);
 367    }
 368
 369    assert(max_numa_nodeid <= MAX_NODES);
 370
 371    /* No support for sparse NUMA node IDs yet: */
 372    for (i = max_numa_nodeid - 1; i >= 0; i--) {
 373        /* Report large node IDs first, to make mistakes easier to spot */
 374        if (!numa_info[i].present) {
 375            error_report("numa: Node ID missing: %d", i);
 376            exit(1);
 377        }
 378    }
 379
 380    /* This must be always true if all nodes are present: */
 381    assert(nb_numa_nodes == max_numa_nodeid);
 382
 383    if (nb_numa_nodes > 0) {
 384        uint64_t numa_total;
 385
 386        if (nb_numa_nodes > MAX_NODES) {
 387            nb_numa_nodes = MAX_NODES;
 388        }
 389
 390        /* If no memory size is given for any node, assume the default case
 391         * and distribute the available memory equally across all nodes
 392         */
 393        for (i = 0; i < nb_numa_nodes; i++) {
 394            if (numa_info[i].node_mem != 0) {
 395                break;
 396            }
 397        }
 398        if (i == nb_numa_nodes) {
 399            assert(mc->numa_auto_assign_ram);
 400            mc->numa_auto_assign_ram(mc, numa_info, nb_numa_nodes, ram_size);
 401        }
 402
 403        numa_total = 0;
 404        for (i = 0; i < nb_numa_nodes; i++) {
 405            numa_total += numa_info[i].node_mem;
 406        }
 407        if (numa_total != ram_size) {
 408            error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
 409                         " should equal RAM size (0x" RAM_ADDR_FMT ")",
 410                         numa_total, ram_size);
 411            exit(1);
 412        }
 413
 414        /* QEMU needs at least all unique node pair distances to build
 415         * the whole NUMA distance table. QEMU treats the distance table
 416         * as symmetric by default, i.e. distance A->B == distance B->A.
 417         * Thus, QEMU is able to complete the distance table
 418         * initialization even though only distance A->B is provided and
 419         * distance B->A is not. QEMU knows the distance of a node to
 420         * itself is always 10, so A->A distances may be omitted. When
 421         * the distances of two nodes of a pair differ, i.e. distance
 422         * A->B != distance B->A, then that means the distance table is
 423         * asymmetric. In this case, the distances for both directions
 424         * of all node pairs are required.
 425         */
 426        if (have_numa_distance) {
 427            /* Validate enough NUMA distance information was provided. */
 428            validate_numa_distance();
 429
 430            /* Validation succeeded, now fill in any missing distances. */
 431            complete_init_numa_distance();
 432        }
 433    }
 434}
 435
 436void numa_cpu_pre_plug(const CPUArchId *slot, DeviceState *dev, Error **errp)
 437{
 438    int node_id = object_property_get_int(OBJECT(dev), "node-id", &error_abort);
 439
 440    if (node_id == CPU_UNSET_NUMA_NODE_ID) {
 441        /* due to bug in libvirt, it doesn't pass node-id from props on
 442         * device_add as expected, so we have to fix it up here */
 443        if (slot->props.has_node_id) {
 444            object_property_set_int(OBJECT(dev), slot->props.node_id,
 445                                    "node-id", errp);
 446        }
 447    } else if (node_id != slot->props.node_id) {
 448        error_setg(errp, "node-id=%d must match numa node specified "
 449                   "with -numa option", node_id);
 450    }
 451}
 452
 453static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
 454                                           const char *name,
 455                                           uint64_t ram_size)
 456{
 457    if (mem_path) {
 458#ifdef __linux__
 459        Error *err = NULL;
 460        memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false,
 461                                         mem_path, &err);
 462        if (err) {
 463            error_report_err(err);
 464            if (mem_prealloc) {
 465                exit(1);
 466            }
 467            error_report("falling back to regular RAM allocation.");
 468
 469            /* Legacy behavior: if allocation failed, fall back to
 470             * regular RAM allocation.
 471             */
 472            memory_region_init_ram_nomigrate(mr, owner, name, ram_size, &error_fatal);
 473        }
 474#else
 475        fprintf(stderr, "-mem-path not supported on this host\n");
 476        exit(1);
 477#endif
 478    } else {
 479        memory_region_init_ram_nomigrate(mr, owner, name, ram_size, &error_fatal);
 480    }
 481    vmstate_register_ram_global(mr);
 482}
 483
 484void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
 485                                          const char *name,
 486                                          uint64_t ram_size)
 487{
 488    uint64_t addr = 0;
 489    int i;
 490
 491    if (nb_numa_nodes == 0 || !have_memdevs) {
 492        allocate_system_memory_nonnuma(mr, owner, name, ram_size);
 493        return;
 494    }
 495
 496    memory_region_init(mr, owner, name, ram_size);
 497    for (i = 0; i < nb_numa_nodes; i++) {
 498        uint64_t size = numa_info[i].node_mem;
 499        HostMemoryBackend *backend = numa_info[i].node_memdev;
 500        if (!backend) {
 501            continue;
 502        }
 503        MemoryRegion *seg = host_memory_backend_get_memory(backend,
 504                                                           &error_fatal);
 505
 506        if (memory_region_is_mapped(seg)) {
 507            char *path = object_get_canonical_path_component(OBJECT(backend));
 508            error_report("memory backend %s is used multiple times. Each "
 509                         "-numa option must use a different memdev value.",
 510                         path);
 511            exit(1);
 512        }
 513
 514        host_memory_backend_set_mapped(backend, true);
 515        memory_region_add_subregion(mr, addr, seg);
 516        vmstate_register_ram_global(seg);
 517        addr += size;
 518    }
 519}
 520
 521static void numa_stat_memory_devices(NumaNodeMem node_mem[])
 522{
 523    MemoryDeviceInfoList *info_list = qmp_pc_dimm_device_list();
 524    MemoryDeviceInfoList *info;
 525    PCDIMMDeviceInfo     *pcdimm_info;
 526
 527    for (info = info_list; info; info = info->next) {
 528        MemoryDeviceInfo *value = info->value;
 529
 530        if (value) {
 531            switch (value->type) {
 532            case MEMORY_DEVICE_INFO_KIND_DIMM:
 533                pcdimm_info = value->u.dimm.data;
 534                break;
 535
 536            case MEMORY_DEVICE_INFO_KIND_NVDIMM:
 537                pcdimm_info = value->u.nvdimm.data;
 538                break;
 539
 540            default:
 541                pcdimm_info = NULL;
 542                break;
 543            }
 544
 545            if (pcdimm_info) {
 546                node_mem[pcdimm_info->node].node_mem += pcdimm_info->size;
 547                if (pcdimm_info->hotpluggable && pcdimm_info->hotplugged) {
 548                    node_mem[pcdimm_info->node].node_plugged_mem +=
 549                        pcdimm_info->size;
 550                }
 551            }
 552        }
 553    }
 554    qapi_free_MemoryDeviceInfoList(info_list);
 555}
 556
 557void query_numa_node_mem(NumaNodeMem node_mem[])
 558{
 559    int i;
 560
 561    if (nb_numa_nodes <= 0) {
 562        return;
 563    }
 564
 565    numa_stat_memory_devices(node_mem);
 566    for (i = 0; i < nb_numa_nodes; i++) {
 567        node_mem[i].node_mem += numa_info[i].node_mem;
 568    }
 569}
 570
 571static int query_memdev(Object *obj, void *opaque)
 572{
 573    MemdevList **list = opaque;
 574    MemdevList *m = NULL;
 575
 576    if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
 577        m = g_malloc0(sizeof(*m));
 578
 579        m->value = g_malloc0(sizeof(*m->value));
 580
 581        m->value->id = object_property_get_str(obj, "id", NULL);
 582        m->value->has_id = !!m->value->id;
 583
 584        m->value->size = object_property_get_uint(obj, "size",
 585                                                  &error_abort);
 586        m->value->merge = object_property_get_bool(obj, "merge",
 587                                                   &error_abort);
 588        m->value->dump = object_property_get_bool(obj, "dump",
 589                                                  &error_abort);
 590        m->value->prealloc = object_property_get_bool(obj,
 591                                                      "prealloc",
 592                                                      &error_abort);
 593        m->value->policy = object_property_get_enum(obj,
 594                                                    "policy",
 595                                                    "HostMemPolicy",
 596                                                    &error_abort);
 597        object_property_get_uint16List(obj, "host-nodes",
 598                                       &m->value->host_nodes,
 599                                       &error_abort);
 600
 601        m->next = *list;
 602        *list = m;
 603    }
 604
 605    return 0;
 606}
 607
 608MemdevList *qmp_query_memdev(Error **errp)
 609{
 610    Object *obj = object_get_objects_root();
 611    MemdevList *list = NULL;
 612
 613    object_child_foreach(obj, query_memdev, &list);
 614    return list;
 615}
 616
 617void ram_block_notifier_add(RAMBlockNotifier *n)
 618{
 619    QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next);
 620}
 621
 622void ram_block_notifier_remove(RAMBlockNotifier *n)
 623{
 624    QLIST_REMOVE(n, next);
 625}
 626
 627void ram_block_notify_add(void *host, size_t size)
 628{
 629    RAMBlockNotifier *notifier;
 630
 631    QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
 632        notifier->ram_block_added(notifier, host, size);
 633    }
 634}
 635
 636void ram_block_notify_remove(void *host, size_t size)
 637{
 638    RAMBlockNotifier *notifier;
 639
 640    QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
 641        notifier->ram_block_removed(notifier, host, size);
 642    }
 643}
 644