qemu/hw/core/numa.c
<<
>>
Prefs
   1/*
   2 * NUMA parameter parsing routines
   3 *
   4 * Copyright (c) 2014 Fujitsu Ltd.
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu/units.h"
  27#include "sysemu/hostmem.h"
  28#include "sysemu/numa.h"
  29#include "sysemu/sysemu.h"
  30#include "exec/cpu-common.h"
  31#include "exec/ramlist.h"
  32#include "qemu/bitmap.h"
  33#include "qemu/error-report.h"
  34#include "qapi/error.h"
  35#include "qapi/opts-visitor.h"
  36#include "qapi/qapi-visit-machine.h"
  37#include "sysemu/qtest.h"
  38#include "hw/core/cpu.h"
  39#include "hw/mem/pc-dimm.h"
  40#include "migration/vmstate.h"
  41#include "hw/boards.h"
  42#include "hw/mem/memory-device.h"
  43#include "qemu/option.h"
  44#include "qemu/config-file.h"
  45#include "qemu/cutils.h"
  46
  47QemuOptsList qemu_numa_opts = {
  48    .name = "numa",
  49    .implied_opt_name = "type",
  50    .head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
  51    .desc = { { 0 } } /* validated with OptsVisitor */
  52};
  53
  54static int have_memdevs;
  55bool numa_uses_legacy_mem(void)
  56{
  57    return !have_memdevs;
  58}
  59
  60static int have_mem;
  61static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
  62                             * For all nodes, nodeid < max_numa_nodeid
  63                             */
  64
  65static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
  66                            Error **errp)
  67{
  68    Error *err = NULL;
  69    uint16_t nodenr;
  70    uint16List *cpus = NULL;
  71    MachineClass *mc = MACHINE_GET_CLASS(ms);
  72    unsigned int max_cpus = ms->smp.max_cpus;
  73    NodeInfo *numa_info = ms->numa_state->nodes;
  74
  75    if (node->has_nodeid) {
  76        nodenr = node->nodeid;
  77    } else {
  78        nodenr = ms->numa_state->num_nodes;
  79    }
  80
  81    if (nodenr >= MAX_NODES) {
  82        error_setg(errp, "Max number of NUMA nodes reached: %"
  83                   PRIu16 "", nodenr);
  84        return;
  85    }
  86
  87    if (numa_info[nodenr].present) {
  88        error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr);
  89        return;
  90    }
  91
  92    for (cpus = node->cpus; cpus; cpus = cpus->next) {
  93        CpuInstanceProperties props;
  94        if (cpus->value >= max_cpus) {
  95            error_setg(errp,
  96                       "CPU index (%" PRIu16 ")"
  97                       " should be smaller than maxcpus (%d)",
  98                       cpus->value, max_cpus);
  99            return;
 100        }
 101        props = mc->cpu_index_to_instance_props(ms, cpus->value);
 102        props.node_id = nodenr;
 103        props.has_node_id = true;
 104        machine_set_cpu_numa_node(ms, &props, &err);
 105        if (err) {
 106            error_propagate(errp, err);
 107            return;
 108        }
 109    }
 110
 111    have_memdevs = have_memdevs ? : node->has_memdev;
 112    have_mem = have_mem ? : node->has_mem;
 113    if ((node->has_mem && have_memdevs) || (node->has_memdev && have_mem)) {
 114        error_setg(errp, "numa configuration should use either mem= or memdev=,"
 115                   "mixing both is not allowed");
 116        return;
 117    }
 118
 119    if (node->has_mem) {
 120        numa_info[nodenr].node_mem = node->mem;
 121        if (!qtest_enabled()) {
 122            warn_report("Parameter -numa node,mem is deprecated,"
 123                        " use -numa node,memdev instead");
 124        }
 125    }
 126    if (node->has_memdev) {
 127        Object *o;
 128        o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL);
 129        if (!o) {
 130            error_setg(errp, "memdev=%s is ambiguous", node->memdev);
 131            return;
 132        }
 133
 134        object_ref(o);
 135        numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
 136        numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
 137    }
 138
 139    /*
 140     * If not set the initiator, set it to MAX_NODES. And if
 141     * HMAT is enabled and this node has no cpus, QEMU will raise error.
 142     */
 143    numa_info[nodenr].initiator = MAX_NODES;
 144    if (node->has_initiator) {
 145        if (!ms->numa_state->hmat_enabled) {
 146            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
 147                       "(HMAT) is disabled, enable it with -machine hmat=on "
 148                       "before using any of hmat specific options");
 149            return;
 150        }
 151
 152        if (node->initiator >= MAX_NODES) {
 153            error_report("The initiator id %" PRIu16 " expects an integer "
 154                         "between 0 and %d", node->initiator,
 155                         MAX_NODES - 1);
 156            return;
 157        }
 158
 159        numa_info[nodenr].initiator = node->initiator;
 160    }
 161    numa_info[nodenr].present = true;
 162    max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
 163    ms->numa_state->num_nodes++;
 164}
 165
 166static
 167void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp)
 168{
 169    uint16_t src = dist->src;
 170    uint16_t dst = dist->dst;
 171    uint8_t val = dist->val;
 172    NodeInfo *numa_info = ms->numa_state->nodes;
 173
 174    if (src >= MAX_NODES || dst >= MAX_NODES) {
 175        error_setg(errp, "Parameter '%s' expects an integer between 0 and %d",
 176                   src >= MAX_NODES ? "src" : "dst", MAX_NODES - 1);
 177        return;
 178    }
 179
 180    if (!numa_info[src].present || !numa_info[dst].present) {
 181        error_setg(errp, "Source/Destination NUMA node is missing. "
 182                   "Please use '-numa node' option to declare it first.");
 183        return;
 184    }
 185
 186    if (val < NUMA_DISTANCE_MIN) {
 187        error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, "
 188                   "it shouldn't be less than %d.",
 189                   val, NUMA_DISTANCE_MIN);
 190        return;
 191    }
 192
 193    if (src == dst && val != NUMA_DISTANCE_MIN) {
 194        error_setg(errp, "Local distance of node %d should be %d.",
 195                   src, NUMA_DISTANCE_MIN);
 196        return;
 197    }
 198
 199    numa_info[src].distance[dst] = val;
 200    ms->numa_state->have_numa_distance = true;
 201}
 202
 203void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node,
 204                        Error **errp)
 205{
 206    int i, first_bit, last_bit;
 207    uint64_t max_entry, temp_base, bitmap_copy;
 208    NodeInfo *numa_info = numa_state->nodes;
 209    HMAT_LB_Info *hmat_lb =
 210        numa_state->hmat_lb[node->hierarchy][node->data_type];
 211    HMAT_LB_Data lb_data = {};
 212    HMAT_LB_Data *lb_temp;
 213
 214    /* Error checking */
 215    if (node->initiator > numa_state->num_nodes) {
 216        error_setg(errp, "Invalid initiator=%d, it should be less than %d",
 217                   node->initiator, numa_state->num_nodes);
 218        return;
 219    }
 220    if (node->target > numa_state->num_nodes) {
 221        error_setg(errp, "Invalid target=%d, it should be less than %d",
 222                   node->target, numa_state->num_nodes);
 223        return;
 224    }
 225    if (!numa_info[node->initiator].has_cpu) {
 226        error_setg(errp, "Invalid initiator=%d, it isn't an "
 227                   "initiator proximity domain", node->initiator);
 228        return;
 229    }
 230    if (!numa_info[node->target].present) {
 231        error_setg(errp, "The target=%d should point to an existing node",
 232                   node->target);
 233        return;
 234    }
 235
 236    if (!hmat_lb) {
 237        hmat_lb = g_malloc0(sizeof(*hmat_lb));
 238        numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb;
 239        hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data));
 240    }
 241    hmat_lb->hierarchy = node->hierarchy;
 242    hmat_lb->data_type = node->data_type;
 243    lb_data.initiator = node->initiator;
 244    lb_data.target = node->target;
 245
 246    if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) {
 247        /* Input latency data */
 248
 249        if (!node->has_latency) {
 250            error_setg(errp, "Missing 'latency' option");
 251            return;
 252        }
 253        if (node->has_bandwidth) {
 254            error_setg(errp, "Invalid option 'bandwidth' since "
 255                       "the data type is latency");
 256            return;
 257        }
 258
 259        /* Detect duplicate configuration */
 260        for (i = 0; i < hmat_lb->list->len; i++) {
 261            lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
 262
 263            if (node->initiator == lb_temp->initiator &&
 264                node->target == lb_temp->target) {
 265                error_setg(errp, "Duplicate configuration of the latency for "
 266                    "initiator=%d and target=%d", node->initiator,
 267                    node->target);
 268                return;
 269            }
 270        }
 271
 272        hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX;
 273
 274        if (node->latency) {
 275            /* Calculate the temporary base and compressed latency */
 276            max_entry = node->latency;
 277            temp_base = 1;
 278            while (QEMU_IS_ALIGNED(max_entry, 10)) {
 279                max_entry /= 10;
 280                temp_base *= 10;
 281            }
 282
 283            /* Calculate the max compressed latency */
 284            temp_base = MIN(hmat_lb->base, temp_base);
 285            max_entry = node->latency / hmat_lb->base;
 286            max_entry = MAX(hmat_lb->range_bitmap, max_entry);
 287
 288            /*
 289             * For latency hmat_lb->range_bitmap record the max compressed
 290             * latency which should be less than 0xFFFF (UINT16_MAX)
 291             */
 292            if (max_entry >= UINT16_MAX) {
 293                error_setg(errp, "Latency %" PRIu64 " between initiator=%d and "
 294                        "target=%d should not differ from previously entered "
 295                        "min or max values on more than %d", node->latency,
 296                        node->initiator, node->target, UINT16_MAX - 1);
 297                return;
 298            } else {
 299                hmat_lb->base = temp_base;
 300                hmat_lb->range_bitmap = max_entry;
 301            }
 302
 303            /*
 304             * Set lb_info_provided bit 0 as 1,
 305             * latency information is provided
 306             */
 307            numa_info[node->target].lb_info_provided |= BIT(0);
 308        }
 309        lb_data.data = node->latency;
 310    } else if (node->data_type >= HMATLB_DATA_TYPE_ACCESS_BANDWIDTH) {
 311        /* Input bandwidth data */
 312        if (!node->has_bandwidth) {
 313            error_setg(errp, "Missing 'bandwidth' option");
 314            return;
 315        }
 316        if (node->has_latency) {
 317            error_setg(errp, "Invalid option 'latency' since "
 318                       "the data type is bandwidth");
 319            return;
 320        }
 321        if (!QEMU_IS_ALIGNED(node->bandwidth, MiB)) {
 322            error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d and "
 323                       "target=%d should be 1MB aligned", node->bandwidth,
 324                       node->initiator, node->target);
 325            return;
 326        }
 327
 328        /* Detect duplicate configuration */
 329        for (i = 0; i < hmat_lb->list->len; i++) {
 330            lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i);
 331
 332            if (node->initiator == lb_temp->initiator &&
 333                node->target == lb_temp->target) {
 334                error_setg(errp, "Duplicate configuration of the bandwidth for "
 335                    "initiator=%d and target=%d", node->initiator,
 336                    node->target);
 337                return;
 338            }
 339        }
 340
 341        hmat_lb->base = hmat_lb->base ? hmat_lb->base : 1;
 342
 343        if (node->bandwidth) {
 344            /* Keep bitmap unchanged when bandwidth out of range */
 345            bitmap_copy = hmat_lb->range_bitmap;
 346            bitmap_copy |= node->bandwidth;
 347            first_bit = ctz64(bitmap_copy);
 348            temp_base = UINT64_C(1) << first_bit;
 349            max_entry = node->bandwidth / temp_base;
 350            last_bit = 64 - clz64(bitmap_copy);
 351
 352            /*
 353             * For bandwidth, first_bit record the base unit of bandwidth bits,
 354             * last_bit record the last bit of the max bandwidth. The max
 355             * compressed bandwidth should be less than 0xFFFF (UINT16_MAX)
 356             */
 357            if ((last_bit - first_bit) > UINT16_BITS ||
 358                max_entry >= UINT16_MAX) {
 359                error_setg(errp, "Bandwidth %" PRIu64 " between initiator=%d "
 360                        "and target=%d should not differ from previously "
 361                        "entered values on more than %d", node->bandwidth,
 362                        node->initiator, node->target, UINT16_MAX - 1);
 363                return;
 364            } else {
 365                hmat_lb->base = temp_base;
 366                hmat_lb->range_bitmap = bitmap_copy;
 367            }
 368
 369            /*
 370             * Set lb_info_provided bit 1 as 1,
 371             * bandwidth information is provided
 372             */
 373            numa_info[node->target].lb_info_provided |= BIT(1);
 374        }
 375        lb_data.data = node->bandwidth;
 376    } else {
 377        assert(0);
 378    }
 379
 380    g_array_append_val(hmat_lb->list, lb_data);
 381}
 382
 383void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node,
 384                           Error **errp)
 385{
 386    int nb_numa_nodes = ms->numa_state->num_nodes;
 387    NodeInfo *numa_info = ms->numa_state->nodes;
 388    NumaHmatCacheOptions *hmat_cache = NULL;
 389
 390    if (node->node_id >= nb_numa_nodes) {
 391        error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less "
 392                   "than %d", node->node_id, nb_numa_nodes);
 393        return;
 394    }
 395
 396    if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) {
 397        error_setg(errp, "The latency and bandwidth information of "
 398                   "node-id=%" PRIu32 " should be provided before memory side "
 399                   "cache attributes", node->node_id);
 400        return;
 401    }
 402
 403    if (node->level < 1 || node->level >= HMAT_LB_LEVELS) {
 404        error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 "
 405                   "and less than or equal to %d", node->level,
 406                   HMAT_LB_LEVELS - 1);
 407        return;
 408    }
 409
 410    assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX);
 411    assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX);
 412    if (ms->numa_state->hmat_cache[node->node_id][node->level]) {
 413        error_setg(errp, "Duplicate configuration of the side cache for "
 414                   "node-id=%" PRIu32 " and level=%" PRIu8,
 415                   node->node_id, node->level);
 416        return;
 417    }
 418
 419    if ((node->level > 1) &&
 420        ms->numa_state->hmat_cache[node->node_id][node->level - 1] &&
 421        (node->size >=
 422            ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) {
 423        error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8
 424                   " should be less than the size(%" PRIu64 ") of "
 425                   "level=%u", node->size, node->level,
 426                   ms->numa_state->hmat_cache[node->node_id]
 427                                             [node->level - 1]->size,
 428                   node->level - 1);
 429        return;
 430    }
 431
 432    if ((node->level < HMAT_LB_LEVELS - 1) &&
 433        ms->numa_state->hmat_cache[node->node_id][node->level + 1] &&
 434        (node->size <=
 435            ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) {
 436        error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8
 437                   " should be larger than the size(%" PRIu64 ") of "
 438                   "level=%u", node->size, node->level,
 439                   ms->numa_state->hmat_cache[node->node_id]
 440                                             [node->level + 1]->size,
 441                   node->level + 1);
 442        return;
 443    }
 444
 445    hmat_cache = g_malloc0(sizeof(*hmat_cache));
 446    memcpy(hmat_cache, node, sizeof(*hmat_cache));
 447    ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache;
 448}
 449
 450void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
 451{
 452    Error *err = NULL;
 453
 454    if (!ms->numa_state) {
 455        error_setg(errp, "NUMA is not supported by this machine-type");
 456        goto end;
 457    }
 458
 459    switch (object->type) {
 460    case NUMA_OPTIONS_TYPE_NODE:
 461        parse_numa_node(ms, &object->u.node, &err);
 462        if (err) {
 463            goto end;
 464        }
 465        break;
 466    case NUMA_OPTIONS_TYPE_DIST:
 467        parse_numa_distance(ms, &object->u.dist, &err);
 468        if (err) {
 469            goto end;
 470        }
 471        break;
 472    case NUMA_OPTIONS_TYPE_CPU:
 473        if (!object->u.cpu.has_node_id) {
 474            error_setg(&err, "Missing mandatory node-id property");
 475            goto end;
 476        }
 477        if (!ms->numa_state->nodes[object->u.cpu.node_id].present) {
 478            error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be "
 479                "defined with -numa node,nodeid=ID before it's used with "
 480                "-numa cpu,node-id=ID", object->u.cpu.node_id);
 481            goto end;
 482        }
 483
 484        machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
 485                                  &err);
 486        break;
 487    case NUMA_OPTIONS_TYPE_HMAT_LB:
 488        if (!ms->numa_state->hmat_enabled) {
 489            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
 490                       "(HMAT) is disabled, enable it with -machine hmat=on "
 491                       "before using any of hmat specific options");
 492            return;
 493        }
 494
 495        parse_numa_hmat_lb(ms->numa_state, &object->u.hmat_lb, &err);
 496        if (err) {
 497            goto end;
 498        }
 499        break;
 500    case NUMA_OPTIONS_TYPE_HMAT_CACHE:
 501        if (!ms->numa_state->hmat_enabled) {
 502            error_setg(errp, "ACPI Heterogeneous Memory Attribute Table "
 503                       "(HMAT) is disabled, enable it with -machine hmat=on "
 504                       "before using any of hmat specific options");
 505            return;
 506        }
 507
 508        parse_numa_hmat_cache(ms, &object->u.hmat_cache, &err);
 509        if (err) {
 510            goto end;
 511        }
 512        break;
 513    default:
 514        abort();
 515    }
 516
 517end:
 518    error_propagate(errp, err);
 519}
 520
 521static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
 522{
 523    NumaOptions *object = NULL;
 524    MachineState *ms = MACHINE(opaque);
 525    Error *err = NULL;
 526    Visitor *v = opts_visitor_new(opts);
 527
 528    visit_type_NumaOptions(v, NULL, &object, &err);
 529    visit_free(v);
 530    if (err) {
 531        goto end;
 532    }
 533
 534    /* Fix up legacy suffix-less format */
 535    if ((object->type == NUMA_OPTIONS_TYPE_NODE) && object->u.node.has_mem) {
 536        const char *mem_str = qemu_opt_get(opts, "mem");
 537        qemu_strtosz_MiB(mem_str, NULL, &object->u.node.mem);
 538    }
 539
 540    set_numa_options(ms, object, &err);
 541
 542end:
 543    qapi_free_NumaOptions(object);
 544    if (err) {
 545        error_propagate(errp, err);
 546        return -1;
 547    }
 548
 549    return 0;
 550}
 551
 552/* If all node pair distances are symmetric, then only distances
 553 * in one direction are enough. If there is even one asymmetric
 554 * pair, though, then all distances must be provided. The
 555 * distance from a node to itself is always NUMA_DISTANCE_MIN,
 556 * so providing it is never necessary.
 557 */
 558static void validate_numa_distance(MachineState *ms)
 559{
 560    int src, dst;
 561    bool is_asymmetrical = false;
 562    int nb_numa_nodes = ms->numa_state->num_nodes;
 563    NodeInfo *numa_info = ms->numa_state->nodes;
 564
 565    for (src = 0; src < nb_numa_nodes; src++) {
 566        for (dst = src; dst < nb_numa_nodes; dst++) {
 567            if (numa_info[src].distance[dst] == 0 &&
 568                numa_info[dst].distance[src] == 0) {
 569                if (src != dst) {
 570                    error_report("The distance between node %d and %d is "
 571                                 "missing, at least one distance value "
 572                                 "between each nodes should be provided.",
 573                                 src, dst);
 574                    exit(EXIT_FAILURE);
 575                }
 576            }
 577
 578            if (numa_info[src].distance[dst] != 0 &&
 579                numa_info[dst].distance[src] != 0 &&
 580                numa_info[src].distance[dst] !=
 581                numa_info[dst].distance[src]) {
 582                is_asymmetrical = true;
 583            }
 584        }
 585    }
 586
 587    if (is_asymmetrical) {
 588        for (src = 0; src < nb_numa_nodes; src++) {
 589            for (dst = 0; dst < nb_numa_nodes; dst++) {
 590                if (src != dst && numa_info[src].distance[dst] == 0) {
 591                    error_report("At least one asymmetrical pair of "
 592                            "distances is given, please provide distances "
 593                            "for both directions of all node pairs.");
 594                    exit(EXIT_FAILURE);
 595                }
 596            }
 597        }
 598    }
 599}
 600
 601static void complete_init_numa_distance(MachineState *ms)
 602{
 603    int src, dst;
 604    NodeInfo *numa_info = ms->numa_state->nodes;
 605
 606    /* Fixup NUMA distance by symmetric policy because if it is an
 607     * asymmetric distance table, it should be a complete table and
 608     * there would not be any missing distance except local node, which
 609     * is verified by validate_numa_distance above.
 610     */
 611    for (src = 0; src < ms->numa_state->num_nodes; src++) {
 612        for (dst = 0; dst < ms->numa_state->num_nodes; dst++) {
 613            if (numa_info[src].distance[dst] == 0) {
 614                if (src == dst) {
 615                    numa_info[src].distance[dst] = NUMA_DISTANCE_MIN;
 616                } else {
 617                    numa_info[src].distance[dst] = numa_info[dst].distance[src];
 618                }
 619            }
 620        }
 621    }
 622}
 623
 624void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
 625                                 int nb_nodes, ram_addr_t size)
 626{
 627    int i;
 628    uint64_t usedmem = 0;
 629
 630    /* Align each node according to the alignment
 631     * requirements of the machine class
 632     */
 633
 634    for (i = 0; i < nb_nodes - 1; i++) {
 635        nodes[i].node_mem = (size / nb_nodes) &
 636                            ~((1 << mc->numa_mem_align_shift) - 1);
 637        usedmem += nodes[i].node_mem;
 638    }
 639    nodes[i].node_mem = size - usedmem;
 640}
 641
 642void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
 643                                  int nb_nodes, ram_addr_t size)
 644{
 645    int i;
 646    uint64_t usedmem = 0, node_mem;
 647    uint64_t granularity = size / nb_nodes;
 648    uint64_t propagate = 0;
 649
 650    for (i = 0; i < nb_nodes - 1; i++) {
 651        node_mem = (granularity + propagate) &
 652                   ~((1 << mc->numa_mem_align_shift) - 1);
 653        propagate = granularity + propagate - node_mem;
 654        nodes[i].node_mem = node_mem;
 655        usedmem += node_mem;
 656    }
 657    nodes[i].node_mem = size - usedmem;
 658}
 659
 660static void numa_init_memdev_container(MachineState *ms, MemoryRegion *ram)
 661{
 662    int i;
 663    uint64_t addr = 0;
 664
 665    for (i = 0; i < ms->numa_state->num_nodes; i++) {
 666        uint64_t size = ms->numa_state->nodes[i].node_mem;
 667        HostMemoryBackend *backend = ms->numa_state->nodes[i].node_memdev;
 668        if (!backend) {
 669            continue;
 670        }
 671        MemoryRegion *seg = machine_consume_memdev(ms, backend);
 672        memory_region_add_subregion(ram, addr, seg);
 673        addr += size;
 674    }
 675}
 676
 677void numa_complete_configuration(MachineState *ms)
 678{
 679    int i;
 680    MachineClass *mc = MACHINE_GET_CLASS(ms);
 681    NodeInfo *numa_info = ms->numa_state->nodes;
 682
 683    /*
 684     * If memory hotplug is enabled (slots > 0) but without '-numa'
 685     * options explicitly on CLI, guestes will break.
 686     *
 687     *   Windows: won't enable memory hotplug without SRAT table at all
 688     *
 689     *   Linux: if QEMU is started with initial memory all below 4Gb
 690     *   and no SRAT table present, guest kernel will use nommu DMA ops,
 691     *   which breaks 32bit hw drivers when memory is hotplugged and
 692     *   guest tries to use it with that drivers.
 693     *
 694     * Enable NUMA implicitly by adding a new NUMA node automatically.
 695     *
 696     * Or if MachineClass::auto_enable_numa is true and no NUMA nodes,
 697     * assume there is just one node with whole RAM.
 698     */
 699    if (ms->numa_state->num_nodes == 0 &&
 700        ((ms->ram_slots > 0 &&
 701        mc->auto_enable_numa_with_memhp) ||
 702        mc->auto_enable_numa)) {
 703            NumaNodeOptions node = { };
 704            parse_numa_node(ms, &node, &error_abort);
 705            numa_info[0].node_mem = ram_size;
 706    }
 707
 708    assert(max_numa_nodeid <= MAX_NODES);
 709
 710    /* No support for sparse NUMA node IDs yet: */
 711    for (i = max_numa_nodeid - 1; i >= 0; i--) {
 712        /* Report large node IDs first, to make mistakes easier to spot */
 713        if (!numa_info[i].present) {
 714            error_report("numa: Node ID missing: %d", i);
 715            exit(1);
 716        }
 717    }
 718
 719    /* This must be always true if all nodes are present: */
 720    assert(ms->numa_state->num_nodes == max_numa_nodeid);
 721
 722    if (ms->numa_state->num_nodes > 0) {
 723        uint64_t numa_total;
 724
 725        if (ms->numa_state->num_nodes > MAX_NODES) {
 726            ms->numa_state->num_nodes = MAX_NODES;
 727        }
 728
 729        /* If no memory size is given for any node, assume the default case
 730         * and distribute the available memory equally across all nodes
 731         */
 732        for (i = 0; i < ms->numa_state->num_nodes; i++) {
 733            if (numa_info[i].node_mem != 0) {
 734                break;
 735            }
 736        }
 737        if (i == ms->numa_state->num_nodes) {
 738            assert(mc->numa_auto_assign_ram);
 739            mc->numa_auto_assign_ram(mc, numa_info,
 740                                     ms->numa_state->num_nodes, ram_size);
 741            if (!qtest_enabled()) {
 742                warn_report("Default splitting of RAM between nodes is deprecated,"
 743                            " Use '-numa node,memdev' to explictly define RAM"
 744                            " allocation per node");
 745            }
 746        }
 747
 748        numa_total = 0;
 749        for (i = 0; i < ms->numa_state->num_nodes; i++) {
 750            numa_total += numa_info[i].node_mem;
 751        }
 752        if (numa_total != ram_size) {
 753            error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
 754                         " should equal RAM size (0x" RAM_ADDR_FMT ")",
 755                         numa_total, ram_size);
 756            exit(1);
 757        }
 758
 759        if (!numa_uses_legacy_mem() && mc->default_ram_id) {
 760            ms->ram = g_new(MemoryRegion, 1);
 761            memory_region_init(ms->ram, OBJECT(ms), mc->default_ram_id,
 762                               ram_size);
 763            numa_init_memdev_container(ms, ms->ram);
 764        }
 765        /* QEMU needs at least all unique node pair distances to build
 766         * the whole NUMA distance table. QEMU treats the distance table
 767         * as symmetric by default, i.e. distance A->B == distance B->A.
 768         * Thus, QEMU is able to complete the distance table
 769         * initialization even though only distance A->B is provided and
 770         * distance B->A is not. QEMU knows the distance of a node to
 771         * itself is always 10, so A->A distances may be omitted. When
 772         * the distances of two nodes of a pair differ, i.e. distance
 773         * A->B != distance B->A, then that means the distance table is
 774         * asymmetric. In this case, the distances for both directions
 775         * of all node pairs are required.
 776         */
 777        if (ms->numa_state->have_numa_distance) {
 778            /* Validate enough NUMA distance information was provided. */
 779            validate_numa_distance(ms);
 780
 781            /* Validation succeeded, now fill in any missing distances. */
 782            complete_init_numa_distance(ms);
 783        }
 784    }
 785}
 786
 787void parse_numa_opts(MachineState *ms)
 788{
 789    qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, &error_fatal);
 790}
 791
 792void numa_cpu_pre_plug(const CPUArchId *slot, DeviceState *dev, Error **errp)
 793{
 794    int node_id = object_property_get_int(OBJECT(dev), "node-id", &error_abort);
 795
 796    if (node_id == CPU_UNSET_NUMA_NODE_ID) {
 797        /* due to bug in libvirt, it doesn't pass node-id from props on
 798         * device_add as expected, so we have to fix it up here */
 799        if (slot->props.has_node_id) {
 800            object_property_set_int(OBJECT(dev), slot->props.node_id,
 801                                    "node-id", errp);
 802        }
 803    } else if (node_id != slot->props.node_id) {
 804        error_setg(errp, "invalid node-id, must be %"PRId64,
 805                   slot->props.node_id);
 806    }
 807}
 808
 809static void numa_stat_memory_devices(NumaNodeMem node_mem[])
 810{
 811    MemoryDeviceInfoList *info_list = qmp_memory_device_list();
 812    MemoryDeviceInfoList *info;
 813    PCDIMMDeviceInfo     *pcdimm_info;
 814    VirtioPMEMDeviceInfo *vpi;
 815
 816    for (info = info_list; info; info = info->next) {
 817        MemoryDeviceInfo *value = info->value;
 818
 819        if (value) {
 820            switch (value->type) {
 821            case MEMORY_DEVICE_INFO_KIND_DIMM:
 822            case MEMORY_DEVICE_INFO_KIND_NVDIMM:
 823                pcdimm_info = value->type == MEMORY_DEVICE_INFO_KIND_DIMM ?
 824                              value->u.dimm.data : value->u.nvdimm.data;
 825                node_mem[pcdimm_info->node].node_mem += pcdimm_info->size;
 826                node_mem[pcdimm_info->node].node_plugged_mem +=
 827                    pcdimm_info->size;
 828                break;
 829            case MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM:
 830                vpi = value->u.virtio_pmem.data;
 831                /* TODO: once we support numa, assign to right node */
 832                node_mem[0].node_mem += vpi->size;
 833                node_mem[0].node_plugged_mem += vpi->size;
 834                break;
 835            default:
 836                g_assert_not_reached();
 837            }
 838        }
 839    }
 840    qapi_free_MemoryDeviceInfoList(info_list);
 841}
 842
 843void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms)
 844{
 845    int i;
 846
 847    if (ms->numa_state == NULL || ms->numa_state->num_nodes <= 0) {
 848        return;
 849    }
 850
 851    numa_stat_memory_devices(node_mem);
 852    for (i = 0; i < ms->numa_state->num_nodes; i++) {
 853        node_mem[i].node_mem += ms->numa_state->nodes[i].node_mem;
 854    }
 855}
 856
 857void ram_block_notifier_add(RAMBlockNotifier *n)
 858{
 859    QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next);
 860}
 861
 862void ram_block_notifier_remove(RAMBlockNotifier *n)
 863{
 864    QLIST_REMOVE(n, next);
 865}
 866
 867void ram_block_notify_add(void *host, size_t size)
 868{
 869    RAMBlockNotifier *notifier;
 870
 871    QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
 872        notifier->ram_block_added(notifier, host, size);
 873    }
 874}
 875
 876void ram_block_notify_remove(void *host, size_t size)
 877{
 878    RAMBlockNotifier *notifier;
 879
 880    QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
 881        notifier->ram_block_removed(notifier, host, size);
 882    }
 883}
 884