1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25#include "qemu/osdep.h"
26#include "sysemu/hostmem.h"
27#include "sysemu/numa.h"
28#include "sysemu/sysemu.h"
29#include "exec/cpu-common.h"
30#include "exec/ramlist.h"
31#include "qemu/bitmap.h"
32#include "qemu/error-report.h"
33#include "qapi/error.h"
34#include "qapi/opts-visitor.h"
35#include "qapi/qapi-visit-machine.h"
36#include "sysemu/qtest.h"
37#include "hw/core/cpu.h"
38#include "hw/mem/pc-dimm.h"
39#include "migration/vmstate.h"
40#include "hw/boards.h"
41#include "hw/mem/memory-device.h"
42#include "qemu/option.h"
43#include "qemu/config-file.h"
44#include "qemu/cutils.h"
45
46QemuOptsList qemu_numa_opts = {
47 .name = "numa",
48 .implied_opt_name = "type",
49 .head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head),
50 .desc = { { 0 } }
51};
52
/* Becomes non-zero once a NUMA node has been configured with memdev=. */
static int have_memdevs;

/* Becomes non-zero once a NUMA node has been configured with mem=. */
static int have_mem;

/*
 * Highest NUMA node ID accepted so far, plus one; stays 0 until the first
 * node is parsed.
 */
static int max_numa_nodeid;
56
57
58
59static void parse_numa_node(MachineState *ms, NumaNodeOptions *node,
60 Error **errp)
61{
62 Error *err = NULL;
63 uint16_t nodenr;
64 uint16List *cpus = NULL;
65 MachineClass *mc = MACHINE_GET_CLASS(ms);
66 unsigned int max_cpus = ms->smp.max_cpus;
67 NodeInfo *numa_info = ms->numa_state->nodes;
68
69 if (node->has_nodeid) {
70 nodenr = node->nodeid;
71 } else {
72 nodenr = ms->numa_state->num_nodes;
73 }
74
75 if (nodenr >= MAX_NODES) {
76 error_setg(errp, "Max number of NUMA nodes reached: %"
77 PRIu16 "", nodenr);
78 return;
79 }
80
81 if (numa_info[nodenr].present) {
82 error_setg(errp, "Duplicate NUMA nodeid: %" PRIu16, nodenr);
83 return;
84 }
85
86 for (cpus = node->cpus; cpus; cpus = cpus->next) {
87 CpuInstanceProperties props;
88 if (cpus->value >= max_cpus) {
89 error_setg(errp,
90 "CPU index (%" PRIu16 ")"
91 " should be smaller than maxcpus (%d)",
92 cpus->value, max_cpus);
93 return;
94 }
95 props = mc->cpu_index_to_instance_props(ms, cpus->value);
96 props.node_id = nodenr;
97 props.has_node_id = true;
98 machine_set_cpu_numa_node(ms, &props, &err);
99 if (err) {
100 error_propagate(errp, err);
101 return;
102 }
103 }
104
105 have_memdevs = have_memdevs ? : node->has_memdev;
106 have_mem = have_mem ? : node->has_mem;
107 if ((node->has_mem && have_memdevs) || (node->has_memdev && have_mem)) {
108 error_setg(errp, "numa configuration should use either mem= or memdev=,"
109 "mixing both is not allowed");
110 return;
111 }
112
113 if (node->has_mem) {
114 numa_info[nodenr].node_mem = node->mem;
115 if (!qtest_enabled()) {
116 warn_report("Parameter -numa node,mem is deprecated,"
117 " use -numa node,memdev instead");
118 }
119 }
120 if (node->has_memdev) {
121 Object *o;
122 o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL);
123 if (!o) {
124 error_setg(errp, "memdev=%s is ambiguous", node->memdev);
125 return;
126 }
127
128 object_ref(o);
129 numa_info[nodenr].node_mem = object_property_get_uint(o, "size", NULL);
130 numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
131 }
132 numa_info[nodenr].present = true;
133 max_numa_nodeid = MAX(max_numa_nodeid, nodenr + 1);
134 ms->numa_state->num_nodes++;
135}
136
137static
138void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp)
139{
140 uint16_t src = dist->src;
141 uint16_t dst = dist->dst;
142 uint8_t val = dist->val;
143 NodeInfo *numa_info = ms->numa_state->nodes;
144
145 if (src >= MAX_NODES || dst >= MAX_NODES) {
146 error_setg(errp, "Parameter '%s' expects an integer between 0 and %d",
147 src >= MAX_NODES ? "src" : "dst", MAX_NODES - 1);
148 return;
149 }
150
151 if (!numa_info[src].present || !numa_info[dst].present) {
152 error_setg(errp, "Source/Destination NUMA node is missing. "
153 "Please use '-numa node' option to declare it first.");
154 return;
155 }
156
157 if (val < NUMA_DISTANCE_MIN) {
158 error_setg(errp, "NUMA distance (%" PRIu8 ") is invalid, "
159 "it shouldn't be less than %d.",
160 val, NUMA_DISTANCE_MIN);
161 return;
162 }
163
164 if (src == dst && val != NUMA_DISTANCE_MIN) {
165 error_setg(errp, "Local distance of node %d should be %d.",
166 src, NUMA_DISTANCE_MIN);
167 return;
168 }
169
170 numa_info[src].distance[dst] = val;
171 ms->numa_state->have_numa_distance = true;
172}
173
174void set_numa_options(MachineState *ms, NumaOptions *object, Error **errp)
175{
176 Error *err = NULL;
177
178 if (!ms->numa_state) {
179 error_setg(errp, "NUMA is not supported by this machine-type");
180 goto end;
181 }
182
183 switch (object->type) {
184 case NUMA_OPTIONS_TYPE_NODE:
185 parse_numa_node(ms, &object->u.node, &err);
186 if (err) {
187 goto end;
188 }
189 break;
190 case NUMA_OPTIONS_TYPE_DIST:
191 parse_numa_distance(ms, &object->u.dist, &err);
192 if (err) {
193 goto end;
194 }
195 break;
196 case NUMA_OPTIONS_TYPE_CPU:
197 if (!object->u.cpu.has_node_id) {
198 error_setg(&err, "Missing mandatory node-id property");
199 goto end;
200 }
201 if (!ms->numa_state->nodes[object->u.cpu.node_id].present) {
202 error_setg(&err, "Invalid node-id=%" PRId64 ", NUMA node must be "
203 "defined with -numa node,nodeid=ID before it's used with "
204 "-numa cpu,node-id=ID", object->u.cpu.node_id);
205 goto end;
206 }
207
208 machine_set_cpu_numa_node(ms, qapi_NumaCpuOptions_base(&object->u.cpu),
209 &err);
210 break;
211 default:
212 abort();
213 }
214
215end:
216 error_propagate(errp, err);
217}
218
219static int parse_numa(void *opaque, QemuOpts *opts, Error **errp)
220{
221 NumaOptions *object = NULL;
222 MachineState *ms = MACHINE(opaque);
223 Error *err = NULL;
224 Visitor *v = opts_visitor_new(opts);
225
226 visit_type_NumaOptions(v, NULL, &object, &err);
227 visit_free(v);
228 if (err) {
229 goto end;
230 }
231
232
233 if ((object->type == NUMA_OPTIONS_TYPE_NODE) && object->u.node.has_mem) {
234 const char *mem_str = qemu_opt_get(opts, "mem");
235 qemu_strtosz_MiB(mem_str, NULL, &object->u.node.mem);
236 }
237
238 set_numa_options(ms, object, &err);
239
240end:
241 qapi_free_NumaOptions(object);
242 if (err) {
243 error_propagate(errp, err);
244 return -1;
245 }
246
247 return 0;
248}
249
250
251
252
253
254
255
256static void validate_numa_distance(MachineState *ms)
257{
258 int src, dst;
259 bool is_asymmetrical = false;
260 int nb_numa_nodes = ms->numa_state->num_nodes;
261 NodeInfo *numa_info = ms->numa_state->nodes;
262
263 for (src = 0; src < nb_numa_nodes; src++) {
264 for (dst = src; dst < nb_numa_nodes; dst++) {
265 if (numa_info[src].distance[dst] == 0 &&
266 numa_info[dst].distance[src] == 0) {
267 if (src != dst) {
268 error_report("The distance between node %d and %d is "
269 "missing, at least one distance value "
270 "between each nodes should be provided.",
271 src, dst);
272 exit(EXIT_FAILURE);
273 }
274 }
275
276 if (numa_info[src].distance[dst] != 0 &&
277 numa_info[dst].distance[src] != 0 &&
278 numa_info[src].distance[dst] !=
279 numa_info[dst].distance[src]) {
280 is_asymmetrical = true;
281 }
282 }
283 }
284
285 if (is_asymmetrical) {
286 for (src = 0; src < nb_numa_nodes; src++) {
287 for (dst = 0; dst < nb_numa_nodes; dst++) {
288 if (src != dst && numa_info[src].distance[dst] == 0) {
289 error_report("At least one asymmetrical pair of "
290 "distances is given, please provide distances "
291 "for both directions of all node pairs.");
292 exit(EXIT_FAILURE);
293 }
294 }
295 }
296 }
297}
298
299static void complete_init_numa_distance(MachineState *ms)
300{
301 int src, dst;
302 NodeInfo *numa_info = ms->numa_state->nodes;
303
304
305
306
307
308
309 for (src = 0; src < ms->numa_state->num_nodes; src++) {
310 for (dst = 0; dst < ms->numa_state->num_nodes; dst++) {
311 if (numa_info[src].distance[dst] == 0) {
312 if (src == dst) {
313 numa_info[src].distance[dst] = NUMA_DISTANCE_MIN;
314 } else {
315 numa_info[src].distance[dst] = numa_info[dst].distance[src];
316 }
317 }
318 }
319 }
320}
321
322void numa_legacy_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
323 int nb_nodes, ram_addr_t size)
324{
325 int i;
326 uint64_t usedmem = 0;
327
328
329
330
331
332 for (i = 0; i < nb_nodes - 1; i++) {
333 nodes[i].node_mem = (size / nb_nodes) &
334 ~((1 << mc->numa_mem_align_shift) - 1);
335 usedmem += nodes[i].node_mem;
336 }
337 nodes[i].node_mem = size - usedmem;
338}
339
340void numa_default_auto_assign_ram(MachineClass *mc, NodeInfo *nodes,
341 int nb_nodes, ram_addr_t size)
342{
343 int i;
344 uint64_t usedmem = 0, node_mem;
345 uint64_t granularity = size / nb_nodes;
346 uint64_t propagate = 0;
347
348 for (i = 0; i < nb_nodes - 1; i++) {
349 node_mem = (granularity + propagate) &
350 ~((1 << mc->numa_mem_align_shift) - 1);
351 propagate = granularity + propagate - node_mem;
352 nodes[i].node_mem = node_mem;
353 usedmem += node_mem;
354 }
355 nodes[i].node_mem = size - usedmem;
356}
357
358void numa_complete_configuration(MachineState *ms)
359{
360 int i;
361 MachineClass *mc = MACHINE_GET_CLASS(ms);
362 NodeInfo *numa_info = ms->numa_state->nodes;
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380 if (ms->numa_state->num_nodes == 0 &&
381 ((ms->ram_slots > 0 &&
382 mc->auto_enable_numa_with_memhp) ||
383 mc->auto_enable_numa)) {
384 NumaNodeOptions node = { };
385 parse_numa_node(ms, &node, &error_abort);
386 numa_info[0].node_mem = ram_size;
387 }
388
389 assert(max_numa_nodeid <= MAX_NODES);
390
391
392 for (i = max_numa_nodeid - 1; i >= 0; i--) {
393
394 if (!numa_info[i].present) {
395 error_report("numa: Node ID missing: %d", i);
396 exit(1);
397 }
398 }
399
400
401 assert(ms->numa_state->num_nodes == max_numa_nodeid);
402
403 if (ms->numa_state->num_nodes > 0) {
404 uint64_t numa_total;
405
406 if (ms->numa_state->num_nodes > MAX_NODES) {
407 ms->numa_state->num_nodes = MAX_NODES;
408 }
409
410
411
412
413 for (i = 0; i < ms->numa_state->num_nodes; i++) {
414 if (numa_info[i].node_mem != 0) {
415 break;
416 }
417 }
418 if (i == ms->numa_state->num_nodes) {
419 assert(mc->numa_auto_assign_ram);
420 mc->numa_auto_assign_ram(mc, numa_info,
421 ms->numa_state->num_nodes, ram_size);
422 if (!qtest_enabled()) {
423 warn_report("Default splitting of RAM between nodes is deprecated,"
424 " Use '-numa node,memdev' to explictly define RAM"
425 " allocation per node");
426 }
427 }
428
429 numa_total = 0;
430 for (i = 0; i < ms->numa_state->num_nodes; i++) {
431 numa_total += numa_info[i].node_mem;
432 }
433 if (numa_total != ram_size) {
434 error_report("total memory for NUMA nodes (0x%" PRIx64 ")"
435 " should equal RAM size (0x" RAM_ADDR_FMT ")",
436 numa_total, ram_size);
437 exit(1);
438 }
439
440
441
442
443
444
445
446
447
448
449
450
451
452 if (ms->numa_state->have_numa_distance) {
453
454 validate_numa_distance(ms);
455
456
457 complete_init_numa_distance(ms);
458 }
459 }
460}
461
462void parse_numa_opts(MachineState *ms)
463{
464 qemu_opts_foreach(qemu_find_opts("numa"), parse_numa, ms, &error_fatal);
465}
466
467void numa_cpu_pre_plug(const CPUArchId *slot, DeviceState *dev, Error **errp)
468{
469 int node_id = object_property_get_int(OBJECT(dev), "node-id", &error_abort);
470
471 if (node_id == CPU_UNSET_NUMA_NODE_ID) {
472
473
474 if (slot->props.has_node_id) {
475 object_property_set_int(OBJECT(dev), slot->props.node_id,
476 "node-id", errp);
477 }
478 } else if (node_id != slot->props.node_id) {
479 error_setg(errp, "invalid node-id, must be %"PRId64,
480 slot->props.node_id);
481 }
482}
483
484static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
485 const char *name,
486 uint64_t ram_size)
487{
488 if (mem_path) {
489#ifdef __linux__
490 Error *err = NULL;
491 memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, 0,
492 mem_path, &err);
493 if (err) {
494 error_report_err(err);
495 if (mem_prealloc) {
496 exit(1);
497 }
498 warn_report("falling back to regular RAM allocation");
499 error_printf("This is deprecated. Make sure that -mem-path "
500 " specified path has sufficient resources to allocate"
501 " -m specified RAM amount\n");
502
503
504
505 mem_path = NULL;
506 memory_region_init_ram_nomigrate(mr, owner, name, ram_size, &error_fatal);
507 }
508#else
509 fprintf(stderr, "-mem-path not supported on this host\n");
510 exit(1);
511#endif
512 } else {
513 memory_region_init_ram_nomigrate(mr, owner, name, ram_size, &error_fatal);
514 }
515 vmstate_register_ram_global(mr);
516}
517
518void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
519 const char *name,
520 uint64_t ram_size)
521{
522 uint64_t addr = 0;
523 int i;
524 MachineState *ms = MACHINE(qdev_get_machine());
525
526 if (ms->numa_state == NULL ||
527 ms->numa_state->num_nodes == 0 || !have_memdevs) {
528 allocate_system_memory_nonnuma(mr, owner, name, ram_size);
529 return;
530 }
531
532 memory_region_init(mr, owner, name, ram_size);
533 for (i = 0; i < ms->numa_state->num_nodes; i++) {
534 uint64_t size = ms->numa_state->nodes[i].node_mem;
535 HostMemoryBackend *backend = ms->numa_state->nodes[i].node_memdev;
536 if (!backend) {
537 continue;
538 }
539 MemoryRegion *seg = host_memory_backend_get_memory(backend);
540
541 if (memory_region_is_mapped(seg)) {
542 char *path = object_get_canonical_path_component(OBJECT(backend));
543 error_report("memory backend %s is used multiple times. Each "
544 "-numa option must use a different memdev value.",
545 path);
546 g_free(path);
547 exit(1);
548 }
549
550 host_memory_backend_set_mapped(backend, true);
551 memory_region_add_subregion(mr, addr, seg);
552 vmstate_register_ram_global(seg);
553 addr += size;
554 }
555}
556
557static void numa_stat_memory_devices(NumaNodeMem node_mem[])
558{
559 MemoryDeviceInfoList *info_list = qmp_memory_device_list();
560 MemoryDeviceInfoList *info;
561 PCDIMMDeviceInfo *pcdimm_info;
562 VirtioPMEMDeviceInfo *vpi;
563
564 for (info = info_list; info; info = info->next) {
565 MemoryDeviceInfo *value = info->value;
566
567 if (value) {
568 switch (value->type) {
569 case MEMORY_DEVICE_INFO_KIND_DIMM:
570 case MEMORY_DEVICE_INFO_KIND_NVDIMM:
571 pcdimm_info = value->type == MEMORY_DEVICE_INFO_KIND_DIMM ?
572 value->u.dimm.data : value->u.nvdimm.data;
573 node_mem[pcdimm_info->node].node_mem += pcdimm_info->size;
574 node_mem[pcdimm_info->node].node_plugged_mem +=
575 pcdimm_info->size;
576 break;
577 case MEMORY_DEVICE_INFO_KIND_VIRTIO_PMEM:
578 vpi = value->u.virtio_pmem.data;
579
580 node_mem[0].node_mem += vpi->size;
581 node_mem[0].node_plugged_mem += vpi->size;
582 break;
583 default:
584 g_assert_not_reached();
585 }
586 }
587 }
588 qapi_free_MemoryDeviceInfoList(info_list);
589}
590
591void query_numa_node_mem(NumaNodeMem node_mem[], MachineState *ms)
592{
593 int i;
594
595 if (ms->numa_state == NULL || ms->numa_state->num_nodes <= 0) {
596 return;
597 }
598
599 numa_stat_memory_devices(node_mem);
600 for (i = 0; i < ms->numa_state->num_nodes; i++) {
601 node_mem[i].node_mem += ms->numa_state->nodes[i].node_mem;
602 }
603}
604
605void ram_block_notifier_add(RAMBlockNotifier *n)
606{
607 QLIST_INSERT_HEAD(&ram_list.ramblock_notifiers, n, next);
608}
609
610void ram_block_notifier_remove(RAMBlockNotifier *n)
611{
612 QLIST_REMOVE(n, next);
613}
614
615void ram_block_notify_add(void *host, size_t size)
616{
617 RAMBlockNotifier *notifier;
618
619 QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
620 notifier->ram_block_added(notifier, host, size);
621 }
622}
623
624void ram_block_notify_remove(void *host, size_t size)
625{
626 RAMBlockNotifier *notifier;
627
628 QLIST_FOREACH(notifier, &ram_list.ramblock_notifiers, next) {
629 notifier->ram_block_removed(notifier, host, size);
630 }
631}
632