1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20
21#include "qemu-common.h"
22#include "migration/migration.h"
23#include "migration/postcopy-ram.h"
24#include "sysemu/sysemu.h"
25#include "sysemu/balloon.h"
26#include "qemu/error-report.h"
27#include "trace.h"
28
29
30
31
32#define MAX_DISCARDS_PER_COMMAND 12
33
/*
 * State used to batch up page-discard ranges for one RAMBlock before
 * sending them to the destination via
 * qemu_savevm_send_postcopy_ram_discard().
 */
struct PostcopyDiscardState {
    const char *ramblock_name;  /* Name of the RAMBlock being discarded */
    uint64_t offset;            /* Base offset of this block; subtracted from
                                 * each 'start' in postcopy_discard_send_range()
                                 * — presumably a bitmap offset, TODO confirm */
    uint16_t cur_entry;         /* Next free slot in start/length_list */
    /*
     * Pending discard ranges, already shifted to byte units
     * (see postcopy_discard_send_range()).
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;    /* Total number of ranges queued */
    unsigned int nsentcmds;     /* Number of discard commands actually sent */
};
46
47
48
49
50
51#if defined(__linux__)
52
53#include <poll.h>
54#include <sys/ioctl.h>
55#include <sys/syscall.h>
56#include <asm/types.h>
57#endif
58
59#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
60#include <sys/eventfd.h>
61#include <linux/userfaultfd.h>
62
63static bool ufd_version_check(int ufd)
64{
65 struct uffdio_api api_struct;
66 uint64_t ioctl_mask;
67
68 api_struct.api = UFFD_API;
69 api_struct.features = 0;
70 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
71 error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
72 strerror(errno));
73 return false;
74 }
75
76 ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
77 (__u64)1 << _UFFDIO_UNREGISTER;
78 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
79 error_report("Missing userfault features: %" PRIx64,
80 (uint64_t)(~api_struct.ioctls & ioctl_mask));
81 return false;
82 }
83
84 return true;
85}
86
87
88
89
90
91
92bool postcopy_ram_supported_by_host(void)
93{
94 long pagesize = getpagesize();
95 int ufd = -1;
96 bool ret = false;
97 void *testarea = NULL;
98 struct uffdio_register reg_struct;
99 struct uffdio_range range_struct;
100 uint64_t feature_mask;
101
102 if ((1ul << qemu_target_page_bits()) > pagesize) {
103 error_report("Target page size bigger than host page size");
104 goto out;
105 }
106
107 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
108 if (ufd == -1) {
109 error_report("%s: userfaultfd not available: %s", __func__,
110 strerror(errno));
111 goto out;
112 }
113
114
115 if (!ufd_version_check(ufd)) {
116 goto out;
117 }
118
119
120
121
122
123 if (munlockall()) {
124 error_report("%s: munlockall: %s", __func__, strerror(errno));
125 return -1;
126 }
127
128
129
130
131
132
133 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
134 MAP_ANONYMOUS, -1, 0);
135 if (testarea == MAP_FAILED) {
136 error_report("%s: Failed to map test area: %s", __func__,
137 strerror(errno));
138 goto out;
139 }
140 g_assert(((size_t)testarea & (pagesize-1)) == 0);
141
142 reg_struct.range.start = (uintptr_t)testarea;
143 reg_struct.range.len = pagesize;
144 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
145
146 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
147 error_report("%s userfault register: %s", __func__, strerror(errno));
148 goto out;
149 }
150
151 range_struct.start = (uintptr_t)testarea;
152 range_struct.len = pagesize;
153 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
154 error_report("%s userfault unregister: %s", __func__, strerror(errno));
155 goto out;
156 }
157
158 feature_mask = (__u64)1 << _UFFDIO_WAKE |
159 (__u64)1 << _UFFDIO_COPY |
160 (__u64)1 << _UFFDIO_ZEROPAGE;
161 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
162 error_report("Missing userfault map features: %" PRIx64,
163 (uint64_t)(~reg_struct.ioctls & feature_mask));
164 goto out;
165 }
166
167
168 ret = true;
169out:
170 if (testarea) {
171 munmap(testarea, pagesize);
172 }
173 if (ufd != -1) {
174 close(ufd);
175 }
176 return ret;
177}
178
179
180
181
182
183
184
185
186
187
188int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
189 size_t length)
190{
191 trace_postcopy_ram_discard_range(start, length);
192 if (madvise(start, length, MADV_DONTNEED)) {
193 error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno));
194 return -1;
195 }
196
197 return 0;
198}
199
200
201
202
203
204
205static int init_range(const char *block_name, void *host_addr,
206 ram_addr_t offset, ram_addr_t length, void *opaque)
207{
208 MigrationIncomingState *mis = opaque;
209
210 trace_postcopy_init_range(block_name, host_addr, offset, length);
211
212
213
214
215
216
217
218 if (postcopy_ram_discard_range(mis, host_addr, length)) {
219 return -1;
220 }
221
222 return 0;
223}
224
225
226
227
228
229static int cleanup_range(const char *block_name, void *host_addr,
230 ram_addr_t offset, ram_addr_t length, void *opaque)
231{
232 MigrationIncomingState *mis = opaque;
233 struct uffdio_range range_struct;
234 trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
235
236
237
238
239
240 qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
241
242
243
244
245
246
247 range_struct.start = (uintptr_t)host_addr;
248 range_struct.len = length;
249
250 if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
251 error_report("%s: userfault unregister %s", __func__, strerror(errno));
252
253 return -1;
254 }
255
256 return 0;
257}
258
259
260
261
262
263
264int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
265{
266 if (qemu_ram_foreach_block(init_range, mis)) {
267 return -1;
268 }
269
270 return 0;
271}
272
273
274
275
/*
 * Tear down incoming postcopy state: unregister RAM from userfault,
 * stop and join the fault thread, close the fds, optionally re-mlock,
 * notify the source we're done, and free the temp page.
 * Returns: 0 on success, -1 on failure.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        uint64_t tmp64;

        /* Must unregister before closing the fds below */
        if (qemu_ram_foreach_block(cleanup_range, mis)) {
            return -1;
        }

        /*
         * Poke the quit eventfd; the fault thread polls it and exits
         * its loop, at which point it is safe to join.
         */
        tmp64 = 1;
        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
            trace_postcopy_ram_incoming_cleanup_join();
            qemu_thread_join(&mis->fault_thread);
        } else {
            /* Not much we can do here, but may as well report it */
            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
                         strerror(errno));
        }
        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_quit_fd);
        mis->have_fault_thread = false;
    }

    /* Balloon can run again now that all of RAM is really present */
    qemu_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a
             * valid VM state; mlock failure is reported but non-fatal.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);
    /* Tell the source we've shut down (and whether we saw an error) */
    migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, getpagesize());
        mis->postcopy_tmp_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
327
328
329
330
/*
 * qemu_ram_foreach_block() callback: advise the kernel not to use
 * transparent hugepages over the range — presumably so that page faults
 * and page placement happen at small-page granularity during postcopy
 * (TODO confirm against userfaultfd requirements).
 * Always returns 0 (qemu_madvise failure is ignored here).
 */
static int nhp_range(const char *block_name, void *host_addr,
                     ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
345
346
347
348
349
350
351int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
352{
353 if (qemu_ram_foreach_block(nhp_range, mis)) {
354 return -1;
355 }
356
357 postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
358
359 return 0;
360}
361
362
363
364
365
366
367
368
369
370
371static int ram_block_enable_notify(const char *block_name, void *host_addr,
372 ram_addr_t offset, ram_addr_t length,
373 void *opaque)
374{
375 MigrationIncomingState *mis = opaque;
376 struct uffdio_register reg_struct;
377
378 reg_struct.range.start = (uintptr_t)host_addr;
379 reg_struct.range.len = length;
380 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
381
382
383 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
384 error_report("%s userfault register: %s", __func__, strerror(errno));
385 return -1;
386 }
387
388 return 0;
389}
390
391
392
393
/*
 * Thread which sits on the userfault fd: for each missing-page fault the
 * kernel reports, it asks the source (via the return path) to send that
 * host page.  Exits when the quit eventfd is poked.
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t hostpagesize = getpagesize();
    RAMBlock *rb = NULL;
    RAMBlock *last_rb = NULL; /* last RAMBlock we requested a page from */

    trace_postcopy_ram_fault_thread_entry();
    /* Unblock postcopy_ram_enable_notify(), which waits for this sem */
    qemu_sem_post(&mis->fault_thread_sem);

    while (true) {
        ram_addr_t rb_offset;
        struct pollfd pfd[2];

        /*
         * Poll both the userfault fd (new faults) and the quit eventfd
         * (shutdown request from postcopy_ram_incoming_cleanup()).
         */
        pfd[0].fd = mis->userfault_fd;
        pfd[0].events = POLLIN;
        pfd[0].revents = 0;
        pfd[1].fd = mis->userfault_quit_fd;
        pfd[1].events = POLLIN;
        pfd[1].revents = 0;

        if (poll(pfd, 2, -1 /* wait forever */) == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (pfd[1].revents) {
            /* The eventfd was poked: time to quit */
            trace_postcopy_ram_fault_thread_quit();
            break;
        }

        ret = read(mis->userfault_fd, &msg, sizeof(msg));
        if (ret != sizeof(msg)) {
            if (errno == EAGAIN) {
                /*
                 * The fd is non-blocking; nothing to read right now,
                 * go back to polling.
                 */
                continue;
            }
            if (ret < 0) {
                error_report("%s: Failed to read full userfault message: %s",
                             __func__, strerror(errno));
                break;
            } else {
                /* Short read: we've lost message alignment, give up */
                error_report("%s: Read %d bytes from userfaultfd expected %zd",
                             __func__, ret, sizeof(msg));
                break;
            }
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            /* Only pagefault events are expected from this fd */
            error_report("%s: Read unexpected event %ud from userfaultfd",
                         __func__, msg.event);
            continue;
        }

        /* Map the faulting host address back to a RAMBlock + offset */
        rb = qemu_ram_block_from_host(
                 (void *)(uintptr_t)msg.arg.pagefault.address,
                 true, &rb_offset);
        if (!rb) {
            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                         PRIx64, (uint64_t)msg.arg.pagefault.address);
            break;
        }

        /* Round down to a host-page boundary before requesting */
        rb_offset &= ~(hostpagesize - 1);
        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset);

        /*
         * Ask the source for the page; the block name only needs to be
         * sent when it differs from the previous request (wire-format
         * optimisation: NULL name means "same block as last time").
         */
        if (rb != last_rb) {
            last_rb = rb;
            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                     rb_offset, hostpagesize);
        } else {
            /* Save some space */
            migrate_send_rp_req_pages(mis, NULL,
                                     rb_offset, hostpagesize);
        }
    }
    trace_postcopy_ram_fault_thread_exit();
    return NULL;
}
488
489int postcopy_ram_enable_notify(MigrationIncomingState *mis)
490{
491
492 mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
493 if (mis->userfault_fd == -1) {
494 error_report("%s: Failed to open userfault fd: %s", __func__,
495 strerror(errno));
496 return -1;
497 }
498
499
500
501
502
503 if (!ufd_version_check(mis->userfault_fd)) {
504 return -1;
505 }
506
507
508 mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
509 if (mis->userfault_quit_fd == -1) {
510 error_report("%s: Opening userfault_quit_fd: %s", __func__,
511 strerror(errno));
512 close(mis->userfault_fd);
513 return -1;
514 }
515
516 qemu_sem_init(&mis->fault_thread_sem, 0);
517 qemu_thread_create(&mis->fault_thread, "postcopy/fault",
518 postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
519 qemu_sem_wait(&mis->fault_thread_sem);
520 qemu_sem_destroy(&mis->fault_thread_sem);
521 mis->have_fault_thread = true;
522
523
524 if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
525 return -1;
526 }
527
528
529
530
531
532 qemu_balloon_inhibit(true);
533
534 trace_postcopy_ram_enable_notify();
535
536 return 0;
537}
538
539
540
541
542
543int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
544{
545 struct uffdio_copy copy_struct;
546
547 copy_struct.dst = (uint64_t)(uintptr_t)host;
548 copy_struct.src = (uint64_t)(uintptr_t)from;
549 copy_struct.len = getpagesize();
550 copy_struct.mode = 0;
551
552
553
554
555
556
557 if (ioctl(mis->userfault_fd, UFFDIO_COPY, ©_struct)) {
558 int e = errno;
559 error_report("%s: %s copy host: %p from: %p",
560 __func__, strerror(e), host, from);
561
562 return -e;
563 }
564
565 trace_postcopy_place_page(host);
566 return 0;
567}
568
569
570
571
572
573int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
574{
575 struct uffdio_zeropage zero_struct;
576
577 zero_struct.range.start = (uint64_t)(uintptr_t)host;
578 zero_struct.range.len = getpagesize();
579 zero_struct.mode = 0;
580
581 if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
582 int e = errno;
583 error_report("%s: %s zero host: %p",
584 __func__, strerror(e), host);
585
586 return -e;
587 }
588
589 trace_postcopy_place_page_zero(host);
590 return 0;
591}
592
593
594
595
596
597
598
599
600
601void *postcopy_get_tmp_page(MigrationIncomingState *mis)
602{
603 if (!mis->postcopy_tmp_page) {
604 mis->postcopy_tmp_page = mmap(NULL, getpagesize(),
605 PROT_READ | PROT_WRITE, MAP_PRIVATE |
606 MAP_ANONYMOUS, -1, 0);
607 if (mis->postcopy_tmp_page == MAP_FAILED) {
608 mis->postcopy_tmp_page = NULL;
609 error_report("%s: %s", __func__, strerror(errno));
610 return NULL;
611 }
612 }
613
614 return mis->postcopy_tmp_page;
615}
616
617#else
618
/* Fallback when userfaultfd is unavailable: postcopy can never work here */
bool postcopy_ram_supported_by_host(void)
{
    error_report("%s: No OS support", __func__);
    return false;
}
624
/* Fallback stub: reports the lack of OS support and fails */
int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}
630
/* Unreachable: postcopy_ram_supported_by_host() returns false here */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
636
/* Unreachable: postcopy_ram_supported_by_host() returns false here */
int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
                               size_t length)
{
    assert(0);
    return -1;
}
643
/* Unreachable: postcopy_ram_supported_by_host() returns false here */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
649
/* Unreachable: postcopy_ram_supported_by_host() returns false here */
int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
655
/* Unreachable: postcopy_ram_supported_by_host() returns false here */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
{
    assert(0);
    return -1;
}
661
/* Unreachable: postcopy_ram_supported_by_host() returns false here */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
{
    assert(0);
    return -1;
}
667
/* Unreachable: postcopy_ram_supported_by_host() returns false here */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}
673
674#endif
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
690 unsigned long offset,
691 const char *name)
692{
693 PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
694
695 if (res) {
696 res->ramblock_name = name;
697 res->offset = offset;
698 }
699
700 return res;
701}
702
703
704
705
706
707
708
709
710
711
712
713void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
714 unsigned long start, unsigned long length)
715{
716 size_t tp_bits = qemu_target_page_bits();
717
718 pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits;
719 pds->length_list[pds->cur_entry] = length << tp_bits;
720 trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
721 pds->cur_entry++;
722 pds->nsentwords++;
723
724 if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
725
726 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
727 pds->ramblock_name,
728 pds->cur_entry,
729 pds->start_list,
730 pds->length_list);
731 pds->nsentcmds++;
732 pds->cur_entry = 0;
733 }
734}
735
736
737
738
739
740
741
742
743void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
744{
745
746 if (pds->cur_entry) {
747 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
748 pds->ramblock_name,
749 pds->cur_entry,
750 pds->start_list,
751 pds->length_list);
752 pds->nsentcmds++;
753 }
754
755 trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
756 pds->nsentcmds);
757
758 g_free(pds);
759}
760