1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20#include "exec/target_page.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "savevm.h"
24#include "postcopy-ram.h"
25#include "ram.h"
26#include "sysemu/sysemu.h"
27#include "sysemu/balloon.h"
28#include "qemu/error-report.h"
29#include "trace.h"
30
31
32
33
/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

/*
 * Accumulator for discard ranges belonging to one RAMBlock; ranges are
 * batched and flushed as a single postcopy-ram-discard command whenever
 * the arrays fill (see postcopy_discard_send_range/_finish).
 */
struct PostcopyDiscardState {
    const char *ramblock_name;   /* Not copied; must outlive this state */
    uint16_t cur_entry;          /* Index of next free slot in the lists */
    /*
     * Start addresses and lengths (both in bytes) of the pending discard
     * ranges, sent together in one command.
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;     /* Total ranges queued over the lifetime */
    unsigned int nsentcmds;      /* Number of discard commands sent */
};
47
48
49
50
51
52#if defined(__linux__)
53
54#include <poll.h>
55#include <sys/ioctl.h>
56#include <sys/syscall.h>
57#include <asm/types.h>
58#endif
59
60#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
61#include <sys/eventfd.h>
62#include <linux/userfaultfd.h>
63
64
65
66
67
68
69
70
71
72
73
74
75static bool receive_ufd_features(uint64_t *features)
76{
77 struct uffdio_api api_struct = {0};
78 int ufd;
79 bool ret = true;
80
81
82 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
83 if (ufd == -1) {
84 error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
85 strerror(errno));
86 return false;
87 }
88
89
90 api_struct.api = UFFD_API;
91 api_struct.features = 0;
92 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
93 error_report("%s: UFFDIO_API failed: %s", __func__,
94 strerror(errno));
95 ret = false;
96 goto release_ufd;
97 }
98
99 *features = api_struct.features;
100
101release_ufd:
102 close(ufd);
103 return ret;
104}
105
106
107
108
109
110
111
112
113
114
/*
 * Perform the UFFDIO_API handshake on @ufd, requesting @features, and
 * verify that the register/unregister ioctls are available.
 *
 * Returns: true if the handshake succeeded and the required ioctls are
 * present, false otherwise (with an error reported).
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t need;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* Postcopy cannot work without being able to (un)register ranges */
    need = (__u64)1 << _UFFDIO_REGISTER |
           (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & need) == need) {
        return true;
    }

    error_report("Missing userfault features: %" PRIx64,
                 (uint64_t)(~api_struct.ioctls & need));
    return false;
}
138
/*
 * Negotiate the userfaultfd API on @ufd and check the host can support
 * the current RAM layout (in particular hugepage-backed blocks).
 *
 * Returns: true if @ufd is usable for postcopy, false otherwise.
 */
static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    /* No optional features are requested yet */
    uint64_t asked_features = 0;
    /* Cached across calls: feature mask the kernel advertised first time */
    static uint64_t supported_features;

    /*
     * Probe the kernel's advertised features once, on a scratch fd;
     * UFFDIO_API can only be issued once per fd, so don't burn @ufd.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

    /* Handshake @ufd itself, even though we ask for no extra features */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    /*
     * If any RAMBlock uses a page size other than the host page size
     * (i.e. hugepages are in play), userfault must support faulting
     * on hugetlbfs areas.
     */
    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;

#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
180
181
182
183static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
184 ram_addr_t offset, ram_addr_t length, void *opaque)
185{
186 RAMBlock *rb = qemu_ram_block_by_name(block_name);
187 size_t pagesize = qemu_ram_pagesize(rb);
188
189 if (qemu_ram_is_shared(rb)) {
190 error_report("Postcopy on shared RAM (%s) is not yet supported",
191 block_name);
192 return 1;
193 }
194
195 if (length % pagesize) {
196 error_report("Postcopy requires RAM blocks to be a page size multiple,"
197 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
198 "page size of 0x%zx", block_name, length, pagesize);
199 return 1;
200 }
201 return 0;
202}
203
204
205
206
207
208
209bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
210{
211 long pagesize = getpagesize();
212 int ufd = -1;
213 bool ret = false;
214 void *testarea = NULL;
215 struct uffdio_register reg_struct;
216 struct uffdio_range range_struct;
217 uint64_t feature_mask;
218
219 if (qemu_target_page_size() > pagesize) {
220 error_report("Target page size bigger than host page size");
221 goto out;
222 }
223
224 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
225 if (ufd == -1) {
226 error_report("%s: userfaultfd not available: %s", __func__,
227 strerror(errno));
228 goto out;
229 }
230
231
232 if (!ufd_check_and_apply(ufd, mis)) {
233 goto out;
234 }
235
236
237 if (qemu_ram_foreach_block(test_ramblock_postcopiable, NULL)) {
238 goto out;
239 }
240
241
242
243
244
245 if (munlockall()) {
246 error_report("%s: munlockall: %s", __func__, strerror(errno));
247 return -1;
248 }
249
250
251
252
253
254
255 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
256 MAP_ANONYMOUS, -1, 0);
257 if (testarea == MAP_FAILED) {
258 error_report("%s: Failed to map test area: %s", __func__,
259 strerror(errno));
260 goto out;
261 }
262 g_assert(((size_t)testarea & (pagesize-1)) == 0);
263
264 reg_struct.range.start = (uintptr_t)testarea;
265 reg_struct.range.len = pagesize;
266 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
267
268 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
269 error_report("%s userfault register: %s", __func__, strerror(errno));
270 goto out;
271 }
272
273 range_struct.start = (uintptr_t)testarea;
274 range_struct.len = pagesize;
275 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
276 error_report("%s userfault unregister: %s", __func__, strerror(errno));
277 goto out;
278 }
279
280 feature_mask = (__u64)1 << _UFFDIO_WAKE |
281 (__u64)1 << _UFFDIO_COPY |
282 (__u64)1 << _UFFDIO_ZEROPAGE;
283 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
284 error_report("Missing userfault map features: %" PRIx64,
285 (uint64_t)(~reg_struct.ioctls & feature_mask));
286 goto out;
287 }
288
289
290 ret = true;
291out:
292 if (testarea) {
293 munmap(testarea, pagesize);
294 }
295 if (ufd != -1) {
296 close(ufd);
297 }
298 return ret;
299}
300
301
302
303
304
305
/*
 * Callback from qemu_ram_foreach_block: discard the entire contents of
 * a RAMBlock so its pages fault in on demand during postcopy.
 *
 * Returns: 0 on success, -1 to abort the walk.
 */
static int init_range(const char *block_name, void *host_addr,
                      ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * RAM must be truly empty for postcopy: anything written during
     * init (ROM contents, tables) must go so that a fault is taken
     * when the page is first touched.  The discard is addressed
     * relative to the block, hence start 0 rather than 'offset'.
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}
323
324
325
326
327
/*
 * Callback from qemu_ram_foreach_block at the end of postcopy: undo the
 * per-block setup (re-enable THP, unregister from userfault).
 *
 * @opaque: the MigrationIncomingState holding the userfault fd.
 * Returns: 0 on success, -1 on failure.
 */
static int cleanup_range(const char *block_name, void *host_addr,
                         ram_addr_t offset, ram_addr_t length, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * Transparent hugepages were disabled for the postcopy phase (see
     * nhp_range); now that all pages are in place they can be allowed
     * again.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * Unregister the range from userfault; every page should have been
     * received by now, so no further faults are expected.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}
357
358
359
360
361
362
363int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
364{
365 if (qemu_ram_foreach_block(init_range, NULL)) {
366 return -1;
367 }
368
369 return 0;
370}
371
372
373
374
/*
 * Tear down incoming postcopy state at the end of migration: stop the
 * fault thread, unregister userfault ranges, restore mlock if it was
 * requested, and release the temporary pages.
 *
 * Returns: 0 on success, -1 on failure.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        uint64_t tmp64;

        /* Re-enable THP and unregister every RAMBlock from userfault */
        if (qemu_ram_foreach_block(cleanup_range, mis)) {
            return -1;
        }

        /*
         * Tell the fault thread to quit by bumping the quit eventfd it
         * polls alongside the userfault fd; eventfd writes are always
         * 8 bytes.
         */
        tmp64 = 1;
        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
            trace_postcopy_ram_incoming_cleanup_join();
            qemu_thread_join(&mis->fault_thread);
        } else {
            /* Not much we can do if the wakeup write fails; report it */
            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
                         strerror(errno));
        }
        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_quit_fd);
        mis->have_fault_thread = false;
    }

    /* Ballooning was inhibited during postcopy; allow it again */
    qemu_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * Deliberately not fatal: the VM state is valid at this
             * point even if re-locking memory failed.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
429
430
431
432
/*
 * Callback from qemu_ram_foreach_block: disable transparent hugepages
 * on a RAMBlock so postcopy can place individual target pages.
 *
 * Returns: always 0 (the madvise is advisory/best-effort).
 */
static int nhp_range(const char *block_name, void *host_addr,
                     ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Discards must really remove pages: if THP were to back the area
     * with a hugepage, a single received page could not be placed on
     * its own, so force hugepages off before discarding.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
447
448
449
450
451
452
453int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
454{
455 if (qemu_ram_foreach_block(nhp_range, mis)) {
456 return -1;
457 }
458
459 postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
460
461 return 0;
462}
463
464
465
466
467
468
469
470
471
472
473static int ram_block_enable_notify(const char *block_name, void *host_addr,
474 ram_addr_t offset, ram_addr_t length,
475 void *opaque)
476{
477 MigrationIncomingState *mis = opaque;
478 struct uffdio_register reg_struct;
479
480 reg_struct.range.start = (uintptr_t)host_addr;
481 reg_struct.range.len = length;
482 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
483
484
485 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
486 error_report("%s userfault register: %s", __func__, strerror(errno));
487 return -1;
488 }
489 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
490 error_report("%s userfault: Region doesn't support COPY", __func__);
491 return -1;
492 }
493
494 return 0;
495}
496
497
498
499
500static void *postcopy_ram_fault_thread(void *opaque)
501{
502 MigrationIncomingState *mis = opaque;
503 struct uffd_msg msg;
504 int ret;
505 RAMBlock *rb = NULL;
506 RAMBlock *last_rb = NULL;
507
508 trace_postcopy_ram_fault_thread_entry();
509 qemu_sem_post(&mis->fault_thread_sem);
510
511 while (true) {
512 ram_addr_t rb_offset;
513 struct pollfd pfd[2];
514
515
516
517
518
519
520 pfd[0].fd = mis->userfault_fd;
521 pfd[0].events = POLLIN;
522 pfd[0].revents = 0;
523 pfd[1].fd = mis->userfault_quit_fd;
524 pfd[1].events = POLLIN;
525 pfd[1].revents = 0;
526
527 if (poll(pfd, 2, -1 ) == -1) {
528 error_report("%s: userfault poll: %s", __func__, strerror(errno));
529 break;
530 }
531
532 if (pfd[1].revents) {
533 trace_postcopy_ram_fault_thread_quit();
534 break;
535 }
536
537 ret = read(mis->userfault_fd, &msg, sizeof(msg));
538 if (ret != sizeof(msg)) {
539 if (errno == EAGAIN) {
540
541
542
543
544 continue;
545 }
546 if (ret < 0) {
547 error_report("%s: Failed to read full userfault message: %s",
548 __func__, strerror(errno));
549 break;
550 } else {
551 error_report("%s: Read %d bytes from userfaultfd expected %zd",
552 __func__, ret, sizeof(msg));
553 break;
554 }
555 }
556 if (msg.event != UFFD_EVENT_PAGEFAULT) {
557 error_report("%s: Read unexpected event %ud from userfaultfd",
558 __func__, msg.event);
559 continue;
560 }
561
562 rb = qemu_ram_block_from_host(
563 (void *)(uintptr_t)msg.arg.pagefault.address,
564 true, &rb_offset);
565 if (!rb) {
566 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
567 PRIx64, (uint64_t)msg.arg.pagefault.address);
568 break;
569 }
570
571 rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
572 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
573 qemu_ram_get_idstr(rb),
574 rb_offset);
575
576
577
578
579
580 if (rb != last_rb) {
581 last_rb = rb;
582 migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
583 rb_offset, qemu_ram_pagesize(rb));
584 } else {
585
586 migrate_send_rp_req_pages(mis, NULL,
587 rb_offset, qemu_ram_pagesize(rb));
588 }
589 }
590 trace_postcopy_ram_fault_thread_exit();
591 return NULL;
592}
593
594int postcopy_ram_enable_notify(MigrationIncomingState *mis)
595{
596
597 mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
598 if (mis->userfault_fd == -1) {
599 error_report("%s: Failed to open userfault fd: %s", __func__,
600 strerror(errno));
601 return -1;
602 }
603
604
605
606
607
608 if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
609 return -1;
610 }
611
612
613 mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
614 if (mis->userfault_quit_fd == -1) {
615 error_report("%s: Opening userfault_quit_fd: %s", __func__,
616 strerror(errno));
617 close(mis->userfault_fd);
618 return -1;
619 }
620
621 qemu_sem_init(&mis->fault_thread_sem, 0);
622 qemu_thread_create(&mis->fault_thread, "postcopy/fault",
623 postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
624 qemu_sem_wait(&mis->fault_thread_sem);
625 qemu_sem_destroy(&mis->fault_thread_sem);
626 mis->have_fault_thread = true;
627
628
629 if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
630 return -1;
631 }
632
633
634
635
636
637 qemu_balloon_inhibit(true);
638
639 trace_postcopy_ram_enable_notify();
640
641 return 0;
642}
643
644static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
645 void *from_addr, uint64_t pagesize, RAMBlock *rb)
646{
647 int ret;
648 if (from_addr) {
649 struct uffdio_copy copy_struct;
650 copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
651 copy_struct.src = (uint64_t)(uintptr_t)from_addr;
652 copy_struct.len = pagesize;
653 copy_struct.mode = 0;
654 ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct);
655 } else {
656 struct uffdio_zeropage zero_struct;
657 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
658 zero_struct.range.len = pagesize;
659 zero_struct.mode = 0;
660 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
661 }
662 if (!ret) {
663 ramblock_recv_bitmap_set_range(rb, host_addr,
664 pagesize / qemu_target_page_size());
665 }
666 return ret;
667}
668
669
670
671
672
673int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
674 RAMBlock *rb)
675{
676 size_t pagesize = qemu_ram_pagesize(rb);
677
678
679
680
681
682
683 if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
684 int e = errno;
685 error_report("%s: %s copy host: %p from: %p (size: %zd)",
686 __func__, strerror(e), host, from, pagesize);
687
688 return -e;
689 }
690
691 trace_postcopy_place_page(host);
692 return 0;
693}
694
695
696
697
698
/*
 * Place a zero page at @host in RAMBlock @rb.
 *
 * When the block's page size equals the host page size the kernel's
 * UFFDIO_ZEROPAGE is used directly; hugepages have no zeropage ioctl,
 * so a lazily allocated zero-filled buffer is copied in instead.
 *
 * Returns: 0 on success, -errno on failure.
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    trace_postcopy_place_page_zero(host);

    if (qemu_ram_pagesize(rb) == getpagesize()) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, getpagesize(),
                                rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
    } else {
        /* Hugepage path: UFFDIO_ZEROPAGE isn't available for hugepages */
        if (!mis->postcopy_tmp_zero_page) {
            /* Allocate (once) a zeroed buffer of the largest page size */
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   rb);
    }

    return 0;
}
735
736
737
738
739
740
741
742
743
744void *postcopy_get_tmp_page(MigrationIncomingState *mis)
745{
746 if (!mis->postcopy_tmp_page) {
747 mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
748 PROT_READ | PROT_WRITE, MAP_PRIVATE |
749 MAP_ANONYMOUS, -1, 0);
750 if (mis->postcopy_tmp_page == MAP_FAILED) {
751 mis->postcopy_tmp_page = NULL;
752 error_report("%s: %s", __func__, strerror(errno));
753 return NULL;
754 }
755 }
756
757 return mis->postcopy_tmp_page;
758}
759
760#else
761
/* Stub for hosts without userfaultfd: postcopy is simply unavailable. */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}
767
/* Stub for hosts without userfaultfd: always fails. */
int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}
773
/* Stub: unreachable, since postcopy can never have started on this host. */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
779
/* Stub: unreachable, since postcopy can never have started on this host. */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
785
/* Stub: unreachable, since postcopy can never have started on this host. */
int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
791
/* Stub: unreachable, since postcopy can never have started on this host. */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}
798
/* Stub: unreachable, since postcopy can never have started on this host. */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}
805
/* Stub: unreachable, since postcopy can never have started on this host. */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}
811
812#endif
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
828 const char *name)
829{
830 PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
831
832 if (res) {
833 res->ramblock_name = name;
834 }
835
836 return res;
837}
838
839
840
841
842
843
844
845
846
847
848
849void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
850 unsigned long start, unsigned long length)
851{
852 size_t tp_size = qemu_target_page_size();
853
854 pds->start_list[pds->cur_entry] = start * tp_size;
855 pds->length_list[pds->cur_entry] = length * tp_size;
856 trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
857 pds->cur_entry++;
858 pds->nsentwords++;
859
860 if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
861
862 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
863 pds->ramblock_name,
864 pds->cur_entry,
865 pds->start_list,
866 pds->length_list);
867 pds->nsentcmds++;
868 pds->cur_entry = 0;
869 }
870}
871
872
873
874
875
876
877
878
879void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
880{
881
882 if (pds->cur_entry) {
883 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
884 pds->ramblock_name,
885 pds->cur_entry,
886 pds->start_list,
887 pds->length_list);
888 pds->nsentcmds++;
889 }
890
891 trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
892 pds->nsentcmds);
893
894 g_free(pds);
895}
896
897
898
899
900
901
/* Incoming postcopy state machine; shared between threads, accessed atomically */
static PostcopyState incoming_postcopy_state;
903
/* Read the current incoming postcopy state (with a full memory barrier). */
PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}
908
909
/* Atomically set the incoming postcopy state; returns the previous state. */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}
914