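/*
 * Postcopy migration for RAM
 *
 * Postcopy lets the destination start running before all of RAM has been
 * transferred: accesses to pages that have not yet arrived fault via
 * userfaultfd, and the pages are requested from the source on demand.
 */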
#include "qemu/osdep.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "sysemu/sysemu.h"
#include "sysemu/balloon.h"
#include "qemu/error-report.h"
#include "trace.h"

/*
 * Arbitrary limit on the number of discard ranges batched into a single
 * discard command; keeps each command's payload small.
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd);
}
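
/*
 * Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and request them; on Linux that is done with userfaultfd.
 */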
#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently blocked on a page fault */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for exit event, necessary for
     * releasing the whole blocktime_ctx
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;

static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    uint32List *list = NULL, *entry = NULL;
    int i;

    /* Build the list in reverse so it ends up in vCPU index order */
    for (i = smp_cpus - 1; i >= 0; i--) {
        entry = g_new0(uint32List, 1);
        entry->value = ctx->vcpu_blocktime[i];
        entry->next = list;
        list = entry;
    }

    return list;
}
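
/*
 * Populate the postcopy blocktime fields of MigrationInfo from the
 * destination's blocktime context; does nothing unless the
 * postcopy-blocktime capability was set.
 *
 * @info: pointer to MigrationInfo to populate
 */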
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}

static uint32_t get_postcopy_total_blocktime(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return 0;
    }

    return bc->total_blocktime;
}
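
/**
 * receive_ufd_features: check userfault fd features, to request only supported
 * features in the future.
 *
 * @features: out parameter, set to uffdio_api.features as reported by the
 *            kernel on success
 *
 * Returns: true on success
 */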
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here __NR_userfaultfd should exist */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}
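
/**
 * request_ufd_features: this function should be called only once on a newly
 * opened ufd, subsequent calls will lead to error.
 *
 * @ufd: fd obtained from userfaultfd syscall
 * @features: bit mask, see UFFD_API_FEATURES
 *
 * Returns: true on success
 */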
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}

static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * Probe the supported feature set only once: UFFD_API may not be
     * requested twice on one fd, and the feature set is the same for
     * every fd anyway.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /*
         * The kernel supports thread IDs in fault messages; create the
         * blocktime context unless it already exists.
         */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request features even if asked_features is 0: the kernel expects
     * UFFD_API before UFFDIO_REGISTER on every userfault fd.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;

        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
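
/* Callback from postcopy_ram_supported_by_host block iterator. */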
static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
                                      ram_addr_t offset, ram_addr_t length,
                                      void *opaque)
{
    RAMBlock *rb = qemu_ram_block_by_name(block_name);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}
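
/*
 * Note: this has the side effect of munlock'ing all of RAM; that's normally
 * fine since if postcopy succeeds it gets turned back on at the end.
 */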
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    long pagesize = getpagesize();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    Error *local_err = NULL;

    if (qemu_target_page_size() > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
        error_report_err(local_err);
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }

    /* Check all RAM blocks are postcopiable */
    if (qemu_ram_foreach_migratable_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     * We need to check that the ops we need are supported on anon memory.
     * To do that we need to register a chunk and see the flags that
     * are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize - 1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}
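
/*
 * Set up an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */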
static int init_range(const char *block_name, void *host_addr,
                      ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}
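
/*
 * At the end of migration, undo the effects of init_range.
 * opaque should be the MIS.
 */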
static int cleanup_range(const char *block_name, void *host_addr,
                         ram_addr_t offset, ram_addr_t length, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled;
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}
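
/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 */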
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (qemu_ram_foreach_migratable_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}
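
/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */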
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Tell the fault thread to quit */
        atomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (qemu_ram_foreach_migratable_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    qemu_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point; we have a valid
             * VM state.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
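
/*
 * Disable huge pages on an area.
 */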
static int nhp_range(const char *block_name, void *host_addr,
                     ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
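
/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discarding;
 * leaving it until after precopy means that most of the precopy data can
 * still be transferred as huge pages.
 */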
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (qemu_ram_foreach_migratable_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
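
/*
 * Mark the given area of RAM as requiring notification to unwritten areas.
 * Used as a callback on qemu_ram_foreach_migratable_block.
 *   host_addr: base of area to mark
 *   offset: offset in the whole ram arena
 *   length: length of the section
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success.
 */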
static int ram_block_enable_notify(const char *block_name, void *host_addr,
                                   ram_addr_t offset, ram_addr_t length,
                                   void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)host_addr;
    reg_struct.range.len = length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        RAMBlock *rb = qemu_ram_block_by_name(block_name);
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}
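
/*
 * Callback from shared fault handlers to ask for a page: the page must be
 * specified by a RAMBlock and an offset in that rb.
 * Note: only for use by shared fault handlers (in the fault thread).
 */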
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}

static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}

static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}
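
/*
 * Called when a page fault occurs; tracks the start of vCPU blocking time.
 *
 * @addr: faulted host virtual address
 * @ptid: faulted process thread id
 * @rb: ramblock appropriate to addr
 */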
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        atomic_inc(&dc->smp_cpus_down);
    }

    atomic_xchg(&dc->last_begin, low_time_offset);
    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    atomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check the receive bitmap here rather than at the start of the
     * function: the page may have been placed (and the bitmap set by
     * qemu_ufd_copy_ioctl) in between, in which case undo the marking.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        atomic_xchg(&dc->vcpu_addr[cpu], 0);
        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        atomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
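
/*
 * Provides the calculated blocktime per vCPU and traces it; the total
 * blocktime (the interval during which *all* vCPUs were blocked) is
 * accumulated into dc->total_blocktime.
 *
 * Example with 3 vCPUs, where S marks a fault start and E the page arrival:
 *
 *      S1        E1           S1               E1
 * -----***********------------xxx***************------------------------> CPU1
 *
 *             S2                E2
 * ------------****************xxx---------------------------------------> CPU2
 *
 *                         S3            E3
 * ------------------------****xxx********-------------------------------> CPU3
 *
 * '*' marks per-vCPU blocktime; 'x' marks the overlap where every vCPU is
 * blocked, which is what counts towards total blocktime.
 *
 * @addr: host virtual address of the page that has just arrived
 */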
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Look up every vCPU blocked on this address and clear it; a linear
     * scan is simple but not optimal (a hash keyed by address would be
     * faster).
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        atomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * Count this interval towards total blocktime only if every
         * vCPU was blocked during it.
         */
        if (!vcpu_total_blocktime &&
            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }

        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    atomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - atomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}

static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}
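
/*
 * Handle faults detected by the userfault markings.
 */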
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN;
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA,
         * however we can be told to quit via userfault_event_fd which is
         * an eventfd
         */
        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * Possibly someone tells us that the return path is
             * broken already using the event. We should hold until
             * the channel is rebuilt.
             */
            if (postcopy_pause_fault_thread(mis)) {
                mis->last_rb = NULL;
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but not allowed to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %u from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                ret = migrate_send_rp_req_pages(mis,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            } else {
                /* Save some space */
                ret = migrate_send_rp_req_pages(mis,
                                                NULL,
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            }

            if (ret) {
                /* May be network failure, try to wait for recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    mis->last_rb = NULL;
                    goto retry;
                } else {
                    /* This is an unavoidable fault */
                    error_report("%s: migrate_send_rp_req_pages() got %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);

                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));

                        break;
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %u "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                }
            }
        }
    }
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (qemu_ram_foreach_migratable_block(ram_block_enable_notify, mis)) {
        return -1;
    }

    /*
     * Ballooning can mark pages as absent while we're postcopying
     * that would cause false userfaults.
     */
    qemu_balloon_inhibit(true);

    trace_postcopy_ram_enable_notify();

    return 0;
}

static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int ret;
    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
    }
    return ret;
}

int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}
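
/*
 * Place a host page (from) at (host) atomically.
 * Returns 0 on success.
 */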
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /*
     * The copy also acks to the kernel, waking the stalled thread up.
     * TODO: We can inhibit that ack and only do it if it was requested,
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}
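
/*
 * Place a zero page at (host) atomically.
 * Returns 0 on success.
 */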
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /*
     * Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE,
     * but it's not available for everything (e.g. hugetlb pages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   rb);
    }
}
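
/*
 * Returns a target page of memory that can be mapped at a later point in time
 * using postcopy_place_page.
 * The same address is used repeatedly; postcopy_place_page just takes the
 * backing page away and replaces it when we're done with it.
 * Returns: pointer to the allocated page.
 */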
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    if (!mis->postcopy_tmp_page) {
        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                      PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                      MAP_ANONYMOUS, -1, 0);
        if (mis->postcopy_tmp_page == MAP_FAILED) {
            mis->postcopy_tmp_page = NULL;
            error_report("%s: %s", __func__, strerror(errno));
            return NULL;
        }
    }

    return mis->postcopy_tmp_page;
}

#else
/* No target OS support, stubs just fail */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif
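
/* ------------------------------------------------------------------------- */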

void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wake up the fault_thread.  It's an eventfd that should currently
     * be at 0; we're going to increment it to 1.
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}
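
/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 *
 * Returns: a new PDS.
 */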
PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
                                                 const char *name)
{
    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));

    /* g_malloc0() aborts on allocation failure, so res is never NULL */
    res->ramblock_name = name;

    return res;
}
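
/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */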
void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
                                 unsigned long start, unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert to byte offsets within the RAM block */
    pds->start_list[pds->cur_entry] = start * tp_size;
    pds->length_list[pds->cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
    pds->cur_entry++;
    pds->nsentwords++;

    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
        pds->cur_entry = 0;
    }
}
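
/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 *   bitmap code. Sends any outstanding discard messages, frees the PDS.
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 */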
void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
{
    /* Anything unsent? */
    if (pds->cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
                                       pds->nsentcmds);

    g_free(pds);
}
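
/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */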
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}
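
/*
 * Register a handler for external shared memory postcopy;
 * called on the destination.
 */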
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}
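
/*
 * Unregister a handler for external shared memory postcopy.
 */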
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}