1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20#include "exec/target_page.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "savevm.h"
24#include "postcopy-ram.h"
25#include "ram.h"
26#include "qapi/error.h"
27#include "qemu/notify.h"
28#include "qemu/rcu.h"
29#include "sysemu/sysemu.h"
30#include "qemu/error-report.h"
31#include "trace.h"
32#include "hw/boards.h"
33
34
35
36
/*
 * Arbitrary limit on the number of (start, length) pairs batched into a
 * single discard command: 12 entries of 2x8 bytes keeps the payload small.
 */
#define MAX_DISCARDS_PER_COMMAND 12

/* Accumulates discard ranges for one RAMBlock until flushed to the stream. */
struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;          /* next free slot in the lists below */
    /*
     * Parallel arrays: start_list[i]/length_list[i] describe one discard
     * range, in bytes.
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;     /* total ranges recorded (for tracing) */
    unsigned int nsentcmds;      /* total discard commands sent (for tracing) */
};

/* Notifiers called at various points of the postcopy process. */
static NotifierWithReturnList postcopy_notifier_list;
52
/* Initialise the postcopy notifier list; call before adding notifiers. */
void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}
57
/* Register @nn to be invoked by postcopy_notify(). */
void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}
62
/* Unregister a notifier previously added with postcopy_add_notifier(). */
void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
67
68int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
69{
70 struct PostcopyNotifyData pnd;
71 pnd.reason = reason;
72 pnd.errp = errp;
73
74 return notifier_with_return_list_notify(&postcopy_notifier_list,
75 &pnd);
76}
77
78
79
80
81
82#if defined(__linux__)
83
84#include <poll.h>
85#include <sys/ioctl.h>
86#include <sys/syscall.h>
87#include <asm/types.h>
88#endif
89
90#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
91#include <sys/eventfd.h>
92#include <linux/userfaultfd.h>
93
/* Per-incoming-migration state for measuring vCPU blocked time. */
typedef struct PostcopyBlocktimeContext {
    /* time (offset from start_time) when each vCPU's page fault began */
    uint32_t *page_fault_vcpu_time;
    /* faulted page address per vCPU (0 = not currently blocked) */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;    /* time during which ALL vCPUs were blocked */
    /* accumulated blocked time per vCPU */
    uint32_t *vcpu_blocktime;
    /* time offset of the most recent fault begin */
    uint32_t last_begin;
    /* number of vCPUs currently suspended on a fault */
    int smp_cpus_down;
    uint64_t start_time;         /* QEMU_CLOCK_REALTIME ms at context creation */

    /*
     * Exit notifier: frees the whole blocktime context when QEMU
     * terminates (see migration_exit_cb()).
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;
114
/* Free a PostcopyBlocktimeContext and its per-vCPU arrays. */
static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}
122
/* Exit-notifier callback: release the blocktime context on QEMU exit. */
static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}
129
/*
 * Allocate a blocktime context sized for the machine's vCPU count and
 * register an exit notifier so it is freed at QEMU shutdown.
 */
static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    /* All blocktime values are stored as offsets from this baseline */
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}
144
/*
 * Build a uint32List of per-vCPU blocktime values.  Iterating backwards
 * while prepending yields a list in ascending vCPU-index order.
 */
static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    uint32List *list = NULL;
    int i;

    for (i = ms->smp.cpus - 1; i >= 0; i--) {
        QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
    }

    return list;
}
157
158
159
160
161
162
163
164
/*
 * Populate MigrationInfo with postcopy blocktime statistics.
 * Does nothing unless a blocktime context exists (i.e. the
 * postcopy-blocktime capability was active on this incoming side).
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}
179
180static uint32_t get_postcopy_total_blocktime(void)
181{
182 MigrationIncomingState *mis = migration_incoming_get_current();
183 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
184
185 if (!bc) {
186 return 0;
187 }
188
189 return bc->total_blocktime;
190}
191
192
193
194
195
196
197
198
199
200
201
/*
 * Query the kernel for the userfaultfd feature set by opening a throwaway
 * ufd and issuing UFFDIO_API with no features requested.
 *
 * @features: set to the kernel's advertised feature mask on success
 * Returns: true on success
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask features, with no feature bits requested */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}
232
233
234
235
236
237
238
239
240
241
/*
 * Enable the requested feature set on @ufd via UFFDIO_API, then verify the
 * kernel supports the REGISTER/UNREGISTER ioctls we depend on.
 *
 * @ufd: userfault file descriptor
 * @features: feature bits to request
 * Returns: true on success
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}
265
/*
 * Check the userfaultfd on @ufd supports what postcopy needs and enable the
 * optional features we use (currently UFFD_FEATURE_THREAD_ID for blocktime
 * tracking).  Also verifies huge-page support when any RAMBlock uses a
 * non-host page size.
 *
 * @mis: may be NULL on the probing path; blocktime context is only created
 *       when it is non-NULL.
 * Returns: true if the ufd is usable for postcopy
 */
static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * Probe the kernel's feature set only once per process; the result
     * cannot change while QEMU runs.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* Blocktime tracking needs the faulting thread id from the kernel */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request the features we need; this also finalises the API handshake
     * on this ufd.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    /* Any non-host-sized RAMBlock means we need hugepage fault support */
    if (qemu_real_host_page_size != ram_pagesize_summary()) {
        bool have_hp = false;

#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
320
321
322
323static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
324{
325 const char *block_name = qemu_ram_get_idstr(rb);
326 ram_addr_t length = qemu_ram_get_used_length(rb);
327 size_t pagesize = qemu_ram_pagesize(rb);
328
329 if (length % pagesize) {
330 error_report("Postcopy requires RAM blocks to be a page size multiple,"
331 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
332 "page size of 0x%zx", block_name, length, pagesize);
333 return 1;
334 }
335 return 0;
336}
337
338
339
340
341
342
343bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
344{
345 long pagesize = qemu_real_host_page_size;
346 int ufd = -1;
347 bool ret = false;
348 void *testarea = NULL;
349 struct uffdio_register reg_struct;
350 struct uffdio_range range_struct;
351 uint64_t feature_mask;
352 Error *local_err = NULL;
353
354 if (qemu_target_page_size() > pagesize) {
355 error_report("Target page size bigger than host page size");
356 goto out;
357 }
358
359 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
360 if (ufd == -1) {
361 error_report("%s: userfaultfd not available: %s", __func__,
362 strerror(errno));
363 goto out;
364 }
365
366
367 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
368 error_report_err(local_err);
369 goto out;
370 }
371
372
373 if (!ufd_check_and_apply(ufd, mis)) {
374 goto out;
375 }
376
377
378 if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
379 goto out;
380 }
381
382
383
384
385
386 if (munlockall()) {
387 error_report("%s: munlockall: %s", __func__, strerror(errno));
388 goto out;
389 }
390
391
392
393
394
395
396 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
397 MAP_ANONYMOUS, -1, 0);
398 if (testarea == MAP_FAILED) {
399 error_report("%s: Failed to map test area: %s", __func__,
400 strerror(errno));
401 goto out;
402 }
403 g_assert(((size_t)testarea & (pagesize - 1)) == 0);
404
405 reg_struct.range.start = (uintptr_t)testarea;
406 reg_struct.range.len = pagesize;
407 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
408
409 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
410 error_report("%s userfault register: %s", __func__, strerror(errno));
411 goto out;
412 }
413
414 range_struct.start = (uintptr_t)testarea;
415 range_struct.len = pagesize;
416 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
417 error_report("%s userfault unregister: %s", __func__, strerror(errno));
418 goto out;
419 }
420
421 feature_mask = (__u64)1 << _UFFDIO_WAKE |
422 (__u64)1 << _UFFDIO_COPY |
423 (__u64)1 << _UFFDIO_ZEROPAGE;
424 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
425 error_report("Missing userfault map features: %" PRIx64,
426 (uint64_t)(~reg_struct.ioctls & feature_mask));
427 goto out;
428 }
429
430
431 ret = true;
432out:
433 if (testarea) {
434 munmap(testarea, pagesize);
435 }
436 if (ufd != -1) {
437 close(ufd);
438 }
439 return ret;
440}
441
442
443
444
445
446
/*
 * Callback for foreach_not_ignored_block(): set up the incoming side of
 * one RAMBlock for postcopy by discarding its entire contents so every
 * page starts out "missing" and faults on first access.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zeroed;
     * discarding the full range achieves that.
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}
467
468
469
470
471
/*
 * Callback for foreach_not_ignored_block(): at the end of postcopy, undo
 * the per-RAMBlock postcopy setup — re-enable huge pages and unregister
 * the range from userfault.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled
     * (see nhp_range()); we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.   It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}
504
505
506
507
508
509
510int postcopy_ram_incoming_init(MigrationIncomingState *mis)
511{
512 if (foreach_not_ignored_block(init_range, NULL)) {
513 return -1;
514 }
515
516 return 0;
517}
518
519
520
521
/*
 * At the end of a migration where postcopy_ram_incoming_init was called:
 * stop the fault thread, unregister all RAM ranges, close the userfault
 * fds, optionally re-apply mlock, and release the temporary pages.
 * Returns 0 on success, -1 on failure.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Tell the fault thread to quit and wake it from its poll() */
        qatomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
574
575
576
577
/*
 * Callback for foreach_not_ignored_block(): disable transparent huge pages
 * on the range so that page faults happen at small-page granularity.
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
595
596
597
598
599
600
/*
 * Disable huge pages on all RAM and enter the DISCARD postcopy state,
 * ready for incoming discard commands.
 * Returns 0 on success, -1 on failure.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
611
612
613
614
615
616
617
618
619
620
621static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
622{
623 MigrationIncomingState *mis = opaque;
624 struct uffdio_register reg_struct;
625
626 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
627 reg_struct.range.len = qemu_ram_get_used_length(rb);
628 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
629
630
631 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
632 error_report("%s userfault register: %s", __func__, strerror(errno));
633 return -1;
634 }
635 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
636 error_report("%s userfault: Region doesn't support COPY", __func__);
637 return -1;
638 }
639 if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
640 qemu_ram_set_uf_zeroable(rb);
641 }
642
643 return 0;
644}
645
/*
 * Wake any thread blocked on a fault within the page containing
 * @client_addr, on the shared-memory userfault fd @pcfd.
 * Returns the ioctl result: 0 on success, non-zero on failure.
 */
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    /* Round down to the RAMBlock's page boundary */
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}
664
665
666
667
668
669
/*
 * Handle a fault from an external process on shared memory: if the page
 * has already been received, just wake the faulter; otherwise forward the
 * request to the migration source.
 * Returns 0 (or the postcopy_wake_shared() result on the already-received
 * path).
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    migrate_send_rp_req_pages(mis, rb, aligned_rbo, client_addr);
    return 0;
}
687
/*
 * Map a faulting thread id to a vCPU index by scanning the CPU list.
 * Returns the cpu_index, or -1 if @pid is not a vCPU thread (e.g. a
 * QEMU internal thread).
 */
static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}
701
702static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
703{
704 int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
705 dc->start_time;
706 return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
707}
708
709
710
711
712
713
714
715
716
/*
 * Called when a page fault occurs; records the start of a vCPU's blocked
 * interval for blocktime accounting.
 *
 * @addr: faulted host virtual address
 * @ptid: faulting thread id (0 if unknown)
 * @rb: RAMBlock containing @addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    /* Nothing to do without a context, or for non-thread faults */
    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        /* Fault came from a non-vCPU thread */
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        /* This vCPU was not blocked before: one more suspended vCPU */
        qatomic_inc(&dc->smp_cpus_down);
    }

    qatomic_xchg(&dc->last_begin, low_time_offset);
    qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    qatomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * The check is done here, not at the beginning of the function, so the
     * fault-placement race is resolved in our favour: if the page arrived
     * (bitmap set in qemu_ufd_copy_ioctl) between the fault and this point,
     * undo the bookkeeping we just did.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        qatomic_xchg(&dc->vcpu_addr[cpu], 0);
        qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        qatomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
/*
 * Called when a page is copied in; ends the blocked interval for every
 * vCPU that was waiting on @addr, accumulating per-vCPU blocktime, and
 * adds to total_blocktime when ALL vCPUs had been simultaneously blocked.
 *
 * @addr: host virtual address of the page just placed
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Scan all vCPUs for ones blocked on this address.  The
     * qatomic_fetch_add(..., 0) calls are atomic reads.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        qatomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * We need to know is that mark_postcopy_end was due to
         * faulted page, another possible case it's prefetched
         * page and in that case we shouldn't be here.
         */
        if (!vcpu_total_blocktime &&
            qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }
        /* Continue cycle to reset vcpu_blocktime for all vCPUs */
        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    qatomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - qatomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
833
/*
 * Block the fault thread until the main thread signals that the migration
 * channel has recovered.  Always returns true (resume).
 */
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}
844
845
846
847
/*
 * Handle faults detected by the USERFAULT markings: poll the userfault fd
 * (and any shared-memory fds registered by external processes), and for
 * each missing-page fault ask the migration source for that page.
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    /* Let postcopy_ram_incoming_setup() know we're up and running */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    /* Slot 0: userfault fd; slot 1: quit eventfd; the rest: shared fds */
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Used for quit events */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however we can be told to quit via the eventfd in pfd[1].
         */
        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * The return path is gone (network failure); hold here until
             * the channel is rebuilt, then continue servicing faults.
             */
            if (postcopy_pause_fault_thread(mis)) {
                /* We were woken to continue; fall through */
            } else {
                error_report("%s: paused but don't allow to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal from the eventfd */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error */
                error_report("%s: read() failed", __func__);
            }

            if (qatomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * If a wakeup happened on another thread just after
                     * the poll there may be nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment; don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %ud from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault; shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            /* Align down to the RAMBlock's page size */
            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source: we want one of our host
             * page sizes (which is >= the target page size).
             */
            ret = migrate_send_rp_req_pages(mis, rb, rb_offset,
                                            msg.arg.pagefault.address);
            if (ret) {
                /* May be a network failure; wait for channel recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* Reconnected; resend the request */
                    goto retry;
                } else {
                    /* Unrecoverable: give up the fault thread */
                    error_report("%s: migrate_send_rp_req_pages() get %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any faults from external processes on shared memory */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    /* Stop polling this fd; it's in error */
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /* Spurious wakeup; nothing to read */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* NOTE(review): could disable just this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        /* NOTE(review): could disable just this sharer */
                        break;
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %ud "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault; shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* The handler reported failure; keep servicing others */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}
1055
1056int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1057{
1058
1059 mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
1060 if (mis->userfault_fd == -1) {
1061 error_report("%s: Failed to open userfault fd: %s", __func__,
1062 strerror(errno));
1063 return -1;
1064 }
1065
1066
1067
1068
1069
1070 if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
1071 return -1;
1072 }
1073
1074
1075 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1076 if (mis->userfault_event_fd == -1) {
1077 error_report("%s: Opening userfault_event_fd: %s", __func__,
1078 strerror(errno));
1079 close(mis->userfault_fd);
1080 return -1;
1081 }
1082
1083 qemu_sem_init(&mis->fault_thread_sem, 0);
1084 qemu_thread_create(&mis->fault_thread, "postcopy/fault",
1085 postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
1086 qemu_sem_wait(&mis->fault_thread_sem);
1087 qemu_sem_destroy(&mis->fault_thread_sem);
1088 mis->have_fault_thread = true;
1089
1090
1091 if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
1092 error_report("ram_block_enable_notify failed");
1093 return -1;
1094 }
1095
1096 mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
1097 PROT_READ | PROT_WRITE, MAP_PRIVATE |
1098 MAP_ANONYMOUS, -1, 0);
1099 if (mis->postcopy_tmp_page == MAP_FAILED) {
1100 mis->postcopy_tmp_page = NULL;
1101 error_report("%s: Failed to map postcopy_tmp_page %s",
1102 __func__, strerror(errno));
1103 return -1;
1104 }
1105
1106
1107
1108
1109 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1110 PROT_READ | PROT_WRITE,
1111 MAP_PRIVATE | MAP_ANONYMOUS,
1112 -1, 0);
1113 if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1114 int e = errno;
1115 mis->postcopy_tmp_zero_page = NULL;
1116 error_report("%s: Failed to map large zero page %s",
1117 __func__, strerror(e));
1118 return -e;
1119 }
1120 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1121
1122 trace_postcopy_ram_enable_notify();
1123
1124 return 0;
1125}
1126
1127static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
1128 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1129{
1130 int userfault_fd = mis->userfault_fd;
1131 int ret;
1132
1133 if (from_addr) {
1134 struct uffdio_copy copy_struct;
1135 copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1136 copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1137 copy_struct.len = pagesize;
1138 copy_struct.mode = 0;
1139 ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct);
1140 } else {
1141 struct uffdio_zeropage zero_struct;
1142 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1143 zero_struct.range.len = pagesize;
1144 zero_struct.mode = 0;
1145 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1146 }
1147 if (!ret) {
1148 qemu_mutex_lock(&mis->page_request_mutex);
1149 ramblock_recv_bitmap_set_range(rb, host_addr,
1150 pagesize / qemu_target_page_size());
1151
1152
1153
1154
1155 if (g_tree_lookup(mis->page_requested, host_addr)) {
1156 g_tree_remove(mis->page_requested, host_addr);
1157 mis->page_requested_count--;
1158 trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
1159 }
1160 qemu_mutex_unlock(&mis->page_request_mutex);
1161 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1162 }
1163 return ret;
1164}
1165
1166int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1167{
1168 int i;
1169 MigrationIncomingState *mis = migration_incoming_get_current();
1170 GArray *pcrfds = mis->postcopy_remote_fds;
1171
1172 for (i = 0; i < pcrfds->len; i++) {
1173 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1174 int ret = cur->waker(cur, rb, offset);
1175 if (ret) {
1176 return ret;
1177 }
1178 }
1179 return 0;
1180}
1181
1182
1183
1184
1185
/*
 * Place a whole (RAMBlock-sized) page of data at @host, copied from @from,
 * atomically with respect to faulting threads, then wake any shared-memory
 * clients waiting on it.
 * Returns 0 on success, -errno on failure.
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /*
     * The kernel copies the data and installs the mapping in one step so
     * faulting threads never see a partially-written page.
     */
    if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}
1208
1209
1210
1211
1212
/*
 * Place an all-zero page at @host: via UFFDIO_ZEROPAGE when the RAMBlock
 * supports it, otherwise by copying the pre-zeroed temporary page.
 * Returns 0 on success, -errno on failure.
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /*
     * zeroable set by ram_block_enable_notify() when the kernel reported
     * UFFDIO_ZEROPAGE support for this range.
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* Fall back to copying a page of zeroes */
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
    }
}
1237
1238#else
1239
/* No blocktime info to report without userfaultfd support. */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

/* Postcopy needs Linux userfaultfd; report unsupported on this host. */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

/* Stub: postcopy cannot start without OS support. */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

/* Unreachable: setup can never have succeeded on this host. */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

/* Unreachable: discard phase can never start on this host. */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

/* Unreachable: shared-memory postcopy requires userfaultfd. */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

/* Unreachable: incoming setup requires userfaultfd. */
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

/* Unreachable: page placement requires userfaultfd. */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

/* Unreachable: zero-page placement requires userfaultfd. */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

/* Unreachable: waking shared clients requires userfaultfd. */
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
1302#endif
1303
1304
1305
/* Wake the fault thread out of its poll() via the eventfd. */
void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're going to increment it to 1.
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}
1320
1321
1322
1323
1324
1325
1326
1327
1328
/*
 * Single global discard accumulator: only one RAMBlock's discards are
 * batched at a time, bracketed by _init/_finish calls.
 */
static PostcopyDiscardState pds = {0};
/* Start accumulating discard ranges for the RAMBlock named @name. */
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
1348 unsigned long length)
1349{
1350 size_t tp_size = qemu_target_page_size();
1351
1352 pds.start_list[pds.cur_entry] = start * tp_size;
1353 pds.length_list[pds.cur_entry] = length * tp_size;
1354 trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
1355 pds.cur_entry++;
1356 pds.nsentwords++;
1357
1358 if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
1359
1360 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1361 pds.ramblock_name,
1362 pds.cur_entry,
1363 pds.start_list,
1364 pds.length_list);
1365 pds.nsentcmds++;
1366 pds.cur_entry = 0;
1367 }
1368}
1369
1370
1371
1372
1373
1374
1375
/*
 * Called at the end of each RAMBlock by the bitmap code: flush any
 * partially-filled batch of discard ranges and emit trace statistics.
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}
1391
1392
1393
1394
1395
1396
/* Current state of incoming postcopy (process-wide, incoming side only). */
static PostcopyState incoming_postcopy_state;

/* Read the postcopy state with a memory barrier. */
PostcopyState postcopy_state_get(void)
{
    return qatomic_mb_read(&incoming_postcopy_state);
}

/* Set the postcopy state; returns the old state. */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return qatomic_xchg(&incoming_postcopy_state, new_state);
}
1409
1410
1411
1412
/*
 * Register a userfaultfd owned by an external process (e.g. vhost-user)
 * so the fault thread will also poll it.  The struct is copied by value
 * into the array.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}
1420
1421
1422
1423void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1424{
1425 guint i;
1426 MigrationIncomingState *mis = migration_incoming_get_current();
1427 GArray *pcrfds = mis->postcopy_remote_fds;
1428
1429 for (i = 0; i < pcrfds->len; i++) {
1430 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1431 if (cur->fd == pcfd->fd) {
1432 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1433 return;
1434 }
1435 }
1436}
1437