/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert  <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */

#include "qemu/osdep.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "qemu/rcu.h"
#include "sysemu/sysemu.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"
#include "exec/ramblock.h"

/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd);
}

#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for exit event, necessary for
     * releasing whole blocktime_ctx
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;

static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

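/*
 * Allocate a blocktime context sized for the current vCPU count and hook
 * it up to an exit notifier so that it is freed on shutdown.
 */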
static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

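/*
 * Build a QAPI list of the per-vCPU blocktime values, in vCPU index order,
 * for reporting through query-migrate.
 */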
static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    uint32List *list = NULL;
    int i;

    for (i = ms->smp.cpus - 1; i >= 0; i--) {
        QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
    }

    return list;
}

/*
 * This function just populates MigrationInfo from postcopy's
 * blocktime context. It will not populate MigrationInfo,
 * unless postcopy-blocktime capability was set.
 *
 * @info: pointer to MigrationInfo to populate
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}

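/*
 * Total time during which every vCPU was simultaneously blocked on a
 * postcopy page fault; reported by the cleanup tracepoint.
 */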
static uint32_t get_postcopy_total_blocktime(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return 0;
    }

    return bc->total_blocktime;
}

/*
 * Check userfault fd features, to request only supported features in
 * future.
 *
 * Returns: true on success
 *
 * __NR_userfaultfd - should be checked before
 *  @features: out parameter will contain uffdio_api.features provided by
 *             kernel in case of success
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here __NR_userfaultfd should exist */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}

/*
 * Request a set of features from an open userfaultfd and verify that the
 * register/unregister ioctls we rely on are available.
 *
 * Returns: true on success
 *
 * @ufd: fd obtained from userfaultfd syscall
 * @features: features to request, a bit mask of UFFD_FEATURE_* flags
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}

static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * It's not possible to request UFFD_API twice on the same fd, and the
     * feature set reported by the kernel never changes, so probe it only
     * once and cache the result.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* kernel supports that feature */
        /* don't create blocktime_context if it exists */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request features, even if asked_features is 0, because the kernel
     * expects UFFD_API before UFFDIO_REGISTER on every userfault file
     * descriptor.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (qemu_real_host_page_size != ram_pagesize_summary()) {
        bool have_hp = false;

#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}

/* Callback from postcopy_ram_supported_by_host block iterator.
 */
static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}

/*
 * Test the whole host environment for postcopy support: that userfaultfd
 * exists, supports the features we need, and works on anonymous memory.
 */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    long pagesize = qemu_real_host_page_size;
    int ufd = -1;
    bool ret = false;
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    Error *local_err = NULL;

    if (qemu_target_page_size() > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
        error_report_err(local_err);
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }

    /* All RAM blocks must be a multiple of their page size */
    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     *  We need to check that the ops we need are supported on anon memory
     *  To do that we need to register a chunk and see the flags that
     *  are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize - 1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * Save the used_length before running the guest. In case we have to
     * resume postcopy on a different instance.
     */
    rb->postcopy_length = length;

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.   It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * called from arch_init's similarly named ram_postcopy_incoming_init
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Let the fault thread quit */
        qatomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}

/*
 * Disable huge pages on an area
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
 * however leaving it until after precopy means that most of the precopy
 * data is still THPd
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}

/*
 * Mark the given area of RAM as requiring notification to unwritten areas
 * Used as a callback on foreach_not_ignored_block.
 *   host_addr: Base of area to mark
 *   offset: Offset in the whole ram arena
 *   length: Length of the section
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
    reg_struct.range.len = rb->postcopy_length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}

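/*
 * Wake any thread that is blocked faulting on the page containing
 * client_addr in the given RAMBlock, using UFFDIO_WAKE on the shared fd.
 */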
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}

/*
 * Callback from shared fault handlers to ask for a page,
 * the page must be specified by a RAMBlock and an offset in that rb
 * Note: Only for use by shared fault handlers (in fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    migrate_send_rp_req_pages(mis, rb, aligned_rbo, client_addr);
    return 0;
}

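/*
 * Map a faulting thread id, as reported in the userfault message, to a
 * vCPU index; returns -1 if the tid does not belong to a vCPU thread.
 */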
static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}

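/*
 * Milliseconds elapsed since the blocktime context was created, truncated
 * to 32 bits and clamped to at least 1 so that 0 can mean "no fault".
 */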
static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}

/*
 * This function is being called when pagefault occurs. It
 * tracks down vCPU blocking time.
 *
 * @addr: faulted host virtual address
 * @ptid: faulted process thread id
 * @rb: ramblock appropriate to addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        qatomic_inc(&dc->smp_cpus_down);
    }

    qatomic_xchg(&dc->last_begin, low_time_offset);
    qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    qatomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check it here, not at the beginning of the function, because the
     * page could arrive (and the bitmap be set in qemu_ufd_copy_ioctl)
     * while we are recording the fault.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        qatomic_xchg(&dc->vcpu_addr[cpu], 0);
        qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        qatomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}

/*
 *  This function just provides the calculated blocktime per cpu and traces
 *  it. Total blocktime is calculated in mark_postcopy_blocktime_end.
 *
 * Assume we have 3 CPU
 *
 *      S1        E1           S1               E1
 * -----***********------------xxx***************------------------------> CPU1
 *
 *             S2                E2
 * ------------****************xxx---------------------------------------> CPU2
 *
 *                         S3            E3
 * ------------------------****xxx********-------------------------------> CPU3
 *
 * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
 * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include
 *         CPU3
 * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 -
 *            it's a part of total blocktime.
 * S1 - here is last_begin
 * Legend of the picture is following:
 *              * - means blocktime per vCPU
 *              x - means overlapped blocktime (total blocktime)
 *
 * @addr: host virtual address
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Look up the vCPU that faulted on this address, to clear it.  A
     * linear scan is straightforward but not optimal; a tree or hash
     * keyed by address would be faster.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        qatomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * Count towards the total blocktime only if every vCPU was
         * blocked when this page arrived.
         */
        if (!vcpu_total_blocktime &&
            qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }
        /* continue the loop, one page could affect several vCPUs */
        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    qatomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - qatomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}

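/*
 * Block the fault thread on the pause semaphore until the migration is
 * resumed; returns true once woken to continue.
 */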
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN;
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA,
         * however we can be told to quit via userfault_event_fd which is
         * an eventfd
         */
        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * Possibly someone tells us that the return path is
             * broken already using the event. We should hold until
             * the channel is rebuilt.
             */
            if (postcopy_pause_fault_thread(mis)) {
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but don't allow to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (qatomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %u from userfaultfd",
                             __func__, msg.event);
                continue;
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            ret = migrate_send_rp_req_pages(mis, rb, rb_offset,
                                            msg.arg.pagefault.address);
            if (ret) {
                /* May be network failure, try to wait for recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    goto retry;
                } else {
                    /* This is an unavoidable fault */
                    error_report("%s: migrate_send_rp_req_pages() get %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        /* TODO: Could just disable this sharer */
                        break;
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %u "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue;
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fault from a different process, should we drop it? */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}

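/*
 * Prepare the incoming side for userfault handling: open the userfaultfd,
 * start the fault thread, register all RAM blocks with it and allocate
 * the temporary pages used to place incoming data.
 */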
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
        error_report("ram_block_enable_notify failed");
        return -1;
    }

    mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                  PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                  MAP_ANONYMOUS, -1, 0);
    if (mis->postcopy_tmp_page == MAP_FAILED) {
        mis->postcopy_tmp_page = NULL;
        error_report("%s: Failed to map postcopy_tmp_page %s",
                     __func__, strerror(errno));
        return -1;
    }

    /*
     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE directly.
     */
    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                       PROT_READ | PROT_WRITE,
                                       MAP_PRIVATE | MAP_ANONYMOUS,
                                       -1, 0);
    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
        int e = errno;
        mis->postcopy_tmp_zero_page = NULL;
        error_report("%s: Failed to map large zero page %s",
                     __func__, strerror(e));
        return -e;
    }
    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);

    trace_postcopy_ram_enable_notify();

    return 0;
}

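/*
 * Atomically place a page using UFFDIO_COPY, or UFFDIO_ZEROPAGE when
 * from_addr is NULL; on success update the receive bitmap and drop any
 * outstanding request for the page.
 */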
static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int userfault_fd = mis->userfault_fd;
    int ret;

    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        qemu_mutex_lock(&mis->page_request_mutex);
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        /*
         * If this page resolves a page fault for a previous recorded faulted
         * address, take a special note to maintain the requested page list.
         */
        if (g_tree_lookup(mis->page_requested, host_addr)) {
            g_tree_remove(mis->page_requested, host_addr);
            mis->page_requested_count--;
            trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
        }
        qemu_mutex_unlock(&mis->page_request_mutex);
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
    }
    return ret;
}

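/*
 * Ask all registered shared-memory handlers to wake any thread waiting on
 * this page; returns the first non-zero waker result, else 0.
 */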
int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /*
     * The kernel copies the page atomically and, on success, wakes any
     * thread that was blocked faulting on it, so no explicit wake is
     * needed on our own userfault_fd.
     */
    if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}

/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
     * but it's not available for everything (e.g. hugetlb pages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
    }
}

#else
/* No target OS support, stubs just fail */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif

/* ------------------------------------------------------------------------- */

void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're going to increment it to 1
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 */
static PostcopyDiscardState pds = {0};
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
                                 unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert to byte offsets within the RAM block */
    pds.start_list[pds.cur_entry] = start * tp_size;
    pds.length_list[pds.cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
    pds.cur_entry++;
    pds.nsentwords++;

    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
        pds.cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 *   bitmap code. Sends any outstanding discard messages.
 *
 * @ms: Current migration state.
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return qatomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return qatomic_xchg(&incoming_postcopy_state, new_state);
}

/* Register a handler for external shared memory postcopy
 * called on the destination.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}

/* Unregister a handler for external shared memory postcopy
 */
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}