1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20#include "qemu/rcu.h"
21#include "exec/target_page.h"
22#include "migration.h"
23#include "qemu-file.h"
24#include "savevm.h"
25#include "postcopy-ram.h"
26#include "ram.h"
27#include "qapi/error.h"
28#include "qemu/notify.h"
29#include "qemu/rcu.h"
30#include "sysemu/sysemu.h"
31#include "qemu/error-report.h"
32#include "trace.h"
33#include "hw/boards.h"
34#include "exec/ramblock.h"
35
36
37
38
/*
 * Number of (start, length) pairs that are batched before a discard
 * command is sent; it also sizes the arrays in PostcopyDiscardState.
 */
#define MAX_DISCARDS_PER_COMMAND 12

/* Accumulates discard ranges for one RAM block until they are sent. */
struct PostcopyDiscardState {
    /* Name given to postcopy_discard_send_init(); passed with each command */
    const char *ramblock_name;
    /* Number of valid entries currently queued in the two lists below */
    uint16_t cur_entry;
    /*
     * Queued range starts and lengths; postcopy_discard_send_range() stores
     * them already multiplied by the target page size.
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    /* Ranges queued since the last postcopy_discard_send_init() */
    unsigned int nsentwords;
    /* Discard commands sent since the last postcopy_discard_send_init() */
    unsigned int nsentcmds;
};
52
/* Notifiers invoked by postcopy_notify(). */
static NotifierWithReturnList postcopy_notifier_list;

/* Initialise the postcopy notifier list. */
void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}
59
/* Register @nn so that it is called on every postcopy_notify(). */
void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}
64
/* Unregister a notifier previously added with postcopy_add_notifier(). */
void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
69
70int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
71{
72 struct PostcopyNotifyData pnd;
73 pnd.reason = reason;
74 pnd.errp = errp;
75
76 return notifier_with_return_list_notify(&postcopy_notifier_list,
77 &pnd);
78}
79
80
81
82
83
84#if defined(__linux__)
85
86#include <poll.h>
87#include <sys/ioctl.h>
88#include <sys/syscall.h>
89#include <asm/types.h>
90#endif
91
92#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
93#include <sys/eventfd.h>
94#include <linux/userfaultfd.h>
95
/*
 * Bookkeeping for the postcopy "blocktime" statistics.  The three arrays
 * are allocated with one element per vCPU (ms->smp.cpus) by
 * blocktime_context_new().
 */
typedef struct PostcopyBlocktimeContext {
    /* Per-vCPU time offset at which the pending fault began; 0 if none */
    uint32_t *page_fault_vcpu_time;
    /* Per-vCPU address of the pending fault; 0 if none */
    uintptr_t *vcpu_addr;
    /* Accumulated time during which every vCPU had a fault pending */
    uint32_t total_blocktime;
    /* Per-vCPU accumulated blocked time */
    uint32_t *vcpu_blocktime;
    /* Time offset of the most recently recorded fault begin */
    uint32_t last_begin;
    /* Number of vCPUs that currently have a fault pending */
    int smp_cpus_down;
    /* QEMU_CLOCK_REALTIME (ms) sampled when the context was created */
    uint64_t start_time;

    /*
     * Exit notifier registered in blocktime_context_new(); its callback
     * frees this context.
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;
116
/* Free the per-vCPU arrays of @ctx and then @ctx itself. */
static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}
124
125static void migration_exit_cb(Notifier *n, void *data)
126{
127 PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
128 exit_notifier);
129 destroy_blocktime_context(ctx);
130}
131
132static struct PostcopyBlocktimeContext *blocktime_context_new(void)
133{
134 MachineState *ms = MACHINE(qdev_get_machine());
135 unsigned int smp_cpus = ms->smp.cpus;
136 PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
137 ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
138 ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
139 ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
140
141 ctx->exit_notifier.notify = migration_exit_cb;
142 ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
143 qemu_add_exit_notifier(&ctx->exit_notifier);
144 return ctx;
145}
146
147static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
148{
149 MachineState *ms = MACHINE(qdev_get_machine());
150 uint32List *list = NULL;
151 int i;
152
153 for (i = ms->smp.cpus - 1; i >= 0; i--) {
154 QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
155 }
156
157 return list;
158}
159
160
161
162
163
164
165
166
167void fill_destination_postcopy_migration_info(MigrationInfo *info)
168{
169 MigrationIncomingState *mis = migration_incoming_get_current();
170 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
171
172 if (!bc) {
173 return;
174 }
175
176 info->has_postcopy_blocktime = true;
177 info->postcopy_blocktime = bc->total_blocktime;
178 info->has_postcopy_vcpu_blocktime = true;
179 info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
180}
181
182static uint32_t get_postcopy_total_blocktime(void)
183{
184 MigrationIncomingState *mis = migration_incoming_get_current();
185 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
186
187 if (!bc) {
188 return 0;
189 }
190
191 return bc->total_blocktime;
192}
193
194
195
196
197
198
199
200
201
202
203
204static bool receive_ufd_features(uint64_t *features)
205{
206 struct uffdio_api api_struct = {0};
207 int ufd;
208 bool ret = true;
209
210
211 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
212 if (ufd == -1) {
213 error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
214 strerror(errno));
215 return false;
216 }
217
218
219 api_struct.api = UFFD_API;
220 api_struct.features = 0;
221 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
222 error_report("%s: UFFDIO_API failed: %s", __func__,
223 strerror(errno));
224 ret = false;
225 goto release_ufd;
226 }
227
228 *features = api_struct.features;
229
230release_ufd:
231 close(ufd);
232 return ret;
233}
234
235
236
237
238
239
240
241
242
243
244static bool request_ufd_features(int ufd, uint64_t features)
245{
246 struct uffdio_api api_struct = {0};
247 uint64_t ioctl_mask;
248
249 api_struct.api = UFFD_API;
250 api_struct.features = features;
251 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
252 error_report("%s failed: UFFDIO_API failed: %s", __func__,
253 strerror(errno));
254 return false;
255 }
256
257 ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
258 (__u64)1 << _UFFDIO_UNREGISTER;
259 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
260 error_report("Missing userfault features: %" PRIx64,
261 (uint64_t)(~api_struct.ioctls & ioctl_mask));
262 return false;
263 }
264
265 return true;
266}
267
268static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
269{
270 uint64_t asked_features = 0;
271 static uint64_t supported_features;
272
273
274
275
276
277
278 if (!supported_features) {
279 if (!receive_ufd_features(&supported_features)) {
280 error_report("%s failed", __func__);
281 return false;
282 }
283 }
284
285#ifdef UFFD_FEATURE_THREAD_ID
286 if (migrate_postcopy_blocktime() && mis &&
287 UFFD_FEATURE_THREAD_ID & supported_features) {
288
289
290 if (!mis->blocktime_ctx) {
291 mis->blocktime_ctx = blocktime_context_new();
292 }
293
294 asked_features |= UFFD_FEATURE_THREAD_ID;
295 }
296#endif
297
298
299
300
301
302
303 if (!request_ufd_features(ufd, asked_features)) {
304 error_report("%s failed: features %" PRIu64, __func__,
305 asked_features);
306 return false;
307 }
308
309 if (qemu_real_host_page_size != ram_pagesize_summary()) {
310 bool have_hp = false;
311
312#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
313 have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
314#endif
315 if (!have_hp) {
316 error_report("Userfault on this host does not support huge pages");
317 return false;
318 }
319 }
320 return true;
321}
322
323
324
325static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
326{
327 const char *block_name = qemu_ram_get_idstr(rb);
328 ram_addr_t length = qemu_ram_get_used_length(rb);
329 size_t pagesize = qemu_ram_pagesize(rb);
330
331 if (length % pagesize) {
332 error_report("Postcopy requires RAM blocks to be a page size multiple,"
333 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
334 "page size of 0x%zx", block_name, length, pagesize);
335 return 1;
336 }
337 return 0;
338}
339
340
341
342
343
344
345bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
346{
347 long pagesize = qemu_real_host_page_size;
348 int ufd = -1;
349 bool ret = false;
350 void *testarea = NULL;
351 struct uffdio_register reg_struct;
352 struct uffdio_range range_struct;
353 uint64_t feature_mask;
354 Error *local_err = NULL;
355
356 if (qemu_target_page_size() > pagesize) {
357 error_report("Target page size bigger than host page size");
358 goto out;
359 }
360
361 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
362 if (ufd == -1) {
363 error_report("%s: userfaultfd not available: %s", __func__,
364 strerror(errno));
365 goto out;
366 }
367
368
369 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
370 error_report_err(local_err);
371 goto out;
372 }
373
374
375 if (!ufd_check_and_apply(ufd, mis)) {
376 goto out;
377 }
378
379
380 if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
381 goto out;
382 }
383
384
385
386
387
388 if (munlockall()) {
389 error_report("%s: munlockall: %s", __func__, strerror(errno));
390 goto out;
391 }
392
393
394
395
396
397
398 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
399 MAP_ANONYMOUS, -1, 0);
400 if (testarea == MAP_FAILED) {
401 error_report("%s: Failed to map test area: %s", __func__,
402 strerror(errno));
403 goto out;
404 }
405 g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
406
407 reg_struct.range.start = (uintptr_t)testarea;
408 reg_struct.range.len = pagesize;
409 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
410
411 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
412 error_report("%s userfault register: %s", __func__, strerror(errno));
413 goto out;
414 }
415
416 range_struct.start = (uintptr_t)testarea;
417 range_struct.len = pagesize;
418 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
419 error_report("%s userfault unregister: %s", __func__, strerror(errno));
420 goto out;
421 }
422
423 feature_mask = (__u64)1 << _UFFDIO_WAKE |
424 (__u64)1 << _UFFDIO_COPY |
425 (__u64)1 << _UFFDIO_ZEROPAGE;
426 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
427 error_report("Missing userfault map features: %" PRIx64,
428 (uint64_t)(~reg_struct.ioctls & feature_mask));
429 goto out;
430 }
431
432
433 ret = true;
434out:
435 if (testarea) {
436 munmap(testarea, pagesize);
437 }
438 if (ufd != -1) {
439 close(ufd);
440 }
441 return ret;
442}
443
444
445
446
447
448
449static int init_range(RAMBlock *rb, void *opaque)
450{
451 const char *block_name = qemu_ram_get_idstr(rb);
452 void *host_addr = qemu_ram_get_host_addr(rb);
453 ram_addr_t offset = qemu_ram_get_offset(rb);
454 ram_addr_t length = qemu_ram_get_used_length(rb);
455 trace_postcopy_init_range(block_name, host_addr, offset, length);
456
457
458
459
460
461
462 rb->postcopy_length = length;
463
464
465
466
467
468
469
470 if (ram_discard_range(block_name, 0, length)) {
471 return -1;
472 }
473
474 return 0;
475}
476
477
478
479
480
481static int cleanup_range(RAMBlock *rb, void *opaque)
482{
483 const char *block_name = qemu_ram_get_idstr(rb);
484 void *host_addr = qemu_ram_get_host_addr(rb);
485 ram_addr_t offset = qemu_ram_get_offset(rb);
486 ram_addr_t length = rb->postcopy_length;
487 MigrationIncomingState *mis = opaque;
488 struct uffdio_range range_struct;
489 trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
490
491
492
493
494
495 qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
496
497
498
499
500
501
502 range_struct.start = (uintptr_t)host_addr;
503 range_struct.len = length;
504
505 if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
506 error_report("%s: userfault unregister %s", __func__, strerror(errno));
507
508 return -1;
509 }
510
511 return 0;
512}
513
514
515
516
517
518
519int postcopy_ram_incoming_init(MigrationIncomingState *mis)
520{
521 if (foreach_not_ignored_block(init_range, NULL)) {
522 return -1;
523 }
524
525 return 0;
526}
527
528
529
530
/*
 * Tear down the incoming postcopy state set up by
 * postcopy_ram_incoming_setup().
 *
 * If the fault thread exists it is asked to quit and joined, the
 * POSTCOPY_NOTIFY_INBOUND_END notifiers are run, every non-ignored
 * RAMBlock is unregistered from userfault and both fds are closed.
 * Memory is then re-locked if enable_mlock is set, and the temporary
 * pages are unmapped.
 *
 * Returns 0 on success, -1 if a notifier or a block cleanup fails.
 * NOTE(review): on those early returns the fds stay open and
 * have_fault_thread stays true.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Set the quit flag first, then wake the thread via the eventfd. */
        qatomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    if (enable_mlock) {
        if (os_mlock() < 0) {
            /* Reported but not treated as a failure of the cleanup. */
            error_report("mlock: %s", strerror(errno));
        }
    }

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
        get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
583
584
585
586
/*
 * Per-RAMBlock callback: advise QEMU_MADV_NOHUGEPAGE on the block's
 * postcopy range.  The madvise result is not checked and @opaque is
 * unused; always returns 0.
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
604
605
606
607
608
609
/*
 * Run nhp_range() on every non-ignored RAMBlock and move the incoming
 * postcopy state to POSTCOPY_INCOMING_DISCARD.
 *
 * Returns 0 on success, -1 if the block iteration reports failure (in
 * which case the state is not changed).
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
620
621
622
623
624
625
626
627
628
629
630static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
631{
632 MigrationIncomingState *mis = opaque;
633 struct uffdio_register reg_struct;
634
635 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
636 reg_struct.range.len = rb->postcopy_length;
637 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
638
639
640 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
641 error_report("%s userfault register: %s", __func__, strerror(errno));
642 return -1;
643 }
644 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
645 error_report("%s userfault: Region doesn't support COPY", __func__);
646 return -1;
647 }
648 if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
649 qemu_ram_set_uf_zeroable(rb);
650 }
651
652 return 0;
653}
654
655int postcopy_wake_shared(struct PostCopyFD *pcfd,
656 uint64_t client_addr,
657 RAMBlock *rb)
658{
659 size_t pagesize = qemu_ram_pagesize(rb);
660 struct uffdio_range range;
661 int ret;
662 trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
663 range.start = ROUND_DOWN(client_addr, pagesize);
664 range.len = pagesize;
665 ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
666 if (ret) {
667 error_report("%s: Failed to wake: %zx in %s (%s)",
668 __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
669 strerror(errno));
670 }
671 return ret;
672}
673
674static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
675 ram_addr_t start, uint64_t haddr)
676{
677 void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
678
679
680
681
682
683
684
685
686
687 assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
688 if (ramblock_page_is_discarded(rb, start)) {
689 bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);
690
691 return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
692 }
693
694 return migrate_send_rp_req_pages(mis, rb, start, haddr);
695}
696
697
698
699
700
701
/*
 * Handle a fault reported through the shared userfaultfd @pcfd.
 *
 * @client_addr: faulting address as seen by the client
 * @rb_offset:   offset of the fault within @rb; rounded down to the
 *               block's page size before use
 *
 * If the page has already been received the client is simply woken and
 * the result of postcopy_wake_shared() is returned.  Otherwise the page
 * is requested and 0 is returned.
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    /* NOTE(review): the return value is ignored here — confirm intended. */
    postcopy_request_page(mis, rb, aligned_rbo, client_addr);
    return 0;
}
718
/*
 * Map a thread id to a vCPU index.
 *
 * Returns the cpu_index of the CPU whose thread_id equals @pid, or -1 if
 * no CPU matches.
 */
static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}
732
733static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
734{
735 int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
736 dc->start_time;
737 return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
738}
739
740
741
742
743
744
745
746
747
/*
 * Record the start of a page fault for blocktime accounting.
 *
 * @addr: faulting host address
 * @ptid: id of the faulting thread; 0 is ignored
 * @rb:   RAMBlock containing @addr
 *
 * Does nothing when there is no blocktime context or when @ptid does not
 * belong to a vCPU.
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /* Count this vCPU as blocked only if it had no fault pending yet. */
    if (dc->vcpu_addr[cpu] == 0) {
        qatomic_inc(&dc->smp_cpus_down);
    }

    qatomic_xchg(&dc->last_begin, low_time_offset);
    qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    qatomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * The received bitmap is checked only after the state above has been
     * published; if the page is already there, the bookkeeping for this
     * vCPU is rolled back.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        qatomic_xchg(&dc->vcpu_addr[cpu], 0);
        qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        qatomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
/*
 * Record that the page at @addr has been placed, for blocktime
 * accounting.
 *
 * Every vCPU whose pending fault address equals @addr gets the elapsed
 * time added to its vcpu_blocktime and its pending address cleared.  If
 * all vCPUs were blocked at the time one of them was processed, the time
 * since last_begin is added to total_blocktime.
 *
 * Does nothing when there is no blocktime context.
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);

    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        /* fetch_add of 0 is used here as an atomic read. */
        read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        qatomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;

        /*
         * smp_cpus_down is only decremented after the loop, so this sees
         * the count as it was while this vCPU was still blocked.
         */
        if (!vcpu_total_blocktime &&
            qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }

        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    qatomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - qatomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
864
/*
 * Block the fault thread until postcopy_pause_sem_fault is posted.
 *
 * Always returns true.
 */
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}
875
876
877
878
879static void *postcopy_ram_fault_thread(void *opaque)
880{
881 MigrationIncomingState *mis = opaque;
882 struct uffd_msg msg;
883 int ret;
884 size_t index;
885 RAMBlock *rb = NULL;
886
887 trace_postcopy_ram_fault_thread_entry();
888 rcu_register_thread();
889 mis->last_rb = NULL;
890 qemu_sem_post(&mis->fault_thread_sem);
891
892 struct pollfd *pfd;
893 size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
894
895 pfd = g_new0(struct pollfd, pfd_len);
896
897 pfd[0].fd = mis->userfault_fd;
898 pfd[0].events = POLLIN;
899 pfd[1].fd = mis->userfault_event_fd;
900 pfd[1].events = POLLIN;
901 trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
902 for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
903 struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
904 struct PostCopyFD, index);
905 pfd[2 + index].fd = pcfd->fd;
906 pfd[2 + index].events = POLLIN;
907 trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
908 pcfd->fd);
909 }
910
911 while (true) {
912 ram_addr_t rb_offset;
913 int poll_result;
914
915
916
917
918
919
920
921 poll_result = poll(pfd, pfd_len, -1 );
922 if (poll_result == -1) {
923 error_report("%s: userfault poll: %s", __func__, strerror(errno));
924 break;
925 }
926
927 if (!mis->to_src_file) {
928
929
930
931
932
933 if (postcopy_pause_fault_thread(mis)) {
934
935 } else {
936 error_report("%s: paused but don't allow to continue",
937 __func__);
938 break;
939 }
940 }
941
942 if (pfd[1].revents) {
943 uint64_t tmp64 = 0;
944
945
946 if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
947
948 error_report("%s: read() failed", __func__);
949 }
950
951 if (qatomic_read(&mis->fault_thread_quit)) {
952 trace_postcopy_ram_fault_thread_quit();
953 break;
954 }
955 }
956
957 if (pfd[0].revents) {
958 poll_result--;
959 ret = read(mis->userfault_fd, &msg, sizeof(msg));
960 if (ret != sizeof(msg)) {
961 if (errno == EAGAIN) {
962
963
964
965
966 continue;
967 }
968 if (ret < 0) {
969 error_report("%s: Failed to read full userfault "
970 "message: %s",
971 __func__, strerror(errno));
972 break;
973 } else {
974 error_report("%s: Read %d bytes from userfaultfd "
975 "expected %zd",
976 __func__, ret, sizeof(msg));
977 break;
978 }
979 }
980 if (msg.event != UFFD_EVENT_PAGEFAULT) {
981 error_report("%s: Read unexpected event %ud from userfaultfd",
982 __func__, msg.event);
983 continue;
984 }
985
986 rb = qemu_ram_block_from_host(
987 (void *)(uintptr_t)msg.arg.pagefault.address,
988 true, &rb_offset);
989 if (!rb) {
990 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
991 PRIx64, (uint64_t)msg.arg.pagefault.address);
992 break;
993 }
994
995 rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
996 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
997 qemu_ram_get_idstr(rb),
998 rb_offset,
999 msg.arg.pagefault.feat.ptid);
1000 mark_postcopy_blocktime_begin(
1001 (uintptr_t)(msg.arg.pagefault.address),
1002 msg.arg.pagefault.feat.ptid, rb);
1003
1004retry:
1005
1006
1007
1008
1009 ret = postcopy_request_page(mis, rb, rb_offset,
1010 msg.arg.pagefault.address);
1011 if (ret) {
1012
1013 if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
1014
1015 goto retry;
1016 } else {
1017
1018 error_report("%s: postcopy_request_page() get %d",
1019 __func__, ret);
1020 break;
1021 }
1022 }
1023 }
1024
1025
1026
1027 for (index = 2; index < pfd_len && poll_result; index++) {
1028 if (pfd[index].revents) {
1029 struct PostCopyFD *pcfd =
1030 &g_array_index(mis->postcopy_remote_fds,
1031 struct PostCopyFD, index - 2);
1032
1033 poll_result--;
1034 if (pfd[index].revents & POLLERR) {
1035 error_report("%s: POLLERR on poll %zd fd=%d",
1036 __func__, index, pcfd->fd);
1037 pfd[index].events = 0;
1038 continue;
1039 }
1040
1041 ret = read(pcfd->fd, &msg, sizeof(msg));
1042 if (ret != sizeof(msg)) {
1043 if (errno == EAGAIN) {
1044
1045
1046
1047
1048 continue;
1049 }
1050 if (ret < 0) {
1051 error_report("%s: Failed to read full userfault "
1052 "message: %s (shared) revents=%d",
1053 __func__, strerror(errno),
1054 pfd[index].revents);
1055
1056 break;
1057 } else {
1058 error_report("%s: Read %d bytes from userfaultfd "
1059 "expected %zd (shared)",
1060 __func__, ret, sizeof(msg));
1061
1062 break;
1063 }
1064 }
1065 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1066 error_report("%s: Read unexpected event %ud "
1067 "from userfaultfd (shared)",
1068 __func__, msg.event);
1069 continue;
1070 }
1071
1072 ret = pcfd->handler(pcfd, &msg);
1073 if (ret) {
1074 error_report("%s: Failed to resolve shared fault on %zd/%s",
1075 __func__, index, pcfd->idstr);
1076
1077 }
1078 }
1079 }
1080 }
1081 rcu_unregister_thread();
1082 trace_postcopy_ram_fault_thread_exit();
1083 g_free(pfd);
1084 return NULL;
1085}
1086
1087int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1088{
1089
1090 mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
1091 if (mis->userfault_fd == -1) {
1092 error_report("%s: Failed to open userfault fd: %s", __func__,
1093 strerror(errno));
1094 return -1;
1095 }
1096
1097
1098
1099
1100
1101 if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
1102 return -1;
1103 }
1104
1105
1106 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1107 if (mis->userfault_event_fd == -1) {
1108 error_report("%s: Opening userfault_event_fd: %s", __func__,
1109 strerror(errno));
1110 close(mis->userfault_fd);
1111 return -1;
1112 }
1113
1114 qemu_sem_init(&mis->fault_thread_sem, 0);
1115 qemu_thread_create(&mis->fault_thread, "postcopy/fault",
1116 postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
1117 qemu_sem_wait(&mis->fault_thread_sem);
1118 qemu_sem_destroy(&mis->fault_thread_sem);
1119 mis->have_fault_thread = true;
1120
1121
1122 if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
1123 error_report("ram_block_enable_notify failed");
1124 return -1;
1125 }
1126
1127 mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
1128 PROT_READ | PROT_WRITE, MAP_PRIVATE |
1129 MAP_ANONYMOUS, -1, 0);
1130 if (mis->postcopy_tmp_page == MAP_FAILED) {
1131 mis->postcopy_tmp_page = NULL;
1132 error_report("%s: Failed to map postcopy_tmp_page %s",
1133 __func__, strerror(errno));
1134 return -1;
1135 }
1136
1137
1138
1139
1140 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1141 PROT_READ | PROT_WRITE,
1142 MAP_PRIVATE | MAP_ANONYMOUS,
1143 -1, 0);
1144 if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1145 int e = errno;
1146 mis->postcopy_tmp_zero_page = NULL;
1147 error_report("%s: Failed to map large zero page %s",
1148 __func__, strerror(e));
1149 return -e;
1150 }
1151 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1152
1153 trace_postcopy_ram_enable_notify();
1154
1155 return 0;
1156}
1157
1158static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
1159 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1160{
1161 int userfault_fd = mis->userfault_fd;
1162 int ret;
1163
1164 if (from_addr) {
1165 struct uffdio_copy copy_struct;
1166 copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1167 copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1168 copy_struct.len = pagesize;
1169 copy_struct.mode = 0;
1170 ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct);
1171 } else {
1172 struct uffdio_zeropage zero_struct;
1173 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1174 zero_struct.range.len = pagesize;
1175 zero_struct.mode = 0;
1176 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1177 }
1178 if (!ret) {
1179 qemu_mutex_lock(&mis->page_request_mutex);
1180 ramblock_recv_bitmap_set_range(rb, host_addr,
1181 pagesize / qemu_target_page_size());
1182
1183
1184
1185
1186 if (g_tree_lookup(mis->page_requested, host_addr)) {
1187 g_tree_remove(mis->page_requested, host_addr);
1188 mis->page_requested_count--;
1189 trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
1190 }
1191 qemu_mutex_unlock(&mis->page_request_mutex);
1192 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1193 }
1194 return ret;
1195}
1196
1197int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1198{
1199 int i;
1200 MigrationIncomingState *mis = migration_incoming_get_current();
1201 GArray *pcrfds = mis->postcopy_remote_fds;
1202
1203 for (i = 0; i < pcrfds->len; i++) {
1204 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1205 int ret = cur->waker(cur, rb, offset);
1206 if (ret) {
1207 return ret;
1208 }
1209 }
1210 return 0;
1211}
1212
1213
1214
1215
1216
1217int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1218 RAMBlock *rb)
1219{
1220 size_t pagesize = qemu_ram_pagesize(rb);
1221
1222
1223
1224
1225
1226
1227 if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
1228 int e = errno;
1229 error_report("%s: %s copy host: %p from: %p (size: %zd)",
1230 __func__, strerror(e), host, from, pagesize);
1231
1232 return -e;
1233 }
1234
1235 trace_postcopy_place_page(host);
1236 return postcopy_notify_shared_wake(rb,
1237 qemu_ram_block_host_offset(rb, host));
1238}
1239
1240
1241
1242
1243
1244int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1245 RAMBlock *rb)
1246{
1247 size_t pagesize = qemu_ram_pagesize(rb);
1248 trace_postcopy_place_page_zero(host);
1249
1250
1251
1252
1253 if (qemu_ram_is_uf_zeroable(rb)) {
1254 if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
1255 int e = errno;
1256 error_report("%s: %s zero host: %p",
1257 __func__, strerror(e), host);
1258
1259 return -e;
1260 }
1261 return postcopy_notify_shared_wake(rb,
1262 qemu_ram_block_host_offset(rb,
1263 host));
1264 } else {
1265 return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
1266 }
1267}
1268
1269#else
1270
/* No userfaultfd support in this build: there is nothing to report. */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}
1274
/* No userfaultfd support in this build: report it and return false. */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}
1280
/* No userfaultfd support in this build: report it and fail with -1. */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}
1286
/*
 * Stub for builds without userfaultfd support; asserts because it is not
 * expected to be called.  Returns -1 if assertions are compiled out.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
1292
/*
 * Stub for builds without userfaultfd support; asserts because it is not
 * expected to be called.  Returns -1 if assertions are compiled out.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
1298
/*
 * Stub for builds without userfaultfd support; asserts because it is not
 * expected to be called.  Returns -1 if assertions are compiled out.
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}
1305
/*
 * Stub for builds without userfaultfd support; asserts because it is not
 * expected to be called.  Returns -1 if assertions are compiled out.
 */
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}
1311
/*
 * Stub for builds without userfaultfd support; asserts because it is not
 * expected to be called.  Returns -1 if assertions are compiled out.
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}
1318
/*
 * Stub for builds without userfaultfd support; asserts because it is not
 * expected to be called.  Returns -1 if assertions are compiled out.
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}
1325
/*
 * Stub for builds without userfaultfd support; asserts because it is not
 * expected to be called.  Returns -1 if assertions are compiled out.
 */
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
1333#endif
1334
1335
1336
1337void postcopy_fault_thread_notify(MigrationIncomingState *mis)
1338{
1339 uint64_t tmp64 = 1;
1340
1341
1342
1343
1344
1345 if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
1346
1347 error_report("%s: incrementing failed: %s", __func__,
1348 strerror(errno));
1349 }
1350}
1351
1352
1353
1354
1355
1356
1357
1358
1359
/* The single discard accumulator shared by the three functions below. */
static PostcopyDiscardState pds = {0};

/*
 * Start collecting discard ranges for the RAM block called @name.
 *
 * Resets the entry count and the statistics.  @name is stored by
 * pointer, not copied, so it must stay valid until
 * postcopy_discard_send_finish().  @ms is unused here.
 */
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
1379 unsigned long length)
1380{
1381 size_t tp_size = qemu_target_page_size();
1382
1383 pds.start_list[pds.cur_entry] = start * tp_size;
1384 pds.length_list[pds.cur_entry] = length * tp_size;
1385 trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
1386 pds.cur_entry++;
1387 pds.nsentwords++;
1388
1389 if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
1390
1391 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1392 pds.ramblock_name,
1393 pds.cur_entry,
1394 pds.start_list,
1395 pds.length_list);
1396 pds.nsentcmds++;
1397 pds.cur_entry = 0;
1398 }
1399}
1400
1401
1402
1403
1404
1405
1406
/*
 * Finish the discard sequence for the current RAM block: send any ranges
 * that are still queued and trace the totals.
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}
1422
1423
1424
1425
1426
1427
/* Current incoming postcopy state; accessed only through atomics below. */
static PostcopyState incoming_postcopy_state;

/* Return the current incoming postcopy state (atomic read). */
PostcopyState postcopy_state_get(void)
{
    return qatomic_mb_read(&incoming_postcopy_state);
}
1434
1435
/*
 * Atomically set the incoming postcopy state to @new_state.
 *
 * Returns the state that was in effect before the call.
 */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return qatomic_xchg(&incoming_postcopy_state, new_state);
}
1440
1441
1442
1443
/*
 * Register a shared userfaultfd with the incoming migration.
 *
 * *@pcfd is copied by value into postcopy_remote_fds, so later changes to
 * the caller's structure are not seen by the fault thread.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}
1451
1452
1453
1454void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1455{
1456 guint i;
1457 MigrationIncomingState *mis = migration_incoming_get_current();
1458 GArray *pcrfds = mis->postcopy_remote_fds;
1459
1460 if (!pcrfds) {
1461
1462 return;
1463 }
1464 for (i = 0; i < pcrfds->len; i++) {
1465 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1466 if (cur->fd == pcfd->fd) {
1467 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1468 return;
1469 }
1470 }
1471}
1472