1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20#include "qemu/rcu.h"
21#include "qemu/madvise.h"
22#include "exec/target_page.h"
23#include "migration.h"
24#include "qemu-file.h"
25#include "savevm.h"
26#include "postcopy-ram.h"
27#include "ram.h"
28#include "qapi/error.h"
29#include "qemu/notify.h"
30#include "qemu/rcu.h"
31#include "sysemu/sysemu.h"
32#include "qemu/error-report.h"
33#include "trace.h"
34#include "hw/boards.h"
35#include "exec/ramblock.h"
36
37
38
39
/* Flush a discard command to the wire once this many ranges are queued */
#define MAX_DISCARDS_PER_COMMAND 12

/*
 * Accumulator for discard ranges of a single RAMBlock; ranges are buffered
 * in start_list/length_list and shipped in batches via
 * qemu_savevm_send_postcopy_ram_discard().
 */
struct PostcopyDiscardState {
    const char *ramblock_name;  /* idstr of the RAMBlock being discarded */
    uint16_t cur_entry;         /* next free slot in the arrays below */
    /*
     * Parallel arrays: start_list[i]/length_list[i] describe one queued
     * discard range, in bytes.
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;    /* total ranges queued (for tracing) */
    unsigned int nsentcmds;     /* total commands sent (for tracing) */
};
53
/* Notifiers fired at key points of the postcopy lifecycle (probe, end, ...) */
static NotifierWithReturnList postcopy_notifier_list;

/* One-time initialisation of the postcopy notifier chain. */
void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}
60
/* Register a notifier to be invoked by postcopy_notify(). */
void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}
65
/* Unregister a notifier previously added with postcopy_add_notifier(). */
void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
70
71int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
72{
73 struct PostcopyNotifyData pnd;
74 pnd.reason = reason;
75 pnd.errp = errp;
76
77 return notifier_with_return_list_notify(&postcopy_notifier_list,
78 &pnd);
79}
80
81
82
83
84
/*
 * Spawn a migration thread and block until it posts mis->thread_sync_sem,
 * i.e. until fn() has completed its early setup.  The semaphore only lives
 * for the duration of this handshake.
 */
void postcopy_thread_create(MigrationIncomingState *mis,
                            QemuThread *thread, const char *name,
                            void *(*fn)(void *), int joinable)
{
    qemu_sem_init(&mis->thread_sync_sem, 0);
    qemu_thread_create(thread, name, fn, mis, joinable);
    qemu_sem_wait(&mis->thread_sync_sem);
    qemu_sem_destroy(&mis->thread_sync_sem);
}
94
95
96
97
98
99#if defined(__linux__)
100
101#include <poll.h>
102#include <sys/ioctl.h>
103#include <sys/syscall.h>
104#include <asm/types.h>
105#endif
106
107#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
108#include <sys/eventfd.h>
109#include <linux/userfaultfd.h>
110
/* Tracks how long vCPUs are blocked on postcopy page faults (dest side). */
typedef struct PostcopyBlocktimeContext {
    /* time (ms offset) when each vCPU's current page fault began; 0 = none */
    uint32_t *page_fault_vcpu_time;
    /* host address each vCPU is currently faulting on; 0 = not blocked */
    uintptr_t *vcpu_addr;
    /* accumulated time (ms) during which ALL vCPUs were blocked at once */
    uint32_t total_blocktime;
    /* accumulated blocked time (ms) per vCPU */
    uint32_t *vcpu_blocktime;
    /* time (ms offset) of the most recently recorded fault */
    uint32_t last_begin;
    /* number of vCPUs currently blocked on a fault */
    int smp_cpus_down;
    /* QEMU_CLOCK_REALTIME (ms) when tracking began; offsets are relative */
    uint64_t start_time;

    /*
     * Handler for exit event, which is used to free this context;
     * registered in blocktime_context_new().
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;
131
/* Free a blocktime context together with its per-vCPU arrays. */
static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

/* Exit notifier callback: release the blocktime context at shutdown. */
static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}
146
/*
 * Allocate a blocktime context sized for the current vCPU count, record
 * the tracking start time, and register an exit notifier that frees it.
 */
static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}
161
162static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
163{
164 MachineState *ms = MACHINE(qdev_get_machine());
165 uint32List *list = NULL;
166 int i;
167
168 for (i = ms->smp.cpus - 1; i >= 0; i--) {
169 QAPI_LIST_PREPEND(list, ctx->vcpu_blocktime[i]);
170 }
171
172 return list;
173}
174
175
176
177
178
179
180
181
182void fill_destination_postcopy_migration_info(MigrationInfo *info)
183{
184 MigrationIncomingState *mis = migration_incoming_get_current();
185 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
186
187 if (!bc) {
188 return;
189 }
190
191 info->has_postcopy_blocktime = true;
192 info->postcopy_blocktime = bc->total_blocktime;
193 info->has_postcopy_vcpu_blocktime = true;
194 info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
195}
196
197static uint32_t get_postcopy_total_blocktime(void)
198{
199 MigrationIncomingState *mis = migration_incoming_get_current();
200 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
201
202 if (!bc) {
203 return 0;
204 }
205
206 return bc->total_blocktime;
207}
208
209
210
211
212
213
214
215
216
217
218
219static bool receive_ufd_features(uint64_t *features)
220{
221 struct uffdio_api api_struct = {0};
222 int ufd;
223 bool ret = true;
224
225
226 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
227 if (ufd == -1) {
228 error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
229 strerror(errno));
230 return false;
231 }
232
233
234 api_struct.api = UFFD_API;
235 api_struct.features = 0;
236 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
237 error_report("%s: UFFDIO_API failed: %s", __func__,
238 strerror(errno));
239 ret = false;
240 goto release_ufd;
241 }
242
243 *features = api_struct.features;
244
245release_ufd:
246 close(ufd);
247 return ret;
248}
249
250
251
252
253
254
255
256
257
258
259static bool request_ufd_features(int ufd, uint64_t features)
260{
261 struct uffdio_api api_struct = {0};
262 uint64_t ioctl_mask;
263
264 api_struct.api = UFFD_API;
265 api_struct.features = features;
266 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
267 error_report("%s failed: UFFDIO_API failed: %s", __func__,
268 strerror(errno));
269 return false;
270 }
271
272 ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
273 (__u64)1 << _UFFDIO_UNREGISTER;
274 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
275 error_report("Missing userfault features: %" PRIx64,
276 (uint64_t)(~api_struct.ioctls & ioctl_mask));
277 return false;
278 }
279
280 return true;
281}
282
/*
 * Negotiate the userfault feature set on @ufd: query supported features
 * once (cached across calls), ask for THREAD_ID when available (creating
 * the blocktime context if blocktime tracking is enabled), and check
 * hugepage support when any RAMBlock's page size differs from the host's.
 */
static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * It's not possible to request UFFD_API twice on one fd, and the
     * supported feature set is a host property, so only probe once.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (UFFD_FEATURE_THREAD_ID & supported_features) {
        asked_features |= UFFD_FEATURE_THREAD_ID;
        if (migrate_postcopy_blocktime()) {
            if (!mis->blocktime_ctx) {
                /* Lazily created; freed by its exit notifier */
                mis->blocktime_ctx = blocktime_context_new();
            }
        }
    }
#endif

    /*
     * Request features even when asked_features is 0: the kernel requires
     * a UFFD_API handshake on each fd before UFFDIO_REGISTER will work.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (qemu_real_host_page_size != ram_pagesize_summary()) {
        bool have_hp = false;
        /* Some RAMBlock uses a non-host page size, i.e. huge pages */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
335
336
337
338static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
339{
340 const char *block_name = qemu_ram_get_idstr(rb);
341 ram_addr_t length = qemu_ram_get_used_length(rb);
342 size_t pagesize = qemu_ram_pagesize(rb);
343
344 if (length % pagesize) {
345 error_report("Postcopy requires RAM blocks to be a page size multiple,"
346 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
347 "page size of 0x%zx", block_name, length, pagesize);
348 return 1;
349 }
350 return 0;
351}
352
353
354
355
356
357
358bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
359{
360 long pagesize = qemu_real_host_page_size;
361 int ufd = -1;
362 bool ret = false;
363 void *testarea = NULL;
364 struct uffdio_register reg_struct;
365 struct uffdio_range range_struct;
366 uint64_t feature_mask;
367 Error *local_err = NULL;
368
369 if (qemu_target_page_size() > pagesize) {
370 error_report("Target page size bigger than host page size");
371 goto out;
372 }
373
374 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
375 if (ufd == -1) {
376 error_report("%s: userfaultfd not available: %s", __func__,
377 strerror(errno));
378 goto out;
379 }
380
381
382 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
383 error_report_err(local_err);
384 goto out;
385 }
386
387
388 if (!ufd_check_and_apply(ufd, mis)) {
389 goto out;
390 }
391
392
393 if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
394 goto out;
395 }
396
397
398
399
400
401 if (munlockall()) {
402 error_report("%s: munlockall: %s", __func__, strerror(errno));
403 goto out;
404 }
405
406
407
408
409
410
411 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
412 MAP_ANONYMOUS, -1, 0);
413 if (testarea == MAP_FAILED) {
414 error_report("%s: Failed to map test area: %s", __func__,
415 strerror(errno));
416 goto out;
417 }
418 g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
419
420 reg_struct.range.start = (uintptr_t)testarea;
421 reg_struct.range.len = pagesize;
422 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
423
424 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
425 error_report("%s userfault register: %s", __func__, strerror(errno));
426 goto out;
427 }
428
429 range_struct.start = (uintptr_t)testarea;
430 range_struct.len = pagesize;
431 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
432 error_report("%s userfault unregister: %s", __func__, strerror(errno));
433 goto out;
434 }
435
436 feature_mask = (__u64)1 << _UFFDIO_WAKE |
437 (__u64)1 << _UFFDIO_COPY |
438 (__u64)1 << _UFFDIO_ZEROPAGE;
439 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
440 error_report("Missing userfault map features: %" PRIx64,
441 (uint64_t)(~reg_struct.ioctls & feature_mask));
442 goto out;
443 }
444
445
446 ret = true;
447out:
448 if (testarea) {
449 munmap(testarea, pagesize);
450 }
451 if (ufd != -1) {
452 close(ufd);
453 }
454 return ret;
455}
456
457
458
459
460
461
/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to precopy.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * Save the used_length before running the guest: later cleanup_range()
     * and the registration path use this recorded value rather than
     * re-reading the (possibly resized) block length.
     */
    rb->postcopy_length = length;

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd -
     * we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard.)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}
489
490
491
492
493
/*
 * At the end of migration, undo the effects of init_range; @opaque is
 * the MigrationIncomingState.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled;
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}
526
527
528
529
530
531
/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go
 * into postcopy later; must be called prior to any precopy.
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}
540
541static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis)
542{
543 int i;
544
545 if (mis->postcopy_tmp_pages) {
546 for (i = 0; i < mis->postcopy_channels; i++) {
547 if (mis->postcopy_tmp_pages[i].tmp_huge_page) {
548 munmap(mis->postcopy_tmp_pages[i].tmp_huge_page,
549 mis->largest_page_size);
550 mis->postcopy_tmp_pages[i].tmp_huge_page = NULL;
551 }
552 }
553 g_free(mis->postcopy_tmp_pages);
554 mis->postcopy_tmp_pages = NULL;
555 }
556
557 if (mis->postcopy_tmp_zero_page) {
558 munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
559 mis->postcopy_tmp_zero_page = NULL;
560 }
561}
562
563
564
565
/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 * Stops the fault thread, unregisters RAM from userfault, closes the
 * fds, restores mlock if requested, and frees the staging pages.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Let the fault thread quit */
        qatomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point: we have a
             * valid VM state, so just report and carry on.
             */
        }
    }

    postcopy_temp_pages_cleanup(mis);

    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
612
613
614
615
/*
 * Disable huge pages on an area.
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = rb->postcopy_length;
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good fit, so disable hugepage on the whole region.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
633
634
635
636
637
638
/*
 * Called at the start of the discard phase: switch all RAM to
 * no-hugepage so the upcoming discards really punch holes, then move
 * the incoming postcopy state to DISCARD.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
649
650
651
652
653
654
655
656
657
658
659static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
660{
661 MigrationIncomingState *mis = opaque;
662 struct uffdio_register reg_struct;
663
664 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
665 reg_struct.range.len = rb->postcopy_length;
666 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
667
668
669 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
670 error_report("%s userfault register: %s", __func__, strerror(errno));
671 return -1;
672 }
673 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
674 error_report("%s userfault: Region doesn't support COPY", __func__);
675 return -1;
676 }
677 if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
678 qemu_ram_set_uf_zeroable(rb);
679 }
680
681 return 0;
682}
683
/*
 * Wake threads of an external process blocked on the page containing
 * @client_addr within @rb, via UFFDIO_WAKE on the client's userfaultfd.
 * Returns the ioctl result: 0 on success.
 */
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    /* Wake the whole host page containing the faulting address */
    range.start = ROUND_DOWN(client_addr, pagesize);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}
702
/*
 * Request the page at offset @start in @rb (host address @haddr) from
 * the migration source — unless the page was discarded by the migration,
 * in which case place a zero page locally (or do nothing if already
 * received).
 */
static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
                                 ram_addr_t start, uint64_t haddr)
{
    void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));

    /*
     * Discarded pages (via RamDiscardManager) are never migrated.  On an
     * unlikely access, place a zeropage, which will also set the relevant
     * bits in the recv_bitmap accordingly, so we won't try placing a
     * zeropage twice.
     *
     * Checking a single bit is sufficient to handle pagesize > TPS, as
     * either all relevant bits are set or none are.
     */
    assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
    if (ramblock_page_is_discarded(rb, start)) {
        bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);

        return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
    }

    return migrate_send_rp_req_pages(mis, rb, start, haddr);
}
725
726
727
728
729
730
/*
 * Callback from shared fault handlers to ask for a page: if it has
 * already arrived, just wake the client; otherwise forward the request
 * towards the source.  Only for use by shared fault handlers (in the
 * fault thread).
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        /* Page already arrived: wake the blocked client thread(s) */
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    postcopy_request_page(mis, rb, aligned_rbo, client_addr);
    return 0;
}
747
748static int get_mem_fault_cpu_index(uint32_t pid)
749{
750 CPUState *cpu_iter;
751
752 CPU_FOREACH(cpu_iter) {
753 if (cpu_iter->thread_id == pid) {
754 trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
755 return cpu_iter->cpu_index;
756 }
757 }
758 trace_get_mem_fault_cpu_index(-1, pid);
759 return -1;
760}
761
/*
 * Milliseconds elapsed since blocktime tracking started, clamped to a
 * minimum of 1 (0 is reserved to mean "not blocked") and truncated to
 * the low 32 bits.
 */
static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}
768
769
770
771
772
773
774
775
776
/*
 * Called when a page fault occurs: record that the vCPU owning thread
 * @ptid is now blocked on host address @addr.  Destination side only.
 *
 * @addr: faulted host virtual address
 * @ptid: faulting thread id (0 when the kernel didn't supply one)
 * @rb: RAMBlock containing addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        /* Tracking disabled, or no thread id available */
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        /* Fault came from a non-vCPU thread */
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        /* This vCPU goes from running to blocked */
        qatomic_inc(&dc->smp_cpus_down);
    }

    qatomic_xchg(&dc->last_begin, low_time_offset);
    qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    qatomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check the recv bitmap here, not at the beginning of the function:
     * the check could otherwise race with the bitmap_set in
     * qemu_ufd_copy_ioctl and leave a stale "blocked" record.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        /* Page raced in already: undo the blocked accounting */
        qatomic_xchg(&dc->vcpu_addr[cpu], 0);
        qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        qatomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
/*
 * Called when the page at @addr has been placed: close the blocked
 * interval for every vCPU faulting on that address, accumulate the
 * per-vCPU blocktime, and — when all vCPUs were blocked simultaneously —
 * also add to total_blocktime.
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        /* Blocktime tracking not enabled */
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Linear scan of all vCPUs to find those blocked on this address.
     * Straightforward but not optimal; a hash/tree keyed by address
     * would scale better.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        /* qatomic_fetch_add(p, 0) is an atomic read */
        read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        qatomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * Count towards total blocktime only if, at this point, every
         * vCPU is recorded as blocked; check once per call.
         */
        if (!vcpu_total_blocktime &&
            qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }
        /* Keep scanning: one page may block several vCPUs */
        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    qatomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - qatomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
893
/*
 * Park the fault thread on the pause semaphore until the migration is
 * resumed (e.g. after a network failure and recovery).
 */
static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();
    qemu_sem_wait(&mis->postcopy_pause_sem_fault);
    trace_postcopy_pause_fault_thread_continued();
}
900
901
902
903
/*
 * Handle faults detected by the USERFAULT markings: poll the main
 * userfaultfd, the quit eventfd, and any shared-memory client ufds;
 * forward each fault as a page request to the source (or to the client's
 * registered handler).
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL;
    /* Handshake with postcopy_thread_create() */
    qemu_sem_post(&mis->thread_sync_sem);

    struct pollfd *pfd;
    /* Slots: [0] userfault_fd, [1] quit eventfd, [2..] shared client fds */
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN;
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however we can also be told to quit via userfault_event_fd,
         * which is an eventfd.
         */
        poll_result = poll(pfd, pfd_len, -1 /* wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * The return path is broken (possibly signalled via the
             * event); hold here until the channel is rebuilt.
             */
            postcopy_pause_fault_thread(mis);
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the eventfd signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error */
                error_report("%s: read() failed", __func__);
            }

            if (qatomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * A wake-up on another thread just after the poll
                     * can leave nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    /* Short read: we've lost message alignment */
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break;
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %ud from userfaultfd",
                             __func__, msg.event);
                continue;
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            /* Round down to the RAMBlock's page size */
            rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source; retry after a pause on
             * failure (e.g. broken network), until it succeeds.
             */
            ret = postcopy_request_page(mis, rb, rb_offset,
                                        msg.arg.pagefault.address);
            if (ret) {
                /* May be a network failure: wait for recovery */
                postcopy_pause_fault_thread(mis);
                goto retry;
            }
        }

        /* Now handle any faults from external processes on shared memory */
        /* TODO: May need to handle devices deregistering? */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    /* Stop polling this fd, but keep servicing the rest */
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * A wake-up on another thread just after the
                         * poll can leave nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* Could just do a continue here? */
                        break;
                    } else {
                        /* Short read: lost message alignment */
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        break;
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %ud "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue;
                }
                /* Call the handler the device registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fail? Disable this sharer? */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}
1098
1099static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
1100{
1101 PostcopyTmpPage *tmp_page;
1102 int err, i, channels;
1103 void *temp_page;
1104
1105
1106 mis->postcopy_channels = 1;
1107
1108 channels = mis->postcopy_channels;
1109 mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels);
1110
1111 for (i = 0; i < channels; i++) {
1112 tmp_page = &mis->postcopy_tmp_pages[i];
1113 temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE,
1114 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1115 if (temp_page == MAP_FAILED) {
1116 err = errno;
1117 error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s",
1118 __func__, i, strerror(err));
1119
1120 return -err;
1121 }
1122 tmp_page->tmp_huge_page = temp_page;
1123
1124 postcopy_temp_page_reset(tmp_page);
1125 }
1126
1127
1128
1129
1130 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1131 PROT_READ | PROT_WRITE,
1132 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1133 if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1134 err = errno;
1135 mis->postcopy_tmp_zero_page = NULL;
1136 error_report("%s: Failed to map large zero page %s",
1137 __func__, strerror(err));
1138 return -err;
1139 }
1140
1141 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1142
1143 return 0;
1144}
1145
/*
 * Bring up the incoming side of postcopy: open the userfaultfd,
 * negotiate its features, start the fault thread, register all
 * RAMBlocks for missing-page notification, and allocate staging pages.
 * Returns 0 on success, -1 on failure.
 */
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * repeat the handshake on this new fd before it can be registered.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        /*
         * NOTE(review): userfault_fd stays open on this (and the later
         * thread/registration) error paths — confirm the caller is
         * responsible for cleaning it up.
         */
        return -1;
    }

    /* An eventfd used to tell the fault thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    postcopy_thread_create(mis, &mis->fault_thread, "postcopy/fault",
                           postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
        error_report("ram_block_enable_notify failed");
        return -1;
    }

    if (postcopy_temp_pages_setup(mis)) {
        /* Error already reported in the sub-function */
        return -1;
    }

    trace_postcopy_ram_enable_notify();

    return 0;
}
1192
1193static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
1194 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1195{
1196 int userfault_fd = mis->userfault_fd;
1197 int ret;
1198
1199 if (from_addr) {
1200 struct uffdio_copy copy_struct;
1201 copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1202 copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1203 copy_struct.len = pagesize;
1204 copy_struct.mode = 0;
1205 ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct);
1206 } else {
1207 struct uffdio_zeropage zero_struct;
1208 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1209 zero_struct.range.len = pagesize;
1210 zero_struct.mode = 0;
1211 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1212 }
1213 if (!ret) {
1214 qemu_mutex_lock(&mis->page_request_mutex);
1215 ramblock_recv_bitmap_set_range(rb, host_addr,
1216 pagesize / qemu_target_page_size());
1217
1218
1219
1220
1221 if (g_tree_lookup(mis->page_requested, host_addr)) {
1222 g_tree_remove(mis->page_requested, host_addr);
1223 mis->page_requested_count--;
1224 trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
1225 }
1226 qemu_mutex_unlock(&mis->page_request_mutex);
1227 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1228 }
1229 return ret;
1230}
1231
1232int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1233{
1234 int i;
1235 MigrationIncomingState *mis = migration_incoming_get_current();
1236 GArray *pcrfds = mis->postcopy_remote_fds;
1237
1238 for (i = 0; i < pcrfds->len; i++) {
1239 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1240 int ret = cur->waker(cur, rb, offset);
1241 if (ret) {
1242 return ret;
1243 }
1244 }
1245 return 0;
1246}
1247
1248
1249
1250
1251
/*
 * Place a host page (from) at (host) atomically.
 * Returns 0 on success, -errno on failure.
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /*
     * The copy also acks to the kernel, waking any stalled thread up.
     * TODO: we could inhibit that ack and only do it when requested,
     * which would be slightly cheaper, but we'd have to be careful
     * about the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    /* Also wake any external sharers blocked on this page */
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}
1274
1275
1276
1277
1278
/*
 * Place a zero page at (host) atomically.
 * Returns 0 on success, -errno on failure.
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /*
     * Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE, but it's
     * not available for everything (e.g. hugetlb pages).
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* Fall back to copying from the pre-zeroed staging page */
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
    }
}
1303
1304#else
1305
/*
 * Stubs for hosts without userfaultfd support.  The probe entry point
 * reports "no OS support"; all other entry points should be unreachable
 * (postcopy can never be activated here) and simply assert.
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
1368#endif
1369
1370
/* Reset a staging page to its "no huge page in flight" state. */
void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page)
{
    tmp_page->target_pages = 0;
    tmp_page->host_addr = NULL;
    /*
     * Set to true on reset; cleared as soon as any non-zero small page
     * is received within this huge page.
     */
    tmp_page->all_zero = true;
}
1381
/* Kick the fault thread via its eventfd (e.g. to ask it to quit). */
void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're going to increment it to 1.
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}
1396
1397
1398
1399
1400
1401
1402
1403
1404
/*
 * Single file-static accumulator: only one RAMBlock's discards are
 * built at a time (init -> range... -> finish), so no per-block
 * allocation is needed.
 */
static PostcopyDiscardState pds = {0};

/*
 * Called at the start of each RAMBlock by the bitmap code: reset the
 * accumulator for block @name.
 */
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
/*
 * Called by the bitmap code for each chunk to discard.  May send a
 * discard message immediately, or just leave the range queued to be
 * sent later.
 * @start, @length: range of target pages within the current RAMBlock
 * (length=1 means one target page).
 */
void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
                                 unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert from target-page units to byte offsets within the block */
    pds.start_list[pds.cur_entry] = start * tp_size;
    pds.length_list[pds.cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
    pds.cur_entry++;
    pds.nsentwords++;

    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set: ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
        pds.cur_entry = 0;
    }
}
1445
1446
1447
1448
1449
1450
1451
/*
 * Called at the end of each RAMBlock by the bitmap code: flush any
 * queued-but-unsent discard ranges and trace the totals.
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}
1467
1468
1469
1470
1471
1472
/* Current incoming postcopy state; always accessed with atomic ops */
static PostcopyState incoming_postcopy_state;

/* Atomically read the current incoming postcopy state. */
PostcopyState postcopy_state_get(void)
{
    return qatomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state. */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return qatomic_xchg(&incoming_postcopy_state, new_state);
}
1485
1486
1487
1488
/*
 * Register a userfaultfd owned by an external process (a shared-memory
 * client) so the fault thread will also poll and service it.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}
1496
1497
1498
1499void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1500{
1501 guint i;
1502 MigrationIncomingState *mis = migration_incoming_get_current();
1503 GArray *pcrfds = mis->postcopy_remote_fds;
1504
1505 if (!pcrfds) {
1506
1507 return;
1508 }
1509 for (i = 0; i < pcrfds->len; i++) {
1510 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1511 if (cur->fd == pcfd->fd) {
1512 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1513 return;
1514 }
1515 }
1516}
1517