/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert  <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */
#include "qemu/osdep.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "qemu/rcu.h"
#include "sysemu/sysemu.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"

/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd);
}

/*
 * Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and request them from the source; this is done with a
 * userfaultfd, which requires the Linux-specific headers below.
 */
#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs that are currently suspended */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for exit event, necessary for
     * releasing the whole blocktime_ctx
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;

static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    uint32List *list = NULL, *entry = NULL;
    int i;

    for (i = ms->smp.cpus - 1; i >= 0; i--) {
        entry = g_new0(uint32List, 1);
        entry->value = ctx->vcpu_blocktime[i];
        entry->next = list;
        list = entry;
    }

    return list;
}
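
/*
 * This function just populates MigrationInfo from postcopy's
 * blocktime context. It will not populate MigrationInfo,
 * unless postcopy-blocktime capability was set.
 *
 * @info: pointer to MigrationInfo to populate
 */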
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}

static uint32_t get_postcopy_total_blocktime(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return 0;
    }

    return bc->total_blocktime;
}
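
/**
 * receive_ufd_features: query the userfault fd feature set so that only
 * supported features are requested later.
 *
 * Returns: true on success
 *
 * @features: on success, set to the uffdio_api.features reported by the kernel
 */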
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here, __NR_userfaultfd should exist */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}
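
/**
 * request_ufd_features: this function should be called only once on a newly
 * opened ufd; subsequent calls will lead to an error.
 *
 * Returns: true on success
 *
 * @ufd: fd obtained from the userfaultfd syscall
 * @features: bit mask, see UFFD_API_FEATURES
 */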
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}

static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * UFFD_API can only be requested once per fd, and the feature set it
     * reports is the same for every fd, so only query it the first time.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* kernel supports the feature */
        /* don't create blocktime_context if it already exists */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request features even if asked_features is 0; the first UFFDIO_API
     * call on this fd is what performs the handshake with the kernel.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (qemu_real_host_page_size != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
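
/* Callback from postcopy_ram_supported_by_host block iterator.
 */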
static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}
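
/*
 * Note: this has the side effect of munlock'ing all of RAM; that's normally
 * fine since if the postcopy succeeds it gets turned back on at the end.
 */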
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    long pagesize = qemu_real_host_page_size;
    int ufd = -1;
    bool ret = false;
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    Error *local_err = NULL;

    if (qemu_target_page_size() > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
        error_report_err(local_err);
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }

    /* All RAM blocks must be page-size aligned for postcopy */
    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     *  We need to check that the ops we need are supported on anon memory
     *  To do that we need to register a chunk and see the flags that
     *  are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize - 1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}
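
/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */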
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}
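
/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */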
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled;
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}
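
/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * called from arch_init's similarly named ram_postcopy_incoming_init
 */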
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}
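
/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */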
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Let the fault thread quit */
        qatomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
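
/*
 * Disable huge pages on an area
 */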
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
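
/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
 * however, leaving it until after precopy means that most of the precopy
 * data is still THPd
 */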
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
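
/*
 * Mark the given area of RAM as requiring notification to unwritten areas
 * Used as a callback on foreach_not_ignored_block.
 *   rb: RAMBlock to register
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */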
static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
    reg_struct.range.len = qemu_ram_get_used_length(rb);
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}
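
/*
 * Callback from shared fault handlers to ask for a page,
 * the page must be specified by a RAMBlock and an offset in that rb
 * Note: Only for use by shared fault handlers (in fault thread)
 */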
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    migrate_send_rp_req_pages(mis, rb, aligned_rbo, client_addr);
    return 0;
}

static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}

static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}
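
/*
 * Called when a page fault occurs; records the time at which the given
 * vCPU started blocking on the faulted page.
 *
 * @addr: faulted host virtual address
 * @ptid: faulted process thread id
 * @rb: ramblock appropriate to addr
 */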
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        qatomic_inc(&dc->smp_cpus_down);
    }

    qatomic_xchg(&dc->last_begin, low_time_offset);
    qatomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    qatomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check the receive bitmap here, not at the start of the function:
     * an earlier check could race with the bitmap_set done in
     * qemu_ufd_copy_ioctl.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        qatomic_xchg(&dc->vcpu_addr[cpu], 0);
        qatomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        qatomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
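
/*
 * Calculates the per-vCPU blocktime for the faulted page and traces it.
 * Time is added to total_blocktime only for the interval during which
 * *all* vCPUs were blocked simultaneously.
 *
 * Assume we have 3 CPUs:
 *
 *      S1        E1           S1               E1
 * -----***********------------xxx***************------------------------> CPU1
 *
 *             S2                E2
 * ------------****************xxx---------------------------------------> CPU2
 *
 *                         S3            E3
 * ------------------------****xxx********-------------------------------> CPU3
 *
 * Legend:
 *   S - point when a vCPU blocks, E - point when it resumes
 *   * - blocktime per vCPU
 *   x - overlapped blocktime, counted into total blocktime
 *
 * @addr: host virtual address
 */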
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Look up the vCPUs blocked on this address so we can clear them.
     * A linear scan is straightforward but not optimal; a hash or tree
     * keyed by address would be faster.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = qatomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (qatomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        qatomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * Only count total blocktime if every vCPU was down at this
         * point; otherwise this end may be for a prefetched page rather
         * than a fault that stalled the whole guest.
         */
        if (!vcpu_total_blocktime &&
            qatomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }

        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    qatomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - qatomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}

static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}
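
/*
 * Handle faults detected by the USERFAULT markings
 */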
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL;
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN;
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however, we can be told to quit via userfault_event_fd, which is
         * an eventfd
         */
        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * Possibly someone tells us that the return path is
             * broken already using the event. We should hold until
             * the channel is rebuilt.
             */
            if (postcopy_pause_fault_thread(mis)) {
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but not allowed to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (qatomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %u from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                                msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            ret = migrate_send_rp_req_pages(mis, rb, rb_offset,
                                            msg.arg.pagefault.address);
            if (ret) {
                /* May be network failure, try to wait for recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    goto retry;
                } else {
                    /* This is a fatal, unrecoverable failure */
                    error_report("%s: migrate_send_rp_req_pages() failed: %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering from postcopy first */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        /* TODO: Could just disable this sharer */
                        break;
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %u "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue;
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fault from a different address space */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}

int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
        error_report("ram_block_enable_notify failed");
        return -1;
    }

    mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                  PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                  MAP_ANONYMOUS, -1, 0);
    if (mis->postcopy_tmp_page == MAP_FAILED) {
        mis->postcopy_tmp_page = NULL;
        error_report("%s: Failed to map postcopy_tmp_page %s",
                     __func__, strerror(errno));
        return -1;
    }

    /*
     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE
     * for hugepages
     */
    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                       PROT_READ | PROT_WRITE,
                                       MAP_PRIVATE | MAP_ANONYMOUS,
                                       -1, 0);
    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
        int e = errno;
        mis->postcopy_tmp_zero_page = NULL;
        error_report("%s: Failed to map large zero page %s",
                     __func__, strerror(e));
        return -e;
    }
    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);

    trace_postcopy_ram_enable_notify();

    return 0;
}

static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int userfault_fd = mis->userfault_fd;
    int ret;

    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        qemu_mutex_lock(&mis->page_request_mutex);
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        /*
         * If this page resolves a page fault for a previous recorded faulted
         * address, take a special note to maintain the requested page list.
         */
        if (g_tree_lookup(mis->page_requested, host_addr)) {
            g_tree_remove(mis->page_requested, host_addr);
            mis->page_requested_count--;
            trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
        }
        qemu_mutex_unlock(&mis->page_request_mutex);
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
    }
    return ret;
}

int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}
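
/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */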
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}
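
/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */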
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
     * but it's not available for everything (e.g. hugetlbpages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
    }
}

#else
/* No target OS support, stubs just fail */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif

/* ------------------------------------------------------------------------- */

void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're going to increment it to 1
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}
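
/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 */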
static PostcopyDiscardState pds = {0};
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}
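
/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */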
void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
                                 unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert to byte offsets within the RAM block */
    pds.start_list[pds.cur_entry] = start * tp_size;
    pds.length_list[pds.cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
    pds.cur_entry++;
    pds.nsentwords++;

    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
        pds.cur_entry = 0;
    }
}
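
/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 *   bitmap code. Sends any outstanding discard messages.
 *
 * @ms: Current migration state.
 */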
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}
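
/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */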
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return qatomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return qatomic_xchg(&incoming_postcopy_state, new_state);
}
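
/* Register a handler for external shared memory postcopy
 * called on the destination.
 */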
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}
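
/* Unregister a handler for external shared memory postcopy
 */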
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}