1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20#include "exec/target_page.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "savevm.h"
24#include "postcopy-ram.h"
25#include "ram.h"
26#include "qapi/error.h"
27#include "qemu/notify.h"
28#include "sysemu/sysemu.h"
29#include "sysemu/balloon.h"
30#include "qemu/error-report.h"
31#include "trace.h"
32
33
34
35
36#define MAX_DISCARDS_PER_COMMAND 12
37
/*
 * Accumulator for building up a postcopy RAM discard command on the
 * source side: up to MAX_DISCARDS_PER_COMMAND ranges are batched and
 * flushed as a single command (see postcopy_discard_send_range/finish).
 */
struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;   /* next free slot in start_list/length_list */

    /* Pending discard ranges: start address and length in bytes
     * (callers pass target-page numbers; they are scaled on insert). */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;  /* statistics: total ranges queued */
    unsigned int nsentcmds;   /* statistics: discard commands sent */
};
49
50static NotifierWithReturnList postcopy_notifier_list;
51
/* One-time init of the postcopy notifier list; called at startup. */
void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}
56
/* Register @nn to be called on postcopy events (see postcopy_notify). */
void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}
61
/* Unregister a notifier previously added with postcopy_add_notifier(). */
void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}
66
67int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
68{
69 struct PostcopyNotifyData pnd;
70 pnd.reason = reason;
71 pnd.errp = errp;
72
73 return notifier_with_return_list_notify(&postcopy_notifier_list,
74 &pnd);
75}
76
77
78
79
80
81#if defined(__linux__)
82
83#include <poll.h>
84#include <sys/ioctl.h>
85#include <sys/syscall.h>
86#include <asm/types.h>
87#endif
88
89#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
90#include <sys/eventfd.h>
91#include <linux/userfaultfd.h>
92
typedef struct PostcopyBlocktimeContext {
    /* time when page fault was initiated, per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* faulted page address, per vCPU (0 = not currently blocked) */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* accumulated blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when the most recent page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended on a fault */
    int smp_cpus_down;
    uint64_t start_time;  /* ms timestamp all offsets are relative to */

    /*
     * Exit notifier: releases the whole blocktime context when QEMU
     * terminates (see migration_exit_cb).
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;
113
/* Free a blocktime context together with the per-vCPU arrays it owns. */
static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}
121
/* Exit-notifier callback: tear down the owning blocktime context. */
static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}
128
129static struct PostcopyBlocktimeContext *blocktime_context_new(void)
130{
131 PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
132 ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
133 ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
134 ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
135
136 ctx->exit_notifier.notify = migration_exit_cb;
137 ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
138 qemu_add_exit_notifier(&ctx->exit_notifier);
139 return ctx;
140}
141
142static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
143{
144 uint32List *list = NULL, *entry = NULL;
145 int i;
146
147 for (i = smp_cpus - 1; i >= 0; i--) {
148 entry = g_new0(uint32List, 1);
149 entry->value = ctx->vcpu_blocktime[i];
150 entry->next = list;
151 list = entry;
152 }
153
154 return list;
155}
156
157
158
159
160
161
162
163
/*
 * Fill the destination-side blocktime fields of a MigrationInfo for
 * query-migrate.  No-op when blocktime tracking was never enabled.
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}
178
179static uint32_t get_postcopy_total_blocktime(void)
180{
181 MigrationIncomingState *mis = migration_incoming_get_current();
182 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
183
184 if (!bc) {
185 return 0;
186 }
187
188 return bc->total_blocktime;
189}
190
191
192
193
194
195
196
197
198
199
200
201static bool receive_ufd_features(uint64_t *features)
202{
203 struct uffdio_api api_struct = {0};
204 int ufd;
205 bool ret = true;
206
207
208 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
209 if (ufd == -1) {
210 error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
211 strerror(errno));
212 return false;
213 }
214
215
216 api_struct.api = UFFD_API;
217 api_struct.features = 0;
218 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
219 error_report("%s: UFFDIO_API failed: %s", __func__,
220 strerror(errno));
221 ret = false;
222 goto release_ufd;
223 }
224
225 *features = api_struct.features;
226
227release_ufd:
228 close(ufd);
229 return ret;
230}
231
232
233
234
235
236
237
238
239
240
241static bool request_ufd_features(int ufd, uint64_t features)
242{
243 struct uffdio_api api_struct = {0};
244 uint64_t ioctl_mask;
245
246 api_struct.api = UFFD_API;
247 api_struct.features = features;
248 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
249 error_report("%s failed: UFFDIO_API failed: %s", __func__,
250 strerror(errno));
251 return false;
252 }
253
254 ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
255 (__u64)1 << _UFFDIO_UNREGISTER;
256 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
257 error_report("Missing userfault features: %" PRIx64,
258 (uint64_t)(~api_struct.ioctls & ioctl_mask));
259 return false;
260 }
261
262 return true;
263}
264
/*
 * Check and enable the userfault features we need on @ufd.
 *
 * @ufd: fd obtained from the userfaultfd syscall
 * @mis: incoming migration state; may be NULL on the probe-only call
 *       from postcopy_ram_supported_by_host()
 *
 * Returns true on success.
 */
static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * The kernel only accepts UFFDIO_API once per fd, and the supported
     * feature set is a host property, so probe it once and cache the
     * answer in the static above.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* Kernel can report the faulting thread id: enable blocktime
         * tracking.  Keep any context that already exists. */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request the features, even when asked_features is 0: the kernel
     * requires a UFFDIO_API handshake before UFFDIO_REGISTER on each
     * userfault fd.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* RAM includes huge pages: userfault must support them */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
319
320
321
322static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
323{
324 const char *block_name = qemu_ram_get_idstr(rb);
325 ram_addr_t length = qemu_ram_get_used_length(rb);
326 size_t pagesize = qemu_ram_pagesize(rb);
327
328 if (length % pagesize) {
329 error_report("Postcopy requires RAM blocks to be a page size multiple,"
330 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
331 "page size of 0x%zx", block_name, length, pagesize);
332 return 1;
333 }
334 return 0;
335}
336
337
338
339
340
341
342bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
343{
344 long pagesize = getpagesize();
345 int ufd = -1;
346 bool ret = false;
347 void *testarea = NULL;
348 struct uffdio_register reg_struct;
349 struct uffdio_range range_struct;
350 uint64_t feature_mask;
351 Error *local_err = NULL;
352
353 if (qemu_target_page_size() > pagesize) {
354 error_report("Target page size bigger than host page size");
355 goto out;
356 }
357
358 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
359 if (ufd == -1) {
360 error_report("%s: userfaultfd not available: %s", __func__,
361 strerror(errno));
362 goto out;
363 }
364
365
366 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
367 error_report_err(local_err);
368 goto out;
369 }
370
371
372 if (!ufd_check_and_apply(ufd, mis)) {
373 goto out;
374 }
375
376
377 if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
378 goto out;
379 }
380
381
382
383
384
385 if (munlockall()) {
386 error_report("%s: munlockall: %s", __func__, strerror(errno));
387 return -1;
388 }
389
390
391
392
393
394
395 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
396 MAP_ANONYMOUS, -1, 0);
397 if (testarea == MAP_FAILED) {
398 error_report("%s: Failed to map test area: %s", __func__,
399 strerror(errno));
400 goto out;
401 }
402 g_assert(((size_t)testarea & (pagesize-1)) == 0);
403
404 reg_struct.range.start = (uintptr_t)testarea;
405 reg_struct.range.len = pagesize;
406 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
407
408 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
409 error_report("%s userfault register: %s", __func__, strerror(errno));
410 goto out;
411 }
412
413 range_struct.start = (uintptr_t)testarea;
414 range_struct.len = pagesize;
415 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
416 error_report("%s userfault unregister: %s", __func__, strerror(errno));
417 goto out;
418 }
419
420 feature_mask = (__u64)1 << _UFFDIO_WAKE |
421 (__u64)1 << _UFFDIO_COPY |
422 (__u64)1 << _UFFDIO_ZEROPAGE;
423 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
424 error_report("Missing userfault map features: %" PRIx64,
425 (uint64_t)(~reg_struct.ioctls & feature_mask));
426 goto out;
427 }
428
429
430 ret = true;
431out:
432 if (testarea) {
433 munmap(testarea, pagesize);
434 }
435 if (ufd != -1) {
436 close(ufd);
437 }
438 return ret;
439}
440
441
442
443
444
445
/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque is unused; called via foreach_not_ignored_block.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd -
     * we're going to get the copy from the source anyway.
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}
466
467
468
469
470
/*
 * At the end of migration, undo the effects of init_range:
 * re-enable huge pages and unregister the range from userfault.
 * opaque is the MigrationIncomingState.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled;
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}
503
504
505
506
507
508
/*
 * Initialise postcopy-ram: discard all RAM so every page comes from the
 * source.  Must be called prior to precopy starting.
 * Returns 0 on success.
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}
517
518
519
520
521
/*
 * Manage a single transition of the balloon-inhibit counter: only call
 * qemu_balloon_inhibit() when the requested state actually changes, so
 * the counter never drifts from repeated identical requests.
 */
static void postcopy_balloon_inhibit(bool state)
{
    static bool cur_state = false;

    if (state == cur_state) {
        return;
    }

    qemu_balloon_inhibit(state);
    cur_state = state;
}
531
532
533
534
/*
 * Tear down postcopy on the destination at the end of a migration where
 * postcopy_ram_incoming_init was called: stop the fault thread, notify
 * listeners, unregister RAM from userfault and release temporary pages.
 * Returns 0 on success.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Ask the fault thread to quit and wake it via the eventfd */
        atomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    postcopy_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * Don't fail at this point: we have a valid VM state and
             * mlock failure is not worth aborting the migration over.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);

    /* Release the scratch pages used for atomic page placement */
    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
        get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
591
592
593
594
/*
 * Disable huge pages on an area; callback for foreach_not_ignored_block.
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
612
613
614
615
616
617
/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
 * however, leaving it until after precopy means that most of the precopy
 * data is still THPd.  Also advances the postcopy state machine.
 * Returns 0 on success.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
628
629
630
631
632
633
634
635
636
637
638static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
639{
640 MigrationIncomingState *mis = opaque;
641 struct uffdio_register reg_struct;
642
643 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
644 reg_struct.range.len = qemu_ram_get_used_length(rb);
645 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
646
647
648 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
649 error_report("%s userfault register: %s", __func__, strerror(errno));
650 return -1;
651 }
652 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
653 error_report("%s userfault: Region doesn't support COPY", __func__);
654 return -1;
655 }
656 if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
657 qemu_ram_set_uf_zeroable(rb);
658 }
659
660 return 0;
661}
662
663int postcopy_wake_shared(struct PostCopyFD *pcfd,
664 uint64_t client_addr,
665 RAMBlock *rb)
666{
667 size_t pagesize = qemu_ram_pagesize(rb);
668 struct uffdio_range range;
669 int ret;
670 trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
671 range.start = client_addr & ~(pagesize - 1);
672 range.len = pagesize;
673 ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
674 if (ret) {
675 error_report("%s: Failed to wake: %zx in %s (%s)",
676 __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
677 strerror(errno));
678 }
679 return ret;
680}
681
682
683
684
685
686
/*
 * Callback from shared fault handlers to ask for a page: if we already
 * have the page (per the receive bitmap) just wake the faulter,
 * otherwise forward a page request to the source.
 * Note: only for use by shared fault handlers (in the fault thread).
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space: NULL means "same block as last request" */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}
711
/*
 * Map a faulting thread id (as reported by userfaultfd with
 * UFFD_FEATURE_THREAD_ID) to a vCPU index; -1 if no vCPU matches.
 */
static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}
725
726static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
727{
728 int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
729 dc->start_time;
730 return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
731}
732
733
734
735
736
737
738
739
740
/*
 * Called when a page fault occurs: records the fault start time and
 * address for the faulting vCPU so the blocked interval can be measured
 * later by mark_postcopy_blocktime_end().
 *
 * @addr: faulted host virtual address
 * @ptid: faulting thread id (0 if the kernel didn't report one)
 * @rb: RAMBlock containing addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        /* vCPU transitions from running to blocked */
        atomic_inc(&dc->smp_cpus_down);
    }

    atomic_xchg(&dc->last_begin, low_time_offset);
    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    atomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check the receive bitmap here, not at the start of the function:
     * the page may have arrived (and the bitmap been set in
     * qemu_ufd_copy_ioctl) concurrently with us recording the fault,
     * in which case we must undo the bookkeeping above.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        atomic_xchg(&dc->vcpu_addr[cpu], 0);
        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        atomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
/*
 * Called when a page has just been placed at @addr: for every vCPU whose
 * recorded fault address matches, credit the elapsed interval to that
 * vCPU's blocktime and clear its fault record.  If *all* vCPUs were
 * blocked at once, the interval since the most recent fault also counts
 * towards total_blocktime.
 *
 * @addr: host virtual address that has just been filled in
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Scan all vCPUs: several can be blocked on the same page, so each
     * matching entry is closed out individually.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        /* atomic_fetch_add(x, 0) is used as an atomic read */
        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        atomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * Count the total-blocktime interval only once, and only when
         * every vCPU was down at the time.
         */
        if (!vcpu_total_blocktime &&
            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }
        /* Continue cycle: several vCPUs may wait on the same page */
        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    atomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - atomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
853
/*
 * Block the fault thread on the pause semaphore until the migration is
 * recovered (the semaphore is posted elsewhere on reconnect).
 * Returns true, meaning "carry on".
 */
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}
864
865
866
867
/*
 * Fault thread: polls the userfault fd(s), forwards page requests to the
 * source, and dispatches shared-memory faults to their registered
 * handlers.  Runs until told to quit via the eventfd.
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    /* pfd[0]: our userfault fd; pfd[1]: eventfd for quit/pause */
    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Used for pause/quit */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    /* pfd[2..]: userfault fds registered by shared-memory clients */
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA,
         * however we can be told to quit via the eventfd in pfd[1].
         */
        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * The return path is broken; hold here until the channel is
             * rebuilt, then resume reading the userfaultfd.
             */
            if (postcopy_pause_fault_thread(mis)) {
                mis->last_rb = NULL;
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but don't allow to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the eventfd signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * A wakeup on another thread just after the poll
                     * means there is nothing to read; go round again.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                /* NOTE(review): "%ud" prints a literal 'd'; likely "%u" */
                error_report("%s: Read unexpected event %ud from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                                msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source: ask for one of our host
             * page sizes (which is >= the target page size).
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                ret = migrate_send_rp_req_pages(mis,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            } else {
                /* Save some space: NULL means same block as last time */
                ret = migrate_send_rp_req_pages(mis,
                                                NULL,
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            }

            if (ret) {
                /* May be a network failure: wait for recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    mis->last_rb = NULL;
                    goto retry;
                } else {
                    /* This is an unavoidable fault */
                    error_report("%s: migrate_send_rp_req_pages() get %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    /* Stop polling this fd but keep the thread running */
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * A wakeup on another thread just after the poll
                         * means there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        /* TODO: Could just disable this sharer */
                        break;
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %ud "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Dispatch to the handler the sharer registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fault from a wrong source? */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}
1089
1090int postcopy_ram_enable_notify(MigrationIncomingState *mis)
1091{
1092
1093 mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
1094 if (mis->userfault_fd == -1) {
1095 error_report("%s: Failed to open userfault fd: %s", __func__,
1096 strerror(errno));
1097 return -1;
1098 }
1099
1100
1101
1102
1103
1104 if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
1105 return -1;
1106 }
1107
1108
1109 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1110 if (mis->userfault_event_fd == -1) {
1111 error_report("%s: Opening userfault_event_fd: %s", __func__,
1112 strerror(errno));
1113 close(mis->userfault_fd);
1114 return -1;
1115 }
1116
1117 qemu_sem_init(&mis->fault_thread_sem, 0);
1118 qemu_thread_create(&mis->fault_thread, "postcopy/fault",
1119 postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
1120 qemu_sem_wait(&mis->fault_thread_sem);
1121 qemu_sem_destroy(&mis->fault_thread_sem);
1122 mis->have_fault_thread = true;
1123
1124
1125 if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
1126 error_report("ram_block_enable_notify failed");
1127 return -1;
1128 }
1129
1130
1131
1132
1133
1134 postcopy_balloon_inhibit(true);
1135
1136 trace_postcopy_ram_enable_notify();
1137
1138 return 0;
1139}
1140
1141static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
1142 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1143{
1144 int ret;
1145 if (from_addr) {
1146 struct uffdio_copy copy_struct;
1147 copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1148 copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1149 copy_struct.len = pagesize;
1150 copy_struct.mode = 0;
1151 ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct);
1152 } else {
1153 struct uffdio_zeropage zero_struct;
1154 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1155 zero_struct.range.len = pagesize;
1156 zero_struct.mode = 0;
1157 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1158 }
1159 if (!ret) {
1160 ramblock_recv_bitmap_set_range(rb, host_addr,
1161 pagesize / qemu_target_page_size());
1162 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1163
1164 }
1165 return ret;
1166}
1167
1168int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1169{
1170 int i;
1171 MigrationIncomingState *mis = migration_incoming_get_current();
1172 GArray *pcrfds = mis->postcopy_remote_fds;
1173
1174 for (i = 0; i < pcrfds->len; i++) {
1175 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1176 int ret = cur->waker(cur, rb, offset);
1177 if (ret) {
1178 return ret;
1179 }
1180 }
1181 return 0;
1182}
1183
1184
1185
1186
1187
/*
 * Place a host page (from) at (host) atomically.
 * Returns 0 on success, -errno on failure.
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /*
     * The kernel copies the data and zeros any extra, placing the whole
     * page atomically so no fault is seen part-way through.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    /* Wake any shared-memory clients blocked on this page */
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}
1210
1211
1212
1213
1214
/*
 * Place a zero page at (host) atomically.
 * Returns 0 on success, -errno on failure.
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /*
     * Use the kernel's UFFDIO_ZEROPAGE only where the block supports it
     * (see ram_block_enable_notify); otherwise fall back to copying
     * from a locally-zeroed scratch page.
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* Lazily allocate and zero the scratch page, then place it */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   rb);
    }
}
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264void *postcopy_get_tmp_page(MigrationIncomingState *mis)
1265{
1266 if (!mis->postcopy_tmp_page) {
1267 mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
1268 PROT_READ | PROT_WRITE, MAP_PRIVATE |
1269 MAP_ANONYMOUS, -1, 0);
1270 if (mis->postcopy_tmp_page == MAP_FAILED) {
1271 mis->postcopy_tmp_page = NULL;
1272 error_report("%s: %s", __func__, strerror(errno));
1273 return NULL;
1274 }
1275 }
1276
1277 return mis->postcopy_tmp_page;
1278}
1279
1280#else
1281
/*
 * Fallbacks when the host lacks Linux userfaultfd support: postcopy is
 * not available, so the probe reports false and the remaining entry
 * points must never be reached (guarded by the probe) — hence assert(0).
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
1350#endif
1351
1352
1353
/*
 * Wake the fault thread: it polls this eventfd alongside the userfault
 * fd and re-checks fault_thread_quit / pause state when it fires.
 */
void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * It's an eventfd that should currently be at 0; we're going to
     * increment it to 1.
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
1381 const char *name)
1382{
1383 PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
1384
1385 if (res) {
1386 res->ramblock_name = name;
1387 }
1388
1389 return res;
1390}
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
/*
 * Called by the bitmap code for each discard chunk.  Queues the range
 * and flushes a discard command to the destination once the batch is
 * full (MAX_DISCARDS_PER_COMMAND entries).
 *
 * @start, @length: range in target-page numbers (scaled to bytes here)
 */
void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
                                 unsigned long start, unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert target-page numbers into byte offsets/lengths */
    pds->start_list[pds->cur_entry] = start * tp_size;
    pds->length_list[pds->cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
    pds->cur_entry++;
    pds->nsentwords++;

    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, flush it to the destination */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
        pds->cur_entry = 0;
    }
}
1424
1425
1426
1427
1428
1429
1430
1431
/*
 * Called at the end of each RAMBlock by the bitmap code: flushes any
 * outstanding discards and frees the PostcopyDiscardState.
 */
void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
{
    /* Anything unsent? */
    if (pds->cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
                                       pds->nsentcmds);

    g_free(pds);
}
1449
1450
1451
1452
1453
1454
1455static PostcopyState incoming_postcopy_state;
1456
/* Read the incoming postcopy state with a full memory barrier. */
PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}
1461
1462
/* Set the incoming postcopy state and return the previous state. */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}
1467
1468
1469
1470
/*
 * Register a userfault fd owned by an external process (shared memory)
 * so the fault thread will poll it; the PostCopyFD is copied by value
 * into the incoming state's array.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}
1478
1479
1480
1481void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1482{
1483 guint i;
1484 MigrationIncomingState *mis = migration_incoming_get_current();
1485 GArray *pcrfds = mis->postcopy_remote_fds;
1486
1487 for (i = 0; i < pcrfds->len; i++) {
1488 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1489 if (cur->fd == pcfd->fd) {
1490 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1491 return;
1492 }
1493 }
1494}
1495