1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20#include "exec/target_page.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "savevm.h"
24#include "postcopy-ram.h"
25#include "ram.h"
26#include "qapi/error.h"
27#include "qemu/notify.h"
28#include "sysemu/sysemu.h"
29#include "sysemu/balloon.h"
30#include "qemu/error-report.h"
31#include "trace.h"
32
33
34
35
/*
 * Number of (start, length) pairs that are queued before
 * postcopy_discard_send_range() flushes them as one discard command.
 */
#define MAX_DISCARDS_PER_COMMAND 12

/* Per-RAMBlock accumulator for discard ranges being sent to the destination. */
struct PostcopyDiscardState {
    const char *ramblock_name; /* name given to postcopy_discard_send_init() */
    uint16_t cur_entry;        /* number of pairs currently queued below */
    /*
     * Queued ranges; both arrays are filled in step and hold byte values
     * (target-page counts multiplied by the target page size).
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;   /* ranges queued so far (one per send_range call) */
    unsigned int nsentcmds;    /* discard commands actually sent */
};
49
50static NotifierWithReturnList postcopy_notifier_list;
51
52void postcopy_infrastructure_init(void)
53{
54 notifier_with_return_list_init(&postcopy_notifier_list);
55}
56
57void postcopy_add_notifier(NotifierWithReturn *nn)
58{
59 notifier_with_return_list_add(&postcopy_notifier_list, nn);
60}
61
62void postcopy_remove_notifier(NotifierWithReturn *n)
63{
64 notifier_with_return_remove(n);
65}
66
67int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
68{
69 struct PostcopyNotifyData pnd;
70 pnd.reason = reason;
71 pnd.errp = errp;
72
73 return notifier_with_return_list_notify(&postcopy_notifier_list,
74 &pnd);
75}
76
77
78
79
80
81#if defined(__linux__)
82
83#include <poll.h>
84#include <sys/ioctl.h>
85#include <sys/syscall.h>
86#include <asm/types.h>
87#endif
88
89#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
90#include <sys/eventfd.h>
91#include <linux/userfaultfd.h>
92
/*
 * Bookkeeping for measuring how long vCPUs are blocked on postcopy page
 * faults.  The three arrays are allocated with smp_cpus entries each (see
 * blocktime_context_new()).  Time values are millisecond offsets from
 * @start_time, truncated to 32 bits (see get_low_time_offset()).
 */
typedef struct PostcopyBlocktimeContext {
    /* per-vCPU time offset of the outstanding fault; 0 when there is none */
    uint32_t *page_fault_vcpu_time;
    /* per-vCPU faulting address; 0 when the vCPU has no outstanding fault */
    uintptr_t *vcpu_addr;
    /* accumulated time during which all vCPUs were seen blocked at once */
    uint32_t total_blocktime;
    /* per-vCPU accumulated blocked time */
    uint32_t *vcpu_blocktime;
    /* time offset recorded by the most recent mark_postcopy_blocktime_begin() */
    uint32_t last_begin;
    /* number of vCPUs that currently have an outstanding fault */
    int smp_cpus_down;
    /* QEMU_CLOCK_REALTIME reading (ms) taken when the context was created */
    uint64_t start_time;

    /*
     * Exit notifier registered in blocktime_context_new(); its callback
     * (migration_exit_cb()) frees this context.
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;
113
114static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
115{
116 g_free(ctx->page_fault_vcpu_time);
117 g_free(ctx->vcpu_addr);
118 g_free(ctx->vcpu_blocktime);
119 g_free(ctx);
120}
121
122static void migration_exit_cb(Notifier *n, void *data)
123{
124 PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
125 exit_notifier);
126 destroy_blocktime_context(ctx);
127}
128
129static struct PostcopyBlocktimeContext *blocktime_context_new(void)
130{
131 PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
132 ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
133 ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
134 ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
135
136 ctx->exit_notifier.notify = migration_exit_cb;
137 ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
138 qemu_add_exit_notifier(&ctx->exit_notifier);
139 return ctx;
140}
141
142static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
143{
144 uint32List *list = NULL, *entry = NULL;
145 int i;
146
147 for (i = smp_cpus - 1; i >= 0; i--) {
148 entry = g_new0(uint32List, 1);
149 entry->value = ctx->vcpu_blocktime[i];
150 entry->next = list;
151 list = entry;
152 }
153
154 return list;
155}
156
157
158
159
160
161
162
163
164void fill_destination_postcopy_migration_info(MigrationInfo *info)
165{
166 MigrationIncomingState *mis = migration_incoming_get_current();
167 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
168
169 if (!bc) {
170 return;
171 }
172
173 info->has_postcopy_blocktime = true;
174 info->postcopy_blocktime = bc->total_blocktime;
175 info->has_postcopy_vcpu_blocktime = true;
176 info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
177}
178
179static uint32_t get_postcopy_total_blocktime(void)
180{
181 MigrationIncomingState *mis = migration_incoming_get_current();
182 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
183
184 if (!bc) {
185 return 0;
186 }
187
188 return bc->total_blocktime;
189}
190
191
192
193
194
195
196
197
198
199
200
201static bool receive_ufd_features(uint64_t *features)
202{
203 struct uffdio_api api_struct = {0};
204 int ufd;
205 bool ret = true;
206
207
208 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
209 if (ufd == -1) {
210 error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
211 strerror(errno));
212 return false;
213 }
214
215
216 api_struct.api = UFFD_API;
217 api_struct.features = 0;
218 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
219 error_report("%s: UFFDIO_API failed: %s", __func__,
220 strerror(errno));
221 ret = false;
222 goto release_ufd;
223 }
224
225 *features = api_struct.features;
226
227release_ufd:
228 close(ufd);
229 return ret;
230}
231
232
233
234
235
236
237
238
239
240
241static bool request_ufd_features(int ufd, uint64_t features)
242{
243 struct uffdio_api api_struct = {0};
244 uint64_t ioctl_mask;
245
246 api_struct.api = UFFD_API;
247 api_struct.features = features;
248 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
249 error_report("%s failed: UFFDIO_API failed: %s", __func__,
250 strerror(errno));
251 return false;
252 }
253
254 ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
255 (__u64)1 << _UFFDIO_UNREGISTER;
256 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
257 error_report("Missing userfault features: %" PRIx64,
258 (uint64_t)(~api_struct.ioctls & ioctl_mask));
259 return false;
260 }
261
262 return true;
263}
264
265static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
266{
267 uint64_t asked_features = 0;
268 static uint64_t supported_features;
269
270
271
272
273
274
275 if (!supported_features) {
276 if (!receive_ufd_features(&supported_features)) {
277 error_report("%s failed", __func__);
278 return false;
279 }
280 }
281
282#ifdef UFFD_FEATURE_THREAD_ID
283 if (migrate_postcopy_blocktime() && mis &&
284 UFFD_FEATURE_THREAD_ID & supported_features) {
285
286
287 if (!mis->blocktime_ctx) {
288 mis->blocktime_ctx = blocktime_context_new();
289 }
290
291 asked_features |= UFFD_FEATURE_THREAD_ID;
292 }
293#endif
294
295
296
297
298
299
300 if (!request_ufd_features(ufd, asked_features)) {
301 error_report("%s failed: features %" PRIu64, __func__,
302 asked_features);
303 return false;
304 }
305
306 if (getpagesize() != ram_pagesize_summary()) {
307 bool have_hp = false;
308
309#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
310 have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
311#endif
312 if (!have_hp) {
313 error_report("Userfault on this host does not support huge pages");
314 return false;
315 }
316 }
317 return true;
318}
319
320
321
322static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
323 ram_addr_t offset, ram_addr_t length, void *opaque)
324{
325 RAMBlock *rb = qemu_ram_block_by_name(block_name);
326 size_t pagesize = qemu_ram_pagesize(rb);
327
328 if (length % pagesize) {
329 error_report("Postcopy requires RAM blocks to be a page size multiple,"
330 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
331 "page size of 0x%zx", block_name, length, pagesize);
332 return 1;
333 }
334 return 0;
335}
336
337
338
339
340
341
342bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
343{
344 long pagesize = getpagesize();
345 int ufd = -1;
346 bool ret = false;
347 void *testarea = NULL;
348 struct uffdio_register reg_struct;
349 struct uffdio_range range_struct;
350 uint64_t feature_mask;
351 Error *local_err = NULL;
352
353 if (qemu_target_page_size() > pagesize) {
354 error_report("Target page size bigger than host page size");
355 goto out;
356 }
357
358 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
359 if (ufd == -1) {
360 error_report("%s: userfaultfd not available: %s", __func__,
361 strerror(errno));
362 goto out;
363 }
364
365
366 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
367 error_report_err(local_err);
368 goto out;
369 }
370
371
372 if (!ufd_check_and_apply(ufd, mis)) {
373 goto out;
374 }
375
376
377 if (qemu_ram_foreach_migratable_block(test_ramblock_postcopiable, NULL)) {
378 goto out;
379 }
380
381
382
383
384
385 if (munlockall()) {
386 error_report("%s: munlockall: %s", __func__, strerror(errno));
387 return -1;
388 }
389
390
391
392
393
394
395 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
396 MAP_ANONYMOUS, -1, 0);
397 if (testarea == MAP_FAILED) {
398 error_report("%s: Failed to map test area: %s", __func__,
399 strerror(errno));
400 goto out;
401 }
402 g_assert(((size_t)testarea & (pagesize-1)) == 0);
403
404 reg_struct.range.start = (uintptr_t)testarea;
405 reg_struct.range.len = pagesize;
406 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
407
408 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
409 error_report("%s userfault register: %s", __func__, strerror(errno));
410 goto out;
411 }
412
413 range_struct.start = (uintptr_t)testarea;
414 range_struct.len = pagesize;
415 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
416 error_report("%s userfault unregister: %s", __func__, strerror(errno));
417 goto out;
418 }
419
420 feature_mask = (__u64)1 << _UFFDIO_WAKE |
421 (__u64)1 << _UFFDIO_COPY |
422 (__u64)1 << _UFFDIO_ZEROPAGE;
423 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
424 error_report("Missing userfault map features: %" PRIx64,
425 (uint64_t)(~reg_struct.ioctls & feature_mask));
426 goto out;
427 }
428
429
430 ret = true;
431out:
432 if (testarea) {
433 munmap(testarea, pagesize);
434 }
435 if (ufd != -1) {
436 close(ufd);
437 }
438 return ret;
439}
440
441
442
443
444
445
446static int init_range(const char *block_name, void *host_addr,
447 ram_addr_t offset, ram_addr_t length, void *opaque)
448{
449 trace_postcopy_init_range(block_name, host_addr, offset, length);
450
451
452
453
454
455
456
457 if (ram_discard_range(block_name, 0, length)) {
458 return -1;
459 }
460
461 return 0;
462}
463
464
465
466
467
468static int cleanup_range(const char *block_name, void *host_addr,
469 ram_addr_t offset, ram_addr_t length, void *opaque)
470{
471 MigrationIncomingState *mis = opaque;
472 struct uffdio_range range_struct;
473 trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
474
475
476
477
478
479 qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
480
481
482
483
484
485
486 range_struct.start = (uintptr_t)host_addr;
487 range_struct.len = length;
488
489 if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
490 error_report("%s: userfault unregister %s", __func__, strerror(errno));
491
492 return -1;
493 }
494
495 return 0;
496}
497
498
499
500
501
502
503int postcopy_ram_incoming_init(MigrationIncomingState *mis)
504{
505 if (qemu_ram_foreach_migratable_block(init_range, NULL)) {
506 return -1;
507 }
508
509 return 0;
510}
511
512
513
514
515
/*
 * Set the balloon inhibit state to @state, calling qemu_balloon_inhibit()
 * only when it differs from the state this function last applied.
 */
static void postcopy_balloon_inhibit(bool state)
{
    static bool cur_state = false;

    if (state == cur_state) {
        return; /* nothing to change */
    }

    qemu_balloon_inhibit(state);
    cur_state = state;
}
525
526
527
528
529int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
530{
531 trace_postcopy_ram_incoming_cleanup_entry();
532
533 if (mis->have_fault_thread) {
534 Error *local_err = NULL;
535
536
537 atomic_set(&mis->fault_thread_quit, 1);
538 postcopy_fault_thread_notify(mis);
539 trace_postcopy_ram_incoming_cleanup_join();
540 qemu_thread_join(&mis->fault_thread);
541
542 if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
543 error_report_err(local_err);
544 return -1;
545 }
546
547 if (qemu_ram_foreach_migratable_block(cleanup_range, mis)) {
548 return -1;
549 }
550
551 trace_postcopy_ram_incoming_cleanup_closeuf();
552 close(mis->userfault_fd);
553 close(mis->userfault_event_fd);
554 mis->have_fault_thread = false;
555 }
556
557 postcopy_balloon_inhibit(false);
558
559 if (enable_mlock) {
560 if (os_mlock() < 0) {
561 error_report("mlock: %s", strerror(errno));
562
563
564
565
566 }
567 }
568
569 postcopy_state_set(POSTCOPY_INCOMING_END);
570
571 if (mis->postcopy_tmp_page) {
572 munmap(mis->postcopy_tmp_page, mis->largest_page_size);
573 mis->postcopy_tmp_page = NULL;
574 }
575 if (mis->postcopy_tmp_zero_page) {
576 munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
577 mis->postcopy_tmp_zero_page = NULL;
578 }
579 trace_postcopy_ram_incoming_cleanup_blocktime(
580 get_postcopy_total_blocktime());
581
582 trace_postcopy_ram_incoming_cleanup_exit();
583 return 0;
584}
585
586
587
588
589static int nhp_range(const char *block_name, void *host_addr,
590 ram_addr_t offset, ram_addr_t length, void *opaque)
591{
592 trace_postcopy_nhp_range(block_name, host_addr, offset, length);
593
594
595
596
597
598
599 qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
600
601 return 0;
602}
603
604
605
606
607
608
609int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
610{
611 if (qemu_ram_foreach_migratable_block(nhp_range, mis)) {
612 return -1;
613 }
614
615 postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
616
617 return 0;
618}
619
620
621
622
623
624
625
626
627
628
629static int ram_block_enable_notify(const char *block_name, void *host_addr,
630 ram_addr_t offset, ram_addr_t length,
631 void *opaque)
632{
633 MigrationIncomingState *mis = opaque;
634 struct uffdio_register reg_struct;
635
636 reg_struct.range.start = (uintptr_t)host_addr;
637 reg_struct.range.len = length;
638 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
639
640
641 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
642 error_report("%s userfault register: %s", __func__, strerror(errno));
643 return -1;
644 }
645 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
646 error_report("%s userfault: Region doesn't support COPY", __func__);
647 return -1;
648 }
649 if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
650 RAMBlock *rb = qemu_ram_block_by_name(block_name);
651 qemu_ram_set_uf_zeroable(rb);
652 }
653
654 return 0;
655}
656
657int postcopy_wake_shared(struct PostCopyFD *pcfd,
658 uint64_t client_addr,
659 RAMBlock *rb)
660{
661 size_t pagesize = qemu_ram_pagesize(rb);
662 struct uffdio_range range;
663 int ret;
664 trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
665 range.start = client_addr & ~(pagesize - 1);
666 range.len = pagesize;
667 ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
668 if (ret) {
669 error_report("%s: Failed to wake: %zx in %s (%s)",
670 __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
671 strerror(errno));
672 }
673 return ret;
674}
675
676
677
678
679
680
681int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
682 uint64_t client_addr, uint64_t rb_offset)
683{
684 size_t pagesize = qemu_ram_pagesize(rb);
685 uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
686 MigrationIncomingState *mis = migration_incoming_get_current();
687
688 trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
689 rb_offset);
690 if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
691 trace_postcopy_request_shared_page_present(pcfd->idstr,
692 qemu_ram_get_idstr(rb), rb_offset);
693 return postcopy_wake_shared(pcfd, client_addr, rb);
694 }
695 if (rb != mis->last_rb) {
696 mis->last_rb = rb;
697 migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
698 aligned_rbo, pagesize);
699 } else {
700
701 migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
702 }
703 return 0;
704}
705
706static int get_mem_fault_cpu_index(uint32_t pid)
707{
708 CPUState *cpu_iter;
709
710 CPU_FOREACH(cpu_iter) {
711 if (cpu_iter->thread_id == pid) {
712 trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
713 return cpu_iter->cpu_index;
714 }
715 }
716 trace_get_mem_fault_cpu_index(-1, pid);
717 return -1;
718}
719
720static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
721{
722 int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
723 dc->start_time;
724 return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
725}
726
727
728
729
730
731
732
733
734
735static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
736 RAMBlock *rb)
737{
738 int cpu, already_received;
739 MigrationIncomingState *mis = migration_incoming_get_current();
740 PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
741 uint32_t low_time_offset;
742
743 if (!dc || ptid == 0) {
744 return;
745 }
746 cpu = get_mem_fault_cpu_index(ptid);
747 if (cpu < 0) {
748 return;
749 }
750
751 low_time_offset = get_low_time_offset(dc);
752 if (dc->vcpu_addr[cpu] == 0) {
753 atomic_inc(&dc->smp_cpus_down);
754 }
755
756 atomic_xchg(&dc->last_begin, low_time_offset);
757 atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
758 atomic_xchg(&dc->vcpu_addr[cpu], addr);
759
760
761
762
763 already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
764 if (already_received) {
765 atomic_xchg(&dc->vcpu_addr[cpu], 0);
766 atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
767 atomic_dec(&dc->smp_cpus_down);
768 }
769 trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
770 cpu, already_received);
771}
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800static void mark_postcopy_blocktime_end(uintptr_t addr)
801{
802 MigrationIncomingState *mis = migration_incoming_get_current();
803 PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
804 int i, affected_cpu = 0;
805 bool vcpu_total_blocktime = false;
806 uint32_t read_vcpu_time, low_time_offset;
807
808 if (!dc) {
809 return;
810 }
811
812 low_time_offset = get_low_time_offset(dc);
813
814
815
816
817 for (i = 0; i < smp_cpus; i++) {
818 uint32_t vcpu_blocktime = 0;
819
820 read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
821 if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
822 read_vcpu_time == 0) {
823 continue;
824 }
825 atomic_xchg(&dc->vcpu_addr[i], 0);
826 vcpu_blocktime = low_time_offset - read_vcpu_time;
827 affected_cpu += 1;
828
829
830
831 if (!vcpu_total_blocktime &&
832 atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
833 vcpu_total_blocktime = true;
834 }
835
836 dc->vcpu_blocktime[i] += vcpu_blocktime;
837 }
838
839 atomic_sub(&dc->smp_cpus_down, affected_cpu);
840 if (vcpu_total_blocktime) {
841 dc->total_blocktime += low_time_offset - atomic_fetch_add(
842 &dc->last_begin, 0);
843 }
844 trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
845 affected_cpu);
846}
847
848static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
849{
850 trace_postcopy_pause_fault_thread();
851
852 qemu_sem_wait(&mis->postcopy_pause_sem_fault);
853
854 trace_postcopy_pause_fault_thread_continued();
855
856 return true;
857}
858
859
860
861
862static void *postcopy_ram_fault_thread(void *opaque)
863{
864 MigrationIncomingState *mis = opaque;
865 struct uffd_msg msg;
866 int ret;
867 size_t index;
868 RAMBlock *rb = NULL;
869
870 trace_postcopy_ram_fault_thread_entry();
871 rcu_register_thread();
872 mis->last_rb = NULL;
873 qemu_sem_post(&mis->fault_thread_sem);
874
875 struct pollfd *pfd;
876 size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
877
878 pfd = g_new0(struct pollfd, pfd_len);
879
880 pfd[0].fd = mis->userfault_fd;
881 pfd[0].events = POLLIN;
882 pfd[1].fd = mis->userfault_event_fd;
883 pfd[1].events = POLLIN;
884 trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
885 for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
886 struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
887 struct PostCopyFD, index);
888 pfd[2 + index].fd = pcfd->fd;
889 pfd[2 + index].events = POLLIN;
890 trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
891 pcfd->fd);
892 }
893
894 while (true) {
895 ram_addr_t rb_offset;
896 int poll_result;
897
898
899
900
901
902
903
904 poll_result = poll(pfd, pfd_len, -1 );
905 if (poll_result == -1) {
906 error_report("%s: userfault poll: %s", __func__, strerror(errno));
907 break;
908 }
909
910 if (!mis->to_src_file) {
911
912
913
914
915
916 if (postcopy_pause_fault_thread(mis)) {
917 mis->last_rb = NULL;
918
919 } else {
920 error_report("%s: paused but don't allow to continue",
921 __func__);
922 break;
923 }
924 }
925
926 if (pfd[1].revents) {
927 uint64_t tmp64 = 0;
928
929
930 if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
931
932 error_report("%s: read() failed", __func__);
933 }
934
935 if (atomic_read(&mis->fault_thread_quit)) {
936 trace_postcopy_ram_fault_thread_quit();
937 break;
938 }
939 }
940
941 if (pfd[0].revents) {
942 poll_result--;
943 ret = read(mis->userfault_fd, &msg, sizeof(msg));
944 if (ret != sizeof(msg)) {
945 if (errno == EAGAIN) {
946
947
948
949
950 continue;
951 }
952 if (ret < 0) {
953 error_report("%s: Failed to read full userfault "
954 "message: %s",
955 __func__, strerror(errno));
956 break;
957 } else {
958 error_report("%s: Read %d bytes from userfaultfd "
959 "expected %zd",
960 __func__, ret, sizeof(msg));
961 break;
962 }
963 }
964 if (msg.event != UFFD_EVENT_PAGEFAULT) {
965 error_report("%s: Read unexpected event %ud from userfaultfd",
966 __func__, msg.event);
967 continue;
968 }
969
970 rb = qemu_ram_block_from_host(
971 (void *)(uintptr_t)msg.arg.pagefault.address,
972 true, &rb_offset);
973 if (!rb) {
974 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
975 PRIx64, (uint64_t)msg.arg.pagefault.address);
976 break;
977 }
978
979 rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
980 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
981 qemu_ram_get_idstr(rb),
982 rb_offset,
983 msg.arg.pagefault.feat.ptid);
984 mark_postcopy_blocktime_begin(
985 (uintptr_t)(msg.arg.pagefault.address),
986 msg.arg.pagefault.feat.ptid, rb);
987
988retry:
989
990
991
992
993 if (rb != mis->last_rb) {
994 mis->last_rb = rb;
995 ret = migrate_send_rp_req_pages(mis,
996 qemu_ram_get_idstr(rb),
997 rb_offset,
998 qemu_ram_pagesize(rb));
999 } else {
1000
1001 ret = migrate_send_rp_req_pages(mis,
1002 NULL,
1003 rb_offset,
1004 qemu_ram_pagesize(rb));
1005 }
1006
1007 if (ret) {
1008
1009 if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
1010
1011 mis->last_rb = NULL;
1012 goto retry;
1013 } else {
1014
1015 error_report("%s: migrate_send_rp_req_pages() get %d",
1016 __func__, ret);
1017 break;
1018 }
1019 }
1020 }
1021
1022
1023
1024 for (index = 2; index < pfd_len && poll_result; index++) {
1025 if (pfd[index].revents) {
1026 struct PostCopyFD *pcfd =
1027 &g_array_index(mis->postcopy_remote_fds,
1028 struct PostCopyFD, index - 2);
1029
1030 poll_result--;
1031 if (pfd[index].revents & POLLERR) {
1032 error_report("%s: POLLERR on poll %zd fd=%d",
1033 __func__, index, pcfd->fd);
1034 pfd[index].events = 0;
1035 continue;
1036 }
1037
1038 ret = read(pcfd->fd, &msg, sizeof(msg));
1039 if (ret != sizeof(msg)) {
1040 if (errno == EAGAIN) {
1041
1042
1043
1044
1045 continue;
1046 }
1047 if (ret < 0) {
1048 error_report("%s: Failed to read full userfault "
1049 "message: %s (shared) revents=%d",
1050 __func__, strerror(errno),
1051 pfd[index].revents);
1052
1053 break;
1054 } else {
1055 error_report("%s: Read %d bytes from userfaultfd "
1056 "expected %zd (shared)",
1057 __func__, ret, sizeof(msg));
1058
1059 break;
1060 }
1061 }
1062 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1063 error_report("%s: Read unexpected event %ud "
1064 "from userfaultfd (shared)",
1065 __func__, msg.event);
1066 continue;
1067 }
1068
1069 ret = pcfd->handler(pcfd, &msg);
1070 if (ret) {
1071 error_report("%s: Failed to resolve shared fault on %zd/%s",
1072 __func__, index, pcfd->idstr);
1073
1074 }
1075 }
1076 }
1077 }
1078 rcu_unregister_thread();
1079 trace_postcopy_ram_fault_thread_exit();
1080 g_free(pfd);
1081 return NULL;
1082}
1083
1084int postcopy_ram_enable_notify(MigrationIncomingState *mis)
1085{
1086
1087 mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
1088 if (mis->userfault_fd == -1) {
1089 error_report("%s: Failed to open userfault fd: %s", __func__,
1090 strerror(errno));
1091 return -1;
1092 }
1093
1094
1095
1096
1097
1098 if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
1099 return -1;
1100 }
1101
1102
1103 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1104 if (mis->userfault_event_fd == -1) {
1105 error_report("%s: Opening userfault_event_fd: %s", __func__,
1106 strerror(errno));
1107 close(mis->userfault_fd);
1108 return -1;
1109 }
1110
1111 qemu_sem_init(&mis->fault_thread_sem, 0);
1112 qemu_thread_create(&mis->fault_thread, "postcopy/fault",
1113 postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
1114 qemu_sem_wait(&mis->fault_thread_sem);
1115 qemu_sem_destroy(&mis->fault_thread_sem);
1116 mis->have_fault_thread = true;
1117
1118
1119 if (qemu_ram_foreach_migratable_block(ram_block_enable_notify, mis)) {
1120 return -1;
1121 }
1122
1123
1124
1125
1126
1127 postcopy_balloon_inhibit(true);
1128
1129 trace_postcopy_ram_enable_notify();
1130
1131 return 0;
1132}
1133
1134static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
1135 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1136{
1137 int ret;
1138 if (from_addr) {
1139 struct uffdio_copy copy_struct;
1140 copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1141 copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1142 copy_struct.len = pagesize;
1143 copy_struct.mode = 0;
1144 ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct);
1145 } else {
1146 struct uffdio_zeropage zero_struct;
1147 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1148 zero_struct.range.len = pagesize;
1149 zero_struct.mode = 0;
1150 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1151 }
1152 if (!ret) {
1153 ramblock_recv_bitmap_set_range(rb, host_addr,
1154 pagesize / qemu_target_page_size());
1155 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1156
1157 }
1158 return ret;
1159}
1160
1161int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1162{
1163 int i;
1164 MigrationIncomingState *mis = migration_incoming_get_current();
1165 GArray *pcrfds = mis->postcopy_remote_fds;
1166
1167 for (i = 0; i < pcrfds->len; i++) {
1168 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1169 int ret = cur->waker(cur, rb, offset);
1170 if (ret) {
1171 return ret;
1172 }
1173 }
1174 return 0;
1175}
1176
1177
1178
1179
1180
1181int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1182 RAMBlock *rb)
1183{
1184 size_t pagesize = qemu_ram_pagesize(rb);
1185
1186
1187
1188
1189
1190
1191 if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
1192 int e = errno;
1193 error_report("%s: %s copy host: %p from: %p (size: %zd)",
1194 __func__, strerror(e), host, from, pagesize);
1195
1196 return -e;
1197 }
1198
1199 trace_postcopy_place_page(host);
1200 return postcopy_notify_shared_wake(rb,
1201 qemu_ram_block_host_offset(rb, host));
1202}
1203
1204
1205
1206
1207
1208int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1209 RAMBlock *rb)
1210{
1211 size_t pagesize = qemu_ram_pagesize(rb);
1212 trace_postcopy_place_page_zero(host);
1213
1214
1215
1216
1217 if (qemu_ram_is_uf_zeroable(rb)) {
1218 if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
1219 int e = errno;
1220 error_report("%s: %s zero host: %p",
1221 __func__, strerror(e), host);
1222
1223 return -e;
1224 }
1225 return postcopy_notify_shared_wake(rb,
1226 qemu_ram_block_host_offset(rb,
1227 host));
1228 } else {
1229
1230 if (!mis->postcopy_tmp_zero_page) {
1231 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1232 PROT_READ | PROT_WRITE,
1233 MAP_PRIVATE | MAP_ANONYMOUS,
1234 -1, 0);
1235 if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1236 int e = errno;
1237 mis->postcopy_tmp_zero_page = NULL;
1238 error_report("%s: %s mapping large zero page",
1239 __func__, strerror(e));
1240 return -e;
1241 }
1242 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1243 }
1244 return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
1245 rb);
1246 }
1247}
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257void *postcopy_get_tmp_page(MigrationIncomingState *mis)
1258{
1259 if (!mis->postcopy_tmp_page) {
1260 mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
1261 PROT_READ | PROT_WRITE, MAP_PRIVATE |
1262 MAP_ANONYMOUS, -1, 0);
1263 if (mis->postcopy_tmp_page == MAP_FAILED) {
1264 mis->postcopy_tmp_page = NULL;
1265 error_report("%s: %s", __func__, strerror(errno));
1266 return NULL;
1267 }
1268 }
1269
1270 return mis->postcopy_tmp_page;
1271}
1272
1273#else
1274
1275void fill_destination_postcopy_migration_info(MigrationInfo *info)
1276{
1277}
1278
1279bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
1280{
1281 error_report("%s: No OS support", __func__);
1282 return false;
1283}
1284
1285int postcopy_ram_incoming_init(MigrationIncomingState *mis)
1286{
1287 error_report("postcopy_ram_incoming_init: No OS support");
1288 return -1;
1289}
1290
1291int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
1292{
1293 assert(0);
1294 return -1;
1295}
1296
1297int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
1298{
1299 assert(0);
1300 return -1;
1301}
1302
1303int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
1304 uint64_t client_addr, uint64_t rb_offset)
1305{
1306 assert(0);
1307 return -1;
1308}
1309
1310int postcopy_ram_enable_notify(MigrationIncomingState *mis)
1311{
1312 assert(0);
1313 return -1;
1314}
1315
1316int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1317 RAMBlock *rb)
1318{
1319 assert(0);
1320 return -1;
1321}
1322
1323int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1324 RAMBlock *rb)
1325{
1326 assert(0);
1327 return -1;
1328}
1329
1330void *postcopy_get_tmp_page(MigrationIncomingState *mis)
1331{
1332 assert(0);
1333 return NULL;
1334}
1335
1336int postcopy_wake_shared(struct PostCopyFD *pcfd,
1337 uint64_t client_addr,
1338 RAMBlock *rb)
1339{
1340 assert(0);
1341 return -1;
1342}
1343#endif
1344
1345
1346
1347void postcopy_fault_thread_notify(MigrationIncomingState *mis)
1348{
1349 uint64_t tmp64 = 1;
1350
1351
1352
1353
1354
1355 if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
1356
1357 error_report("%s: incrementing failed: %s", __func__,
1358 strerror(errno));
1359 }
1360}
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
1374 const char *name)
1375{
1376 PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
1377
1378 if (res) {
1379 res->ramblock_name = name;
1380 }
1381
1382 return res;
1383}
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
1396 unsigned long start, unsigned long length)
1397{
1398 size_t tp_size = qemu_target_page_size();
1399
1400 pds->start_list[pds->cur_entry] = start * tp_size;
1401 pds->length_list[pds->cur_entry] = length * tp_size;
1402 trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
1403 pds->cur_entry++;
1404 pds->nsentwords++;
1405
1406 if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
1407
1408 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1409 pds->ramblock_name,
1410 pds->cur_entry,
1411 pds->start_list,
1412 pds->length_list);
1413 pds->nsentcmds++;
1414 pds->cur_entry = 0;
1415 }
1416}
1417
1418
1419
1420
1421
1422
1423
1424
1425void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
1426{
1427
1428 if (pds->cur_entry) {
1429 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1430 pds->ramblock_name,
1431 pds->cur_entry,
1432 pds->start_list,
1433 pds->length_list);
1434 pds->nsentcmds++;
1435 }
1436
1437 trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
1438 pds->nsentcmds);
1439
1440 g_free(pds);
1441}
1442
1443
1444
1445
1446
1447
1448static PostcopyState incoming_postcopy_state;
1449
1450PostcopyState postcopy_state_get(void)
1451{
1452 return atomic_mb_read(&incoming_postcopy_state);
1453}
1454
1455
1456PostcopyState postcopy_state_set(PostcopyState new_state)
1457{
1458 return atomic_xchg(&incoming_postcopy_state, new_state);
1459}
1460
1461
1462
1463
1464void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
1465{
1466 MigrationIncomingState *mis = migration_incoming_get_current();
1467
1468 mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
1469 *pcfd);
1470}
1471
1472
1473
1474void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1475{
1476 guint i;
1477 MigrationIncomingState *mis = migration_incoming_get_current();
1478 GArray *pcrfds = mis->postcopy_remote_fds;
1479
1480 for (i = 0; i < pcrfds->len; i++) {
1481 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1482 if (cur->fd == pcfd->fd) {
1483 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1484 return;
1485 }
1486 }
1487}
1488