/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert  <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */
#include "qemu/osdep.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "sysemu/sysemu.h"
#include "sysemu/balloon.h"
#include "qemu/error-report.h"
#include "trace.h"

/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd);
}

/*
 * Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and request them.
 * Anything that tells us a page is missing is caught here; on Linux that
 * is the userfaultfd mechanism.
 */

#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for exit event, necessary for
     * releasing the whole blocktime_ctx
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;

static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

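/* Build the per-vCPU blocktime list (in vCPU order) for the QAPI reply */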
static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    uint32List *list = NULL, *entry = NULL;
    int i;

    for (i = smp_cpus - 1; i >= 0; i--) {
        entry = g_new0(uint32List, 1);
        entry->value = ctx->vcpu_blocktime[i];
        entry->next = list;
        list = entry;
    }

    return list;
}

/*
 * This function just populates MigrationInfo from postcopy's blocktime
 * context. It will not populate MigrationInfo unless the
 * postcopy-blocktime capability was set.
 *
 * @info: pointer to MigrationInfo to populate
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}

static uint32_t get_postcopy_total_blocktime(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return 0;
    }

    return bc->total_blocktime;
}

/*
 * Check userfault fd features, so that we only request supported
 * features in the future.
 *
 * Returns: true on success
 *
 * __NR_userfaultfd - should be checked before
 * @features: out parameter; will contain uffdio_api.features provided
 *            by the kernel in case of success
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here, __NR_userfaultfd must exist */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}

/*
 * Called once on a newly opened ufd, can request specific features.
 * Returns: true on success
 *
 * @ufd: fd obtained from userfaultfd syscall
 * @features: bit mask, see UFFD_API_FEATURES
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}

static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * It's not possible to request UFFD_API twice on the same fd, and
     * the feature set is a property of the kernel rather than the fd,
     * so probe it only once and cache the result.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* kernel supports that feature */
        /* don't create blocktime_context if it exists */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request the features even if asked_features is 0: the kernel
     * expects a UFFD_API handshake before UFFDIO_REGISTER on every
     * userfault file descriptor.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}

/* Callback from postcopy_ram_supported_by_host block iterator.
 */
static int test_ramblock_postcopiable(const char *block_name, void *host_addr,
                                      ram_addr_t offset, ram_addr_t length,
                                      void *opaque)
{
    RAMBlock *rb = qemu_ram_block_by_name(block_name);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}

/*
 * Note: this has the side effect of munlock'ing all of RAM; that's
 * normally fine since if the postcopy succeeds it gets turned back on at the
 * end.
 */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    long pagesize = getpagesize();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    Error *local_err = NULL;

    if (qemu_target_page_size() > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
        error_report_err(local_err);
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }

    /* Sanity-check all RAM blocks for postcopy compatibility */
    if (qemu_ram_foreach_migratable_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     * We need to check that the ops we need are supported on anon memory.
     * To do that we need to register a chunk and see the flags that
     * are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize - 1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(const char *block_name, void *host_addr,
                      ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */
static int cleanup_range(const char *block_name, void *host_addr,
                         ram_addr_t offset, ram_addr_t length, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepage for the precopy stage with postcopy enabled;
     * we can turn it back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * called from ram.c's similarly named ram_postcopy_incoming_init
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (qemu_ram_foreach_migratable_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (qemu_ram_foreach_migratable_block(cleanup_range, mis)) {
            return -1;
        }
        /* Let the fault thread quit */
        atomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    qemu_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}

/*
 * Disable huge pages on an area
 */
static int nhp_range(const char *block_name, void *host_addr,
                     ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
 * however, leaving it until after precopy means that most of the precopy
 * data is still THPd.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (qemu_ram_foreach_migratable_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}

/*
 * Mark the given area of RAM as requiring notification to unwritten areas.
 * Used as a callback on qemu_ram_foreach_migratable_block.
 *   host_addr: Base of area to mark
 *   offset: Offset in the whole ram arena
 *   length: Length of the section
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(const char *block_name, void *host_addr,
                                   ram_addr_t offset, ram_addr_t length,
                                   void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)host_addr;
    reg_struct.range.len = length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        RAMBlock *rb = qemu_ram_block_by_name(block_name);
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}

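/*
 * Ask the kernel, via the external process's userfaultfd, to wake any
 * thread blocked on the page containing client_addr; called once that
 * page has been placed.
 */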
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}

/*
 * Callback from shared fault handlers to ask for a page;
 * the page must be specified by a RAMBlock and an offset in that rb.
 * Note: Only for use by shared fault handlers (in fault thread)
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        /* We already have the page; just wake the requester */
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}
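/* Map a faulting thread id (as reported by the kernel) to a vCPU index */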
static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}

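/*
 * Milliseconds since the blocktime context was created, folded into 32
 * bits; clamped to a minimum of 1 so that 0 can mean "no fault recorded".
 */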
static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}

/*
 * Called when a page fault occurs: record the moment this vCPU became
 * blocked on a missing page.
 *
 * @addr: faulted host virtual address
 * @ptid: faulted process thread id
 * @rb: ramblock appropriate to addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        atomic_inc(&dc->smp_cpus_down);
    }

    atomic_xchg(&dc->last_begin, low_time_offset);
    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    atomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check this here rather than at the start of the function: the
     * receive bitmap can be set concurrently by qemu_ufd_copy_ioctl(),
     * and we must not leave this vCPU marked as blocked on a page that
     * has already arrived.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        atomic_xchg(&dc->vcpu_addr[cpu], 0);
        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        atomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}

/*
 * Calculate the blocktime ended by the arrival of the page at addr, both
 * per vCPU and in total, and trace it.  Total blocktime only accumulates
 * over intervals where every vCPU was blocked simultaneously.
 *
 * Assume we have 3 CPUs:
 *
 *      S1        E1           S1               E1
 * -----***********------------xxx***************------------------------> CPU1
 *
 *             S2                E2
 * ------------****************xxx---------------------------------------> CPU2
 *
 *                         S3            E3
 * ------------------------****xxx********-------------------------------> CPU3
 *
 * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1.
 * S2,E1 - doesn't match the condition, as the sequence S1,S2,E1 doesn't
 *         include CPU3.
 * S3,S1,E2 - this sequence includes all CPUs, so the overlap S1,E2 is
 *            part of the total blocktime.
 * S1 - here is last_begin
 * Legend of the picture:
 *              * - means blocktime per vCPU
 *              x - means overlapped blocktime (total blocktime)
 *
 * @addr: host virtual address
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Look up the vCPU(s) to clear.  A linear scan is simple but not
     * optimal; keeping a tree or hash keyed by address would be faster.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        atomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * We need to know whether this end was due to a faulted page;
         * for a prefetched page no vCPU was blocked, so it must not
         * contribute to the total blocktime.
         */
        if (!vcpu_total_blocktime &&
            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }

        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    atomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - atomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}

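/*
 * Park the fault thread while the connection to the source is broken;
 * returns true once something (normally migration recovery) posts
 * postcopy_pause_sem_fault.
 */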
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN;
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however, we can be told to quit via userfault_event_fd, which is
         * an eventfd.
         */
        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * Possibly someone tells us that the return path is
             * broken already using the event. We should hold until
             * the channel is rebuilt.
             */
            if (postcopy_pause_fault_thread(mis)) {
                mis->last_rb = NULL;
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but don't allow to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %u from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't be to us */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                    qemu_ram_get_idstr(rb),
                                                    rb_offset,
                                                    msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                ret = migrate_send_rp_req_pages(mis,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            } else {
                /* Save some space */
                ret = migrate_send_rp_req_pages(mis,
                                                NULL,
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            }

            if (ret) {
                /* May be network failure, try to wait for recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    mis->last_rb = NULL;
                    goto retry;
                } else {
                    /* This is an unavoidable fault */
                    error_report("%s: migrate_send_rp_req_pages() get %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        break; /* Lost alignment, don't know what we'd read next */
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %u "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue;
                }
                /* Call the handler the external process registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                }
            }
        }
    }
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (qemu_ram_foreach_migratable_block(ram_block_enable_notify, mis)) {
        return -1;
    }

    /*
     * Ballooning can mark pages as absent while we're postcopying
     * that would cause false userfaults.
     */
    qemu_balloon_inhibit(true);

    trace_postcopy_ram_enable_notify();

    return 0;
}

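/*
 * Atomically place a page: UFFDIO_COPY from from_addr, or UFFDIO_ZEROPAGE
 * when from_addr is NULL.  On success, mark the page as received and
 * close out any blocktime accounting for it.
 */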
static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int ret;
    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
    }
    return ret;
}

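/*
 * Tell every registered external (shared memory) fault handler that the
 * page at 'offset' within 'rb' has been placed, so it can wake its
 * waiters; returns the first non-zero waker result, if any.
 */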
int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}

/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
     * but it's not available for everything (e.g. hugetlbpages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   rb);
    }
}

/*
 * Returns a target page of memory that can be mapped at a later point in time
 * using postcopy_place_page
 * The same address is used repeatedly, postcopy_place_page just takes the
 * backing page away.
 * Returns: Pointer to allocated page
 */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    if (!mis->postcopy_tmp_page) {
        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                      PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                      MAP_ANONYMOUS, -1, 0);
        if (mis->postcopy_tmp_page == MAP_FAILED) {
            mis->postcopy_tmp_page = NULL;
            error_report("%s: %s", __func__, strerror(errno));
            return NULL;
        }
    }

    return mis->postcopy_tmp_page;
}

#else
/* No target OS support, stubs just fail */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif

/* ------------------------------------------------------------------------- */

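/*
 * Called from other threads (e.g. cleanup or the pause path) to kick the
 * fault thread out of its poll().
 */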
void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're going to increment it to 1
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that discards will operate on.
 *
 * returns: a new PDS.
 */
PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
                                                 const char *name)
{
    /* g_malloc0() aborts on failure, so no NULL check is needed */
    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));

    res->ramblock_name = name;

    return res;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
                                 unsigned long start, unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert to byte offsets within the RAM block */
    pds->start_list[pds->cur_entry] = start * tp_size;
    pds->length_list[pds->cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
    pds->cur_entry++;
    pds->nsentwords++;

    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
        pds->cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 *   bitmap code. Sends any outstanding discard messages, frees the PDS.
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 */
void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
{
    /* Anything unsent? */
    if (pds->cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
                                       pds->nsentcmds);

    g_free(pds);
}

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}

/* Register a handler for external shared memory postcopy;
 * called on the destination.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}

/* Unregister a handler for external shared memory postcopy
 */
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}