/*
 * Postcopy migration for RAM
 *
 * Postcopy is the phase of migration in which the destination starts
 * running the guest before all of its RAM has arrived; accesses to
 * not-yet-transferred pages trap via userfaultfd and the pages are
 * requested from the source on demand.
 */
#include "qemu/osdep.h"
#include "exec/target_page.h"
#include "migration.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "ram.h"
#include "qapi/error.h"
#include "qemu/notify.h"
#include "sysemu/sysemu.h"
#include "sysemu/balloon.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"

/* Arbitrary limit on the size of each discard command,
 * keeps the commands around a couple of hundred bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint16_t cur_entry;
    /*
     * Start and length of a discard range, in bytes; flushed to the
     * destination whenever the command fills up.
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;
    unsigned int nsentcmds;
};

static NotifierWithReturnList postcopy_notifier_list;

void postcopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&postcopy_notifier_list);
}

void postcopy_add_notifier(NotifierWithReturn *nn)
{
    notifier_with_return_list_add(&postcopy_notifier_list, nn);
}

void postcopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
{
    struct PostcopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&postcopy_notifier_list,
                                            &pnd);
}
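
/*
 * Illustrative sketch (not part of the original file): a subsystem that
 * cannot support postcopy can veto it from a notifier.  PostcopyNotifyData
 * and the reason codes are real; "my_postcopy_handler" and
 * "my_postcopy_notifier" are hypothetical names used only for illustration.
 *
 *     static int my_postcopy_handler(NotifierWithReturn *n, void *opaque)
 *     {
 *         struct PostcopyNotifyData *pnd = opaque;
 *
 *         if (pnd->reason == POSTCOPY_NOTIFY_PROBE) {
 *             error_setg(pnd->errp, "device X does not support postcopy");
 *             return -ENOENT;   // non-zero stops the notify chain
 *         }
 *         return 0;
 *     }
 *
 *     static NotifierWithReturn my_postcopy_notifier = {
 *         .notify = my_postcopy_handler,
 *     };
 *     ...
 *     postcopy_add_notifier(&my_postcopy_notifier);
 */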

/*
 * Postcopy needs to detect accesses to pages that haven't yet been copied
 * across; on Linux this is done with userfaultfd, so everything below is
 * compiled out on other hosts.
 */

#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

typedef struct PostcopyBlocktimeContext {
    /* time when page fault was initiated, per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* faulted page address, per vCPU */
    uintptr_t *vcpu_addr;
    uint32_t total_blocktime;
    /* accumulated blocktime, per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when the last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended on a fault */
    int smp_cpus_down;
    uint64_t start_time;

    /*
     * Handler for the exit event, needed to release the whole
     * blocktime context
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;

static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
{
    g_free(ctx->page_fault_vcpu_time);
    g_free(ctx->vcpu_addr);
    g_free(ctx->vcpu_blocktime);
    g_free(ctx);
}

static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}

static struct PostcopyBlocktimeContext *blocktime_context_new(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);

    ctx->exit_notifier.notify = migration_exit_cb;
    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    qemu_add_exit_notifier(&ctx->exit_notifier);
    return ctx;
}

static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    uint32List *list = NULL, *entry = NULL;
    int i;

    /* Walk backwards so that prepending keeps the list in vCPU order */
    for (i = ms->smp.cpus - 1; i >= 0; i--) {
        entry = g_new0(uint32List, 1);
        entry->value = ctx->vcpu_blocktime[i];
        entry->next = list;
        list = entry;
    }

    return list;
}

/*
 * This function just populates MigrationInfo from postcopy's
 * blocktime context.  It will not do anything unless the
 * postcopy-blocktime capability was set.
 *
 * @info: pointer to MigrationInfo to populate
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return;
    }

    info->has_postcopy_blocktime = true;
    info->postcopy_blocktime = bc->total_blocktime;
    info->has_postcopy_vcpu_blocktime = true;
    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
}

static uint32_t get_postcopy_total_blocktime(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;

    if (!bc) {
        return 0;
    }

    return bc->total_blocktime;
}

/**
 * receive_ufd_features: check userfault fd features, so that later we only
 * request the features we know are supported.
 *
 * Returns: true on success
 *
 * @features: out parameter, filled with the supported feature mask
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here __NR_userfaultfd should exist */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask for the features */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}
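
/*
 * Illustrative sketch (not in the original file): probing for one optional
 * feature.  UFFD_FEATURE_THREAD_ID is a real kernel flag; the surrounding
 * control flow is hypothetical.
 *
 *     uint64_t features;
 *
 *     if (receive_ufd_features(&features) &&
 *         (features & UFFD_FEATURE_THREAD_ID)) {
 *         // the kernel reports the faulting thread id with each fault,
 *         // which is what the blocktime accounting below relies on
 *     }
 */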

/**
 * request_ufd_features: this function should be called only once on a newly
 * opened ufd; subsequent calls will lead to an error.
 *
 * Returns: true on success
 *
 * @ufd: fd obtained from the userfaultfd syscall
 * @features: bit mask, see UFFD_API_FEATURES
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}

static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * UFFD_API can only be requested once per fd, and the supported
     * feature set is a property of the host, so probe it once and
     * cache the result.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* The kernel supports the feature; don't recreate the context
         * if it already exists.
         */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request the features even if asked_features is 0: the kernel
     * expects UFFD_API before UFFDIO_REGISTER on every userfault fd.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (getpagesize() != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We've got a huge page */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}

/* Callback from postcopy_ram_supported_by_host checking if the RAMBlocks
 * are sane for postcopy migration.
 */
static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    size_t pagesize = qemu_ram_pagesize(rb);

    if (length % pagesize) {
        error_report("Postcopy requires RAM blocks to be a page size multiple,"
                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
                     "page size of 0x%zx", block_name, length, pagesize);
        return 1;
    }
    return 0;
}

/*
 * Check whether the host supports everything postcopy needs.
 * Note: this has the side effect of munlock'ing all of RAM; that's
 * normally fine, since if postcopy is in use we expect to be able to
 * page RAM back in later.
 */
bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    long pagesize = getpagesize();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;
    Error *local_err = NULL;

    if (qemu_target_page_size() > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Give devices a chance to object */
    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
        error_report_err(local_err);
        goto out;
    }

    /* Version and features check */
    if (!ufd_check_and_apply(ufd, mis)) {
        goto out;
    }

    /* Sanity-check the RAMBlocks */
    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put mlock back later
     * if it was enabled.  (Note: returning -1 here would be wrong, this
     * is a bool function; go through 'out' so the fd is closed too.)
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     * We need to check that the ops we need are supported on anon memory.
     * To do that we register a chunk and look at the flags that come
     * back.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize - 1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

/*
 * Setup an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd -
     * we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so it doesn't need the
     * discard.)
     */
    if (ram_discard_range(block_name, 0, length)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range.
 * opaque should be the MIS.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepages for the precopy stage with postcopy enabled;
     * we can turn them back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.  It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(init_range, NULL)) {
        return -1;
    }

    return 0;
}

/*
 * Manage a single vote to the QEMU balloon inhibitor for all postcopy usage,
 * last caller wins.
 */
static void postcopy_balloon_inhibit(bool state)
{
    static bool cur_state = false;

    if (state != cur_state) {
        qemu_balloon_inhibit(state);
        cur_state = state;
    }
}

/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Let the fault thread quit */
        atomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    postcopy_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * Failing to re-enable mlock here doesn't warrant failing
             * the whole migration: we have a valid VM state, so report
             * and carry on.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}

/*
 * Disable huge pages on an area
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard;
 * however, leaving it until after precopy means that most of the precopy
 * data is still THPd.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}

/*
 * Mark the given area of RAM as requiring notification to unwritten areas.
 * Used as a callback on foreach_not_ignored_block.
 *   rb: the RAMBlock to register
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
    reg_struct.range.len = qemu_ram_get_used_length(rb);
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }
    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
        error_report("%s userfault: Region doesn't support COPY", __func__);
        return -1;
    }
    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
        qemu_ram_set_uf_zeroable(rb);
    }

    return 0;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}

/*
 * Callback from shared fault handlers to ask for a page;
 * the page must be specified by a RAMBlock and an offset in that rb.
 * Note: Only for use by shared fault handlers (in the fault thread).
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space: a NULL name means "same block as last time" */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}
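
/*
 * Illustrative sketch (not in the original file): a shared-memory user such
 * as a vhost-user backend resolves a fault on its own userfaultfd by
 * translating the client address back to a RAMBlock/offset and calling
 * postcopy_request_shared_page().  "my_handler" and the translation helper
 * are hypothetical; PostCopyFD.handler is the real hook invoked by the
 * fault thread below.
 *
 *     static int my_handler(struct PostCopyFD *pcfd, void *ufd_msg)
 *     {
 *         struct uffd_msg *msg = ufd_msg;
 *         uint64_t client_addr = msg->arg.pagefault.address;
 *         ram_addr_t rb_offset;
 *         RAMBlock *rb;
 *
 *         rb = my_translate_to_ramblock(client_addr, &rb_offset);
 *         if (!rb) {
 *             return -1;
 *         }
 *         return postcopy_request_shared_page(pcfd, rb, client_addr,
 *                                             rb_offset);
 *     }
 */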

static int get_mem_fault_cpu_index(uint32_t pid)
{
    CPUState *cpu_iter;

    CPU_FOREACH(cpu_iter) {
        if (cpu_iter->thread_id == pid) {
            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
            return cpu_iter->cpu_index;
        }
    }
    trace_get_mem_fault_cpu_index(-1, pid);
    return -1;
}

static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
{
    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                                    dc->start_time;
    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
}

/*
 * Called when a page fault occurs; starts tracking the vCPU's blocked time.
 *
 * @addr: faulted host virtual address
 * @ptid: faulting process thread id
 * @rb: ramblock appropriate to addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        atomic_inc(&dc->smp_cpus_down);
    }

    atomic_xchg(&dc->last_begin, low_time_offset);
    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    atomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check here rather than at the start of the function: the test
     * could otherwise race with the bitmap_set in qemu_ufd_copy_ioctl,
     * and we'd never see the page as received.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        atomic_xchg(&dc->vcpu_addr[cpu], 0);
        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        atomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}

/*
 * Mark that the page at @addr has arrived and stop the clock for any vCPUs
 * that were blocked on it.  Per-vCPU blocktime is accumulated individually;
 * an interval is only added to total_blocktime when *all* vCPUs were
 * simultaneously blocked, i.e. total blocktime is the overlap of the
 * individual stall windows.
 *
 * @addr: faulted host virtual address
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    /*
     * Linear scan to find the vCPUs blocked on this address; simple,
     * but not optimal for large vCPU counts.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        atomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;
        /*
         * The interval counts towards total_blocktime only if every
         * vCPU was down at the same time.
         */
        if (!vcpu_total_blocktime &&
            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }

        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    atomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - atomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}

static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Used to quit/pause the fault thread */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however we can be told to quit via userfault_event_fd, which is
         * an eventfd.
         */

        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * The return path is broken (possibly we were told via the
             * event); hold here until the channel is rebuilt.
             */
            if (postcopy_pause_fault_thread(mis)) {
                mis->last_rb = NULL;
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but don't allow to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zu",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %u from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                ret = migrate_send_rp_req_pages(mis,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            } else {
                /* Save some space */
                ret = migrate_send_rp_req_pages(mis,
                                                NULL,
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            }

            if (ret) {
                /* May be network failure, try to wait for recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    mis->last_rb = NULL;
                    goto retry;
                } else {
                    /* This is an unavoidable fault */
                    error_report("%s: migrate_send_rp_req_pages() get %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zu fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* Given an error, it's probably best to just quit */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zu (shared)",
                                     __func__, ret, sizeof(msg));
                        /* Lost alignment, don't know what we'd read next */
                        break;
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %u "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zu/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fail? Disable this sharer? */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
        error_report("ram_block_enable_notify failed");
        return -1;
    }

    /*
     * Ballooning can mark pages as absent while we're postcopying
     * that would cause false userfaults.
     */
    postcopy_balloon_inhibit(true);

    trace_postcopy_ram_enable_notify();

    return 0;
}
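
/*
 * Rough lifecycle on the destination, for orientation (a summary of this
 * file, not a new API):
 *
 *     postcopy_ram_supported_by_host()   - capability probe
 *     postcopy_ram_incoming_init()       - discard all RAM before precopy
 *     postcopy_ram_prepare_discard()     - turn THP off before the discards
 *     postcopy_ram_enable_notify()       - userfaultfd armed, fault thread up
 *     ... pages placed via postcopy_place_page{,_zero}() ...
 *     postcopy_ram_incoming_cleanup()    - unregister, restore mlock/THP
 */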

static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
{
    int ret;
    if (from_addr) {
        struct uffdio_copy copy_struct;
        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
        copy_struct.len = pagesize;
        copy_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
    } else {
        struct uffdio_zeropage zero_struct;
        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
        zero_struct.range.len = pagesize;
        zero_struct.mode = 0;
        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
    }
    if (!ret) {
        ramblock_recv_bitmap_set_range(rb, host_addr,
                                       pagesize / qemu_target_page_size());
        mark_postcopy_blocktime_end((uintptr_t)host_addr);
        /* The kernel wakes any threads waiting on the page as part of
         * the ioctl (unless a DONTWAKE mode flag is set).
         */
    }
    return ret;
}

int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
{
    int i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        int ret = cur->waker(cur, rb, offset);
        if (ret) {
            return ret;
        }
    }
    return 0;
}

/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /* copy also acks to the kernel, waking the stalled thread up
     * TODO: We could inhibit that ack and only do it if it was requested,
     * which would be slightly cheaper, but we'd have to be careful
     * about the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zu)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}

/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE,
     * but it's not available for everything (e.g. hugetlbpages)
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* The kernel can't use UFFDIO_ZEROPAGE for hugepages, so copy
         * from a pre-zeroed buffer instead.
         */
        if (!mis->postcopy_tmp_zero_page) {
            mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                               PROT_READ | PROT_WRITE,
                                               MAP_PRIVATE | MAP_ANONYMOUS,
                                               -1, 0);
            if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
                int e = errno;
                mis->postcopy_tmp_zero_page = NULL;
                error_report("%s: %s mapping large zero page",
                             __func__, strerror(e));
                return -e;
            }
            memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
        }
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
                                   rb);
    }
}

/*
 * Returns a target page of memory that can be mapped at a later point in time
 * using postcopy_place_page.
 * The same address is used repeatedly; postcopy_place_page just takes the
 * backing page away and replaces it when placed again.
 * Returns: Pointer to allocated page
 */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    if (!mis->postcopy_tmp_page) {
        mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                      PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                      MAP_ANONYMOUS, -1, 0);
        if (mis->postcopy_tmp_page == MAP_FAILED) {
            mis->postcopy_tmp_page = NULL;
            error_report("%s: %s", __func__, strerror(errno));
            return NULL;
        }
    }

    return mis->postcopy_tmp_page;
}
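
/*
 * Illustrative flow (not new code): the incoming RAM loader assembles each
 * arriving page into the temporary buffer and then atomically publishes it,
 * roughly:
 *
 *     void *page = postcopy_get_tmp_page(mis);
 *     // ... read pagesize bytes from the migration stream into page ...
 *     ret = postcopy_place_page(mis, host_addr, page, rb);
 *
 * UFFDIO_COPY installs a fresh copy of the buffer, so the same tmp page can
 * be reused for the next incoming page.
 */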

#else
/* No target OS support, stubs just fail */
void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif

/* ------------------------------------------------------------------------- */

void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're going to increment it to 1.
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}

/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @name: RAMBlock that's being discarded
 *
 * returns: a new PDS.
 */
PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
                                                 const char *name)
{
    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));

    /* g_malloc0 aborts on allocation failure, so res is always valid */
    res->ramblock_name = name;

    return res;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
                                 unsigned long start, unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert from page numbers to byte offsets within the RAM block */
    pds->start_list[pds->cur_entry] = start * tp_size;
    pds->length_list[pds->cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
    pds->cur_entry++;
    pds->nsentwords++;

    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
        pds->cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 * bitmap code. Sends any outstanding discard messages, frees the PDS.
 *
 * @ms: Current migration state.
 */
void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
{
    /* Anything unsent? */
    if (pds->cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
                                       pds->nsentcmds);

    g_free(pds);
}
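
/*
 * Illustrative use of the discard API (not in the original file): the page
 * numbers and the "pc.ram" name are made up; real callers live in ram.c and
 * derive the ranges from the dirty bitmap.
 *
 *     PostcopyDiscardState *pds = postcopy_discard_send_init(ms, "pc.ram");
 *
 *     // ranges are in target-page units
 *     postcopy_discard_send_range(ms, pds, 0x10, 4);  // 4 pages at page 0x10
 *     postcopy_discard_send_range(ms, pds, 0x80, 1);  // a single page
 *
 *     postcopy_discard_send_finish(ms, pds);  // flushes anything queued
 */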

/*
 * Current state of incoming postcopy; note this is not part of
 * MigrationIncomingState since its state is used during cleanup
 * at the end as MIS is being freed.
 */
static PostcopyState incoming_postcopy_state;

PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}

/* Set the state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}

/* Register a handler for external shared memory postcopy;
 * called on the destination.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}

/* Unregister a handler for external shared memory postcopy
 */
void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
{
    guint i;
    MigrationIncomingState *mis = migration_incoming_get_current();
    GArray *pcrfds = mis->postcopy_remote_fds;

    for (i = 0; i < pcrfds->len; i++) {
        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
        if (cur->fd == pcfd->fd) {
            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
            return;
        }
    }
}