1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20#include "exec/target_page.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "savevm.h"
24#include "postcopy-ram.h"
25#include "ram.h"
26#include "qapi/error.h"
27#include "qemu/notify.h"
28#include "qemu/rcu.h"
29#include "sysemu/sysemu.h"
30#include "sysemu/balloon.h"
31#include "qemu/error-report.h"
32#include "trace.h"
33#include "hw/boards.h"
34
35
36
37
/*
 * Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes
 */
#define MAX_DISCARDS_PER_COMMAND 12

/*
 * Accumulates discard ranges for a single RAMBlock; flushed to the
 * destination as MIG_CMD_POSTCOPY_RAM_DISCARD commands by the
 * postcopy_discard_send_*() helpers below.
 */
struct PostcopyDiscardState {
    const char *ramblock_name;   /* idstr of the block being discarded */
    uint16_t cur_entry;          /* next free slot in the arrays below */
    /*
     * Start address and length (both in bytes) of each queued range;
     * up to MAX_DISCARDS_PER_COMMAND entries before a flush.
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords;     /* total ranges queued (for tracing) */
    unsigned int nsentcmds;      /* total commands sent (for tracing) */
};
51
52static NotifierWithReturnList postcopy_notifier_list;
53
54void postcopy_infrastructure_init(void)
55{
56 notifier_with_return_list_init(&postcopy_notifier_list);
57}
58
59void postcopy_add_notifier(NotifierWithReturn *nn)
60{
61 notifier_with_return_list_add(&postcopy_notifier_list, nn);
62}
63
64void postcopy_remove_notifier(NotifierWithReturn *n)
65{
66 notifier_with_return_remove(n);
67}
68
69int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
70{
71 struct PostcopyNotifyData pnd;
72 pnd.reason = reason;
73 pnd.errp = errp;
74
75 return notifier_with_return_list_notify(&postcopy_notifier_list,
76 &pnd);
77}
78
79
80
81
82
83#if defined(__linux__)
84
85#include <poll.h>
86#include <sys/ioctl.h>
87#include <sys/syscall.h>
88#include <asm/types.h>
89#endif
90
91#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
92#include <sys/eventfd.h>
93#include <linux/userfaultfd.h>
94
typedef struct PostcopyBlocktimeContext {
    /* time when page fault initiated per vCPU */
    uint32_t *page_fault_vcpu_time;
    /* page address per vCPU */
    uintptr_t *vcpu_addr;
    /* total blocktime across all vCPUs (ms since start_time, low 32 bits) */
    uint32_t total_blocktime;
    /* blocktime per vCPU */
    uint32_t *vcpu_blocktime;
    /* point in time when last page fault was initiated */
    uint32_t last_begin;
    /* number of vCPUs currently suspended on a page fault */
    int smp_cpus_down;
    /* realtime reference point all the uint32_t offsets are relative to */
    uint64_t start_time;

    /*
     * Handler for exit event, necessary for
     * releasing whole blocktime context
     */
    Notifier exit_notifier;
} PostcopyBlocktimeContext;
115
116static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
117{
118 g_free(ctx->page_fault_vcpu_time);
119 g_free(ctx->vcpu_addr);
120 g_free(ctx->vcpu_blocktime);
121 g_free(ctx);
122}
123
/*
 * QEMU exit notifier: release the blocktime context (registered by
 * blocktime_context_new()) so the per-vCPU arrays don't leak at exit.
 */
static void migration_exit_cb(Notifier *n, void *data)
{
    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
                                                 exit_notifier);
    destroy_blocktime_context(ctx);
}
130
131static struct PostcopyBlocktimeContext *blocktime_context_new(void)
132{
133 MachineState *ms = MACHINE(qdev_get_machine());
134 unsigned int smp_cpus = ms->smp.cpus;
135 PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
136 ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
137 ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
138 ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
139
140 ctx->exit_notifier.notify = migration_exit_cb;
141 ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
142 qemu_add_exit_notifier(&ctx->exit_notifier);
143 return ctx;
144}
145
146static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
147{
148 MachineState *ms = MACHINE(qdev_get_machine());
149 uint32List *list = NULL, *entry = NULL;
150 int i;
151
152 for (i = ms->smp.cpus - 1; i >= 0; i--) {
153 entry = g_new0(uint32List, 1);
154 entry->value = ctx->vcpu_blocktime[i];
155 entry->next = list;
156 list = entry;
157 }
158
159 return list;
160}
161
162
163
164
165
166
167
168
169void fill_destination_postcopy_migration_info(MigrationInfo *info)
170{
171 MigrationIncomingState *mis = migration_incoming_get_current();
172 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
173
174 if (!bc) {
175 return;
176 }
177
178 info->has_postcopy_blocktime = true;
179 info->postcopy_blocktime = bc->total_blocktime;
180 info->has_postcopy_vcpu_blocktime = true;
181 info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
182}
183
184static uint32_t get_postcopy_total_blocktime(void)
185{
186 MigrationIncomingState *mis = migration_incoming_get_current();
187 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
188
189 if (!bc) {
190 return 0;
191 }
192
193 return bc->total_blocktime;
194}
195
196
197
198
199
200
201
202
203
204
205
/*
 * Probe the userfault feature set the host kernel advertises.  Uses a
 * throw-away userfaultfd because UFFD_API may only be issued once per fd.
 *
 * @features: filled with the kernel's advertised feature mask on success
 * Returns: true on success
 */
static bool receive_ufd_features(uint64_t *features)
{
    struct uffdio_api api_struct = {0};
    int ufd;
    bool ret = true;

    /* if we are here __NR_userfaultfd should exists */
    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* ask features, with no features requested */
    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        ret = false;
        goto release_ufd;
    }

    *features = api_struct.features;

release_ufd:
    close(ufd);
    return ret;
}
236
237
238
239
240
241
242
243
244
245
/*
 * Perform the UFFD_API handshake on @ufd, requesting @features, and
 * verify that the register/unregister ioctls we depend on are present.
 *
 * @ufd: fd obtained from userfaultfd syscall
 * @features: bitmask of features to enable
 * Returns: true on success
 */
static bool request_ufd_features(int ufd, uint64_t features)
{
    struct uffdio_api api_struct = {0};
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = features;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("%s failed: UFFDIO_API failed: %s", __func__,
                     strerror(errno));
        return false;
    }

    /* We need REGISTER and UNREGISTER at a minimum */
    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}
269
/*
 * Check the kernel's userfault support against what migration needs and
 * enable the optional features we use on @ufd (currently only thread-id
 * reporting for blocktime accounting).
 *
 * @ufd: the userfaultfd to do the UFFD_API handshake on
 * @mis: may be NULL (probe-only); when set, a blocktime context is
 *       created if the feature is available and requested
 * Returns: true on success
 */
static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
{
    uint64_t asked_features = 0;
    static uint64_t supported_features;

    /*
     * UFFD_API may only be issued once per fd, so probe the supported
     * feature set on a separate throw-away fd and cache it for all
     * later calls.
     */
    if (!supported_features) {
        if (!receive_ufd_features(&supported_features)) {
            error_report("%s failed", __func__);
            return false;
        }
    }

#ifdef UFFD_FEATURE_THREAD_ID
    if (migrate_postcopy_blocktime() && mis &&
        UFFD_FEATURE_THREAD_ID & supported_features) {
        /* kernel can report the faulting thread id; enable blocktime */
        if (!mis->blocktime_ctx) {
            mis->blocktime_ctx = blocktime_context_new();
        }

        asked_features |= UFFD_FEATURE_THREAD_ID;
    }
#endif

    /*
     * Request features even when asked_features is 0: the kernel
     * requires a UFFD_API handshake before UFFDIO_REGISTER on each
     * userfault fd.
     */
    if (!request_ufd_features(ufd, asked_features)) {
        error_report("%s failed: features %" PRIu64, __func__,
                     asked_features);
        return false;
    }

    if (qemu_real_host_page_size != ram_pagesize_summary()) {
        bool have_hp = false;
        /* We have huge-page backed RAM; require hugetlbfs fault support */
#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
#endif
        if (!have_hp) {
            error_report("Userfault on this host does not support huge pages");
            return false;
        }
    }
    return true;
}
324
325
326
327static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
328{
329 const char *block_name = qemu_ram_get_idstr(rb);
330 ram_addr_t length = qemu_ram_get_used_length(rb);
331 size_t pagesize = qemu_ram_pagesize(rb);
332
333 if (length % pagesize) {
334 error_report("Postcopy requires RAM blocks to be a page size multiple,"
335 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
336 "page size of 0x%zx", block_name, length, pagesize);
337 return 1;
338 }
339 return 0;
340}
341
342
343
344
345
346
347bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
348{
349 long pagesize = qemu_real_host_page_size;
350 int ufd = -1;
351 bool ret = false;
352 void *testarea = NULL;
353 struct uffdio_register reg_struct;
354 struct uffdio_range range_struct;
355 uint64_t feature_mask;
356 Error *local_err = NULL;
357
358 if (qemu_target_page_size() > pagesize) {
359 error_report("Target page size bigger than host page size");
360 goto out;
361 }
362
363 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
364 if (ufd == -1) {
365 error_report("%s: userfaultfd not available: %s", __func__,
366 strerror(errno));
367 goto out;
368 }
369
370
371 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
372 error_report_err(local_err);
373 goto out;
374 }
375
376
377 if (!ufd_check_and_apply(ufd, mis)) {
378 goto out;
379 }
380
381
382 if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
383 goto out;
384 }
385
386
387
388
389
390 if (munlockall()) {
391 error_report("%s: munlockall: %s", __func__, strerror(errno));
392 return -1;
393 }
394
395
396
397
398
399
400 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
401 MAP_ANONYMOUS, -1, 0);
402 if (testarea == MAP_FAILED) {
403 error_report("%s: Failed to map test area: %s", __func__,
404 strerror(errno));
405 goto out;
406 }
407 g_assert(((size_t)testarea & (pagesize-1)) == 0);
408
409 reg_struct.range.start = (uintptr_t)testarea;
410 reg_struct.range.len = pagesize;
411 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
412
413 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
414 error_report("%s userfault register: %s", __func__, strerror(errno));
415 goto out;
416 }
417
418 range_struct.start = (uintptr_t)testarea;
419 range_struct.len = pagesize;
420 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
421 error_report("%s userfault unregister: %s", __func__, strerror(errno));
422 goto out;
423 }
424
425 feature_mask = (__u64)1 << _UFFDIO_WAKE |
426 (__u64)1 << _UFFDIO_COPY |
427 (__u64)1 << _UFFDIO_ZEROPAGE;
428 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
429 error_report("Missing userfault map features: %" PRIx64,
430 (uint64_t)(~reg_struct.ioctls & feature_mask));
431 goto out;
432 }
433
434
435 ret = true;
436out:
437 if (testarea) {
438 munmap(testarea, pagesize);
439 }
440 if (ufd != -1) {
441 close(ufd);
442 }
443 return ret;
444}
445
446
447
448
449
450
451static int init_range(RAMBlock *rb, void *opaque)
452{
453 const char *block_name = qemu_ram_get_idstr(rb);
454 void *host_addr = qemu_ram_get_host_addr(rb);
455 ram_addr_t offset = qemu_ram_get_offset(rb);
456 ram_addr_t length = qemu_ram_get_used_length(rb);
457 trace_postcopy_init_range(block_name, host_addr, offset, length);
458
459
460
461
462
463
464
465 if (ram_discard_range(block_name, 0, length)) {
466 return -1;
467 }
468
469 return 0;
470}
471
472
473
474
475
/*
 * RAMBlockIterFunc: per-block cleanup at the end of postcopy — turn
 * huge pages back on and unregister the range from userfaultfd.
 * opaque is the MigrationIncomingState.
 */
static int cleanup_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepages for the precopy/discard stage (see
     * nhp_range); it's safe to re-enable them now that every page has
     * arrived.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have received
     * all the pages; leaving it registered would only be useful for
     * debugging missed pages.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}
508
509
510
511
512
513
514int postcopy_ram_incoming_init(MigrationIncomingState *mis)
515{
516 if (foreach_not_ignored_block(init_range, NULL)) {
517 return -1;
518 }
519
520 return 0;
521}
522
523
524
525
526
/*
 * Manage a single vote to the balloon inhibitor for all postcopy usage:
 * only flips the global inhibitor when our local vote actually changes,
 * so repeated calls with the same state are harmless.
 */
static void postcopy_balloon_inhibit(bool state)
{
    static bool cur_state = false;  /* the vote we currently hold */

    if (state != cur_state) {
        qemu_balloon_inhibit(state);
        cur_state = state;
    }
}
536
537
538
539
/*
 * At the end of a migration where postcopy_ram_incoming_init was called:
 * stop the fault thread, unregister userfault, release temp pages and
 * restore balloon/mlock state.  Returns 0 on success.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        Error *local_err = NULL;

        /* Tell the fault thread to quit, then wake and join it */
        atomic_set(&mis->fault_thread_quit, 1);
        postcopy_fault_thread_notify(mis);
        trace_postcopy_ram_incoming_cleanup_join();
        qemu_thread_join(&mis->fault_thread);

        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
            error_report_err(local_err);
            return -1;
        }

        if (foreach_not_ignored_block(cleanup_range, mis)) {
            return -1;
        }

        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_event_fd);
        mis->have_fault_thread = false;
    }

    postcopy_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point: we have a
             * valid VM state, so carry on despite the mlock failure.
             */
        }
    }

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
        mis->postcopy_tmp_page = NULL;
    }
    if (mis->postcopy_tmp_zero_page) {
        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
        mis->postcopy_tmp_zero_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_blocktime(
            get_postcopy_total_blocktime());

    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}
594
595
596
597
/*
 * RAMBlockIterFunc: disable transparent huge pages on one block.
 * Always returns 0 (qemu_madvise failure here is non-fatal advice).
 */
static int nhp_range(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before the discards we need to make sure they really delete single
     * pages: if THP assembled a hugepage, a discard would punch a hole
     * larger than the target page and re-fault the wrong granularity.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}
615
616
617
618
619
620
/*
 * Set up RAM so that it *can* be used for postcopy later; called before
 * the discard messages start arriving.  Disables huge pages on every
 * block and advances the incoming postcopy state machine.
 * Returns 0 on success.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (foreach_not_ignored_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}
631
632
633
634
635
636
637
638
639
640
641static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
642{
643 MigrationIncomingState *mis = opaque;
644 struct uffdio_register reg_struct;
645
646 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
647 reg_struct.range.len = qemu_ram_get_used_length(rb);
648 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
649
650
651 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
652 error_report("%s userfault register: %s", __func__, strerror(errno));
653 return -1;
654 }
655 if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
656 error_report("%s userfault: Region doesn't support COPY", __func__);
657 return -1;
658 }
659 if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
660 qemu_ram_set_uf_zeroable(rb);
661 }
662
663 return 0;
664}
665
/*
 * Wake any thread blocked on a userfault for the page containing
 * @client_addr on the shared-memory userfault fd @pcfd.
 * Returns the UFFDIO_WAKE ioctl result (0 on success).
 */
int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    struct uffdio_range range;
    int ret;
    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
    /* Wake the whole host page the client address falls in */
    range.start = client_addr & ~(pagesize - 1);
    range.len = pagesize;
    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
    if (ret) {
        error_report("%s: Failed to wake: %zx in %s (%s)",
                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
                     strerror(errno));
    }
    return ret;
}
684
685
686
687
688
689
/*
 * Callback from shared fault handlers to ask for a page: the page must
 * be specified by a RAMBlock and an offset in that rb.  If the page has
 * already been received we just wake the client; otherwise we forward
 * the request to the source.
 * Note: Only for use by shared fault handlers (in the fault thread).
 */
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
    MigrationIncomingState *mis = migration_incoming_get_current();

    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
                                       rb_offset);
    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
        /* Page already arrived while the client was faulting */
        trace_postcopy_request_shared_page_present(pcfd->idstr,
                                        qemu_ram_get_idstr(rb), rb_offset);
        return postcopy_wake_shared(pcfd, client_addr, rb);
    }
    if (rb != mis->last_rb) {
        mis->last_rb = rb;
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                  aligned_rbo, pagesize);
    } else {
        /* Save some space: NULL block name means "same as last time" */
        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
    }
    return 0;
}
714
715static int get_mem_fault_cpu_index(uint32_t pid)
716{
717 CPUState *cpu_iter;
718
719 CPU_FOREACH(cpu_iter) {
720 if (cpu_iter->thread_id == pid) {
721 trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
722 return cpu_iter->cpu_index;
723 }
724 }
725 trace_get_mem_fault_cpu_index(-1, pid);
726 return -1;
727}
728
729static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
730{
731 int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
732 dc->start_time;
733 return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
734}
735
736
737
738
739
740
741
742
743
/*
 * Called when a page fault occurs: records the start of a vCPU's
 * blocked interval for blocktime accounting.
 *
 * @addr: faulted host virtual address
 * @ptid: faulting process thread id
 * @rb:   RAMBlock containing addr
 */
static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
                                          RAMBlock *rb)
{
    int cpu, already_received;
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    uint32_t low_time_offset;

    if (!dc || ptid == 0) {
        /* blocktime disabled, or fault not attributable to a thread */
        return;
    }
    cpu = get_mem_fault_cpu_index(ptid);
    if (cpu < 0) {
        /* not a vCPU thread; ignore for accounting */
        return;
    }

    low_time_offset = get_low_time_offset(dc);
    if (dc->vcpu_addr[cpu] == 0) {
        /* this vCPU was not already blocked; one more CPU is down */
        atomic_inc(&dc->smp_cpus_down);
    }

    atomic_xchg(&dc->last_begin, low_time_offset);
    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
    atomic_xchg(&dc->vcpu_addr[cpu], addr);

    /*
     * Check for the page arriving here, after the stores above, to close
     * the race with mark_postcopy_blocktime_end(): if the page was
     * received while we were recording the fault, undo the bookkeeping
     * so this fault is not counted.
     */
    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
    if (already_received) {
        atomic_xchg(&dc->vcpu_addr[cpu], 0);
        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
        atomic_dec(&dc->smp_cpus_down);
    }
    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
                                        cpu, already_received);
}
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
/*
 * Called when a page has been copied in: close the blocked interval for
 * every vCPU that was faulting on @addr and accumulate per-vCPU
 * blocktime.  Total blocktime is only advanced for intervals during
 * which *all* vCPUs were simultaneously blocked.
 *
 * @addr: host virtual address of the page that just arrived
 */
static void mark_postcopy_blocktime_end(uintptr_t addr)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
    MachineState *ms = MACHINE(qdev_get_machine());
    unsigned int smp_cpus = ms->smp.cpus;
    int i, affected_cpu = 0;
    bool vcpu_total_blocktime = false;
    uint32_t read_vcpu_time, low_time_offset;

    if (!dc) {
        /* blocktime tracking not enabled */
        return;
    }

    low_time_offset = get_low_time_offset(dc);

    /*
     * Scan all vCPUs for ones blocked on this address; atomic_fetch_add
     * with 0 is used as an atomic load of the concurrently-updated slots.
     */
    for (i = 0; i < smp_cpus; i++) {
        uint32_t vcpu_blocktime = 0;

        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
            read_vcpu_time == 0) {
            continue;
        }
        atomic_xchg(&dc->vcpu_addr[i], 0);
        vcpu_blocktime = low_time_offset - read_vcpu_time;
        affected_cpu += 1;

        /*
         * Count total blocktime only once per interval where every vCPU
         * was down at the same time.
         */
        if (!vcpu_total_blocktime &&
            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
            vcpu_total_blocktime = true;
        }

        dc->vcpu_blocktime[i] += vcpu_blocktime;
    }

    atomic_sub(&dc->smp_cpus_down, affected_cpu);
    if (vcpu_total_blocktime) {
        dc->total_blocktime += low_time_offset - atomic_fetch_add(
                &dc->last_begin, 0);
    }
    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
                                      affected_cpu);
}
860
861static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
862{
863 trace_postcopy_pause_fault_thread();
864
865 qemu_sem_wait(&mis->postcopy_pause_sem_fault);
866
867 trace_postcopy_pause_fault_thread_continued();
868
869 return true;
870}
871
872
873
874
/*
 * Handle faults detected by the USERFAULT markings: polls the core
 * userfault fd, a quit eventfd, and any shared-memory userfault fds;
 * forwards page requests to the source via the return path.
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t index;
    RAMBlock *rb = NULL;

    trace_postcopy_ram_fault_thread_entry();
    rcu_register_thread();
    mis->last_rb = NULL; /* last RAMBlock we sent part of */
    /* Let postcopy_ram_incoming_setup() know we're up */
    qemu_sem_post(&mis->fault_thread_sem);

    struct pollfd *pfd;
    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;

    pfd = g_new0(struct pollfd, pfd_len);

    pfd[0].fd = mis->userfault_fd;
    pfd[0].events = POLLIN;
    pfd[1].fd = mis->userfault_event_fd;
    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
                                                 struct PostCopyFD, index);
        pfd[2 + index].fd = pcfd->fd;
        pfd[2 + index].events = POLLIN;
        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
                                                  pcfd->fd);
    }

    while (true) {
        ram_addr_t rb_offset;
        int poll_result;

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA,
         * however we can be told to quit via userfault_event_fd which is
         * an eventfd
         */

        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
        if (poll_result == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (!mis->to_src_file) {
            /*
             * The return path is broken (migration paused); wait here
             * until it's recovered, then re-scan the fds.
             */
            if (postcopy_pause_fault_thread(mis)) {
                mis->last_rb = NULL;
                /* Continue to read the userfaultfd */
            } else {
                error_report("%s: paused but don't allow to continue",
                             __func__);
                break;
            }
        }

        if (pfd[1].revents) {
            uint64_t tmp64 = 0;

            /* Consume the signal from the eventfd */
            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
                /* Nothing obviously nicer than posting this error. */
                error_report("%s: read() failed", __func__);
            }

            if (atomic_read(&mis->fault_thread_quit)) {
                trace_postcopy_ram_fault_thread_quit();
                break;
            }
        }

        if (pfd[0].revents) {
            poll_result--;
            ret = read(mis->userfault_fd, &msg, sizeof(msg));
            if (ret != sizeof(msg)) {
                if (errno == EAGAIN) {
                    /*
                     * if a wake up happens on the other thread just after
                     * the poll, there is nothing to read.
                     */
                    continue;
                }
                if (ret < 0) {
                    error_report("%s: Failed to read full userfault "
                                 "message: %s",
                                 __func__, strerror(errno));
                    break;
                } else {
                    error_report("%s: Read %d bytes from userfaultfd "
                                 "expected %zd",
                                 __func__, ret, sizeof(msg));
                    break; /* Lost alignment, don't know what we'd read next */
                }
            }
            if (msg.event != UFFD_EVENT_PAGEFAULT) {
                error_report("%s: Read unexpected event %ud from userfaultfd",
                             __func__, msg.event);
                continue; /* It's not a page fault, shouldn't happen */
            }

            rb = qemu_ram_block_from_host(
                     (void *)(uintptr_t)msg.arg.pagefault.address,
                     true, &rb_offset);
            if (!rb) {
                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                             PRIx64, (uint64_t)msg.arg.pagefault.address);
                break;
            }

            /* Round the request down to the block's page granularity */
            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                msg.arg.pagefault.feat.ptid);
            mark_postcopy_blocktime_begin(
                    (uintptr_t)(msg.arg.pagefault.address),
                    msg.arg.pagefault.feat.ptid, rb);

retry:
            /*
             * Send the request to the source - we want to request one
             * of our host page sizes (which is >= TPS)
             */
            if (rb != mis->last_rb) {
                mis->last_rb = rb;
                ret = migrate_send_rp_req_pages(mis,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            } else {
                /* Save some space: NULL means "same block as last time" */
                ret = migrate_send_rp_req_pages(mis,
                                                NULL,
                                                rb_offset,
                                                qemu_ram_pagesize(rb));
            }

            if (ret) {
                /* May be network failure; try to wait for a recovery */
                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
                    /* We got reconnected somehow, try to continue */
                    mis->last_rb = NULL;
                    goto retry;
                } else {
                    /* This is a unavoidable fault */
                    error_report("%s: migrate_send_rp_req_pages() get %d",
                                 __func__, ret);
                    break;
                }
            }
        }

        /* Now handle any requests from external processes on shared memory */
        /* TODO: May need to handle devices deregistering during postcopy */
        for (index = 2; index < pfd_len && poll_result; index++) {
            if (pfd[index].revents) {
                struct PostCopyFD *pcfd =
                    &g_array_index(mis->postcopy_remote_fds,
                                   struct PostCopyFD, index - 2);

                poll_result--;
                if (pfd[index].revents & POLLERR) {
                    error_report("%s: POLLERR on poll %zd fd=%d",
                                 __func__, index, pcfd->fd);
                    pfd[index].events = 0;
                    continue;
                }

                ret = read(pcfd->fd, &msg, sizeof(msg));
                if (ret != sizeof(msg)) {
                    if (errno == EAGAIN) {
                        /*
                         * if a wake up happens on the other thread just after
                         * the poll, there is nothing to read.
                         */
                        continue;
                    }
                    if (ret < 0) {
                        error_report("%s: Failed to read full userfault "
                                     "message: %s (shared) revents=%d",
                                     __func__, strerror(errno),
                                     pfd[index].revents);
                        /* TODO: Could just disable this sharer */
                        break;
                    } else {
                        error_report("%s: Read %d bytes from userfaultfd "
                                     "expected %zd (shared)",
                                     __func__, ret, sizeof(msg));
                        /* TODO: Could just disable this sharer */
                        break;
                    }
                }
                if (msg.event != UFFD_EVENT_PAGEFAULT) {
                    error_report("%s: Read unexpected event %ud "
                                 "from userfaultfd (shared)",
                                 __func__, msg.event);
                    continue; /* It's not a page fault, shouldn't happen */
                }
                /* Call the device handler registered with us */
                ret = pcfd->handler(pcfd, &msg);
                if (ret) {
                    error_report("%s: Failed to resolve shared fault on %zd/%s",
                                 __func__, index, pcfd->idstr);
                    /* TODO: Fail? Disable this sharer? */
                }
            }
        }
    }
    rcu_unregister_thread();
    trace_postcopy_ram_fault_thread_exit();
    g_free(pfd);
    return NULL;
}
1096
/*
 * Set up everything the destination needs to service postcopy faults:
 * open the userfaultfd, start the fault thread, register all RAM with
 * userfault, and map the temporary pages used for atomic page placement.
 * Returns 0 on success, negative on failure.
 */
int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the API handshake on this specific fd too (once per fd).
     */
    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_event_fd == -1) {
        error_report("%s: Opening userfault_event_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    /* Wait until the thread has signalled it is running */
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
        error_report("ram_block_enable_notify failed");
        return -1;
    }

    /* Temp page used to build each incoming (huge) page atomically */
    mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
                                  PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                  MAP_ANONYMOUS, -1, 0);
    if (mis->postcopy_tmp_page == MAP_FAILED) {
        mis->postcopy_tmp_page = NULL;
        error_report("%s: Failed to map postcopy_tmp_page %s",
                     __func__, strerror(errno));
        return -1;
    }

    /*
     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for
     * hugepages; we UFFDIO_COPY from this instead.
     */
    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
                                       PROT_READ | PROT_WRITE,
                                       MAP_PRIVATE | MAP_ANONYMOUS,
                                       -1, 0);
    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
        int e = errno;
        mis->postcopy_tmp_zero_page = NULL;
        error_report("%s: Failed to map large zero page %s",
                     __func__, strerror(e));
        return -e;
    }
    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);

    /*
     * Ballooning can mark pages as absent while we're postcopying
     * that would cause false userfaults - inhibit it for the duration.
     */
    postcopy_balloon_inhibit(true);

    trace_postcopy_ram_enable_notify();

    return 0;
}
1173
1174static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
1175 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1176{
1177 int ret;
1178 if (from_addr) {
1179 struct uffdio_copy copy_struct;
1180 copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1181 copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1182 copy_struct.len = pagesize;
1183 copy_struct.mode = 0;
1184 ret = ioctl(userfault_fd, UFFDIO_COPY, ©_struct);
1185 } else {
1186 struct uffdio_zeropage zero_struct;
1187 zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1188 zero_struct.range.len = pagesize;
1189 zero_struct.mode = 0;
1190 ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1191 }
1192 if (!ret) {
1193 ramblock_recv_bitmap_set_range(rb, host_addr,
1194 pagesize / qemu_target_page_size());
1195 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1196
1197 }
1198 return ret;
1199}
1200
1201int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1202{
1203 int i;
1204 MigrationIncomingState *mis = migration_incoming_get_current();
1205 GArray *pcrfds = mis->postcopy_remote_fds;
1206
1207 for (i = 0; i < pcrfds->len; i++) {
1208 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1209 int ret = cur->waker(cur, rb, offset);
1210 if (ret) {
1211 return ret;
1212 }
1213 }
1214 return 0;
1215}
1216
1217
1218
1219
1220
/*
 * Place a host page (from) at (host) atomically.
 * Returns 0 on success, -errno on failure.
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);

    /*
     * The copy also wakes threads the kernel has stalled on this page.
     * TODO: we could inhibit that wake and only do it if requested,
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p (size: %zd)",
                     __func__, strerror(e), host, from, pagesize);

        return -e;
    }

    trace_postcopy_place_page(host);
    return postcopy_notify_shared_wake(rb,
                                       qemu_ram_block_host_offset(rb, host));
}
1243
1244
1245
1246
1247
/*
 * Place a zero page at (host) atomically.
 * Uses UFFDIO_ZEROPAGE when the block supports it, otherwise copies
 * from the pre-zeroed temp page.  Returns 0 on success, -errno on failure.
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    size_t pagesize = qemu_ram_pagesize(rb);
    trace_postcopy_place_page_zero(host);

    /*
     * Zero-capability was recorded per-block at register time
     * (see ram_block_enable_notify).
     */
    if (qemu_ram_is_uf_zeroable(rb)) {
        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
            int e = errno;
            error_report("%s: %s zero host: %p",
                         __func__, strerror(e), host);

            return -e;
        }
        return postcopy_notify_shared_wake(rb,
                                           qemu_ram_block_host_offset(rb,
                                                                      host));
    } else {
        /* Kernel can't ZEROPAGE this region; copy the pre-zeroed page */
        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
    }
}
1272
#else
/* No target OS support: stubs just fail or abort */

void fill_destination_postcopy_migration_info(MigrationInfo *info)
{
    /* Nothing to report without blocktime support */
}

bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

/*
 * The remaining stubs assert: they should be unreachable because
 * postcopy_ram_supported_by_host() returned false earlier.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
                                 uint64_t client_addr, uint64_t rb_offset)
{
    assert(0);
    return -1;
}

int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
                        RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
                             RAMBlock *rb)
{
    assert(0);
    return -1;
}

int postcopy_wake_shared(struct PostCopyFD *pcfd,
                         uint64_t client_addr,
                         RAMBlock *rb)
{
    assert(0);
    return -1;
}
#endif
1338
1339
1340
/*
 * Wake the fault thread out of its poll(): used both to deliver the quit
 * request and to force a state re-check.
 */
void postcopy_fault_thread_notify(MigrationIncomingState *mis)
{
    uint64_t tmp64 = 1;

    /*
     * Wakeup the fault_thread.  It's an eventfd that should currently
     * be at 0, we're incrementing it to 1
     */
    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
        /* Not much we can do here, but may as well report it */
        error_report("%s: incrementing failed: %s", __func__,
                     strerror(errno));
    }
}
1355
1356
1357
1358
1359
1360
1361
1362
1363
/* Single accumulator reused for each RAMBlock's discard ranges */
static PostcopyDiscardState pds = {0};

/*
 * postcopy_discard_send_init: Called at the start of each RAMBlock
 * before the individual discard ranges are queued.
 *
 * @ms: current migration state
 * @name: RAMBlock that's being discarded
 */
void postcopy_discard_send_init(MigrationState *ms, const char *name)
{
    pds.ramblock_name = name;
    pds.cur_entry = 0;
    pds.nsentwords = 0;
    pds.nsentcmds = 0;
}
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
/*
 * postcopy_discard_send_range: Called by the bitmap code for each chunk
 * to discard.  May send a discard message immediately or just queue the
 * range to be sent later.
 *
 * @start,@length: a range of pages (in target-page units) within the
 *                 RAMBlock passed to postcopy_discard_send_init()
 */
void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
                                 unsigned long length)
{
    size_t tp_size = qemu_target_page_size();
    /* Convert from target-page counts to byte offsets/lengths */
    pds.start_list[pds.cur_entry] = start * tp_size;
    pds.length_list[pds.cur_entry] = length * tp_size;
    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
    pds.cur_entry++;
    pds.nsentwords++;

    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, flush it to the wire */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
        pds.cur_entry = 0;
    }
}
1404
1405
1406
1407
1408
1409
1410
/*
 * postcopy_discard_send_finish: Called at the end of each RAMBlock;
 * flushes any queued-but-unsent ranges and emits the trace summary.
 *
 * @ms: current migration state
 */
void postcopy_discard_send_finish(MigrationState *ms)
{
    /* Anything unsent? */
    if (pds.cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds.ramblock_name,
                                              pds.cur_entry,
                                              pds.start_list,
                                              pds.length_list);
        pds.nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
                                       pds.nsentcmds);
}
1426
1427
1428
1429
1430
1431
/* Current state of incoming postcopy; accessed from multiple threads */
static PostcopyState incoming_postcopy_state;

/* Read the state with a full memory barrier */
PostcopyState postcopy_state_get(void)
{
    return atomic_mb_read(&incoming_postcopy_state);
}
1438
1439
/* Atomically set the new state and return the old state */
PostcopyState postcopy_state_set(PostcopyState new_state)
{
    return atomic_xchg(&incoming_postcopy_state, new_state);
}
1444
1445
1446
1447
/*
 * Register a handler for external shared memory postcopy; called on the
 * destination.  The PostCopyFD is copied into the incoming state's array.
 */
void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
                                                  *pcfd);
}
1455
1456
1457
1458void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1459{
1460 guint i;
1461 MigrationIncomingState *mis = migration_incoming_get_current();
1462 GArray *pcrfds = mis->postcopy_remote_fds;
1463
1464 for (i = 0; i < pcrfds->len; i++) {
1465 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1466 if (cur->fd == pcfd->fd) {
1467 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1468 return;
1469 }
1470 }
1471}
1472