1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "qemu/osdep.h"
20#include "qemu/madvise.h"
21#include "exec/target_page.h"
22#include "migration.h"
23#include "qemu-file.h"
24#include "savevm.h"
25#include "postcopy-ram.h"
26#include "ram.h"
27#include "qapi/error.h"
28#include "qemu/notify.h"
29#include "qemu/rcu.h"
30#include "system/system.h"
31#include "qemu/error-report.h"
32#include "trace.h"
33#include "hw/boards.h"
34#include "system/ramblock.h"
35#include "socket.h"
36#include "yank_functions.h"
37#include "tls.h"
38#include "qemu/userfaultfd.h"
39#include "qemu/mmap-alloc.h"
40#include "options.h"
41
42
43
44
/* Capacity of the start/length arrays carried by a PostcopyDiscardState. */
#define MAX_DISCARDS_PER_COMMAND 12

/*
 * Batch of discard ranges collected for a single RAM block.
 *
 * start_list[] and length_list[] are parallel arrays with
 * MAX_DISCARDS_PER_COMMAND slots each.
 * NOTE(review): the users of this type are outside this part of the file;
 * the per-field notes below are inferred from the names -- confirm.
 */
typedef struct PostcopyDiscardState {
    /* Name of the RAM block the ranges belong to */
    const char *ramblock_name;
    /* Presumably the number of slots currently filled in the two arrays */
    uint16_t cur_entry;

    /* Start of each pending range */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    /* Length of each pending range */
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];

    /* Presumably running totals of what has been sent so far */
    unsigned int nsentwords;
    unsigned int nsentcmds;
} PostcopyDiscardState;
58
59static NotifierWithReturnList postcopy_notifier_list;
60
61void postcopy_infrastructure_init(void)
62{
63 notifier_with_return_list_init(&postcopy_notifier_list);
64}
65
66void postcopy_add_notifier(NotifierWithReturn *nn)
67{
68 notifier_with_return_list_add(&postcopy_notifier_list, nn);
69}
70
71void postcopy_remove_notifier(NotifierWithReturn *n)
72{
73 notifier_with_return_remove(n);
74}
75
76int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
77{
78 struct PostcopyNotifyData pnd;
79 pnd.reason = reason;
80
81 return notifier_with_return_list_notify(&postcopy_notifier_list,
82 &pnd, errp);
83}
84
85
86
87
88
89void postcopy_thread_create(MigrationIncomingState *mis,
90 QemuThread *thread, const char *name,
91 void *(*fn)(void *), int joinable)
92{
93 qemu_event_init(&mis->thread_sync_event, false);
94 qemu_thread_create(thread, name, fn, mis, joinable);
95 qemu_event_wait(&mis->thread_sync_event);
96 qemu_event_destroy(&mis->thread_sync_event);
97}
98
99
100
101
102
103#if defined(__linux__)
104#include <poll.h>
105#include <sys/ioctl.h>
106#include <sys/syscall.h>
107#endif
108
109#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
110#include <sys/eventfd.h>
111#include <linux/userfaultfd.h>
112
113
114
115
116
117
118
119
120#define BLOCKTIME_LATENCY_BUCKET_N (24)
121
122
123typedef struct PostcopyBlocktimeContext {
124
125 uint64_t *vcpu_blocktime_total;
126
127 uint64_t *vcpu_faults_count;
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180 uint8_t *vcpu_faults_current;
181
182
183
184
185
186 GHashTable *vcpu_addr_hash;
187
188
189
190
191 uint64_t latency_buckets[BLOCKTIME_LATENCY_BUCKET_N];
192
193 uint64_t total_blocktime;
194
195 uint64_t last_begin;
196
197 int smp_cpus_down;
198
199
200
201
202
203
204
205
206 GHashTable *tid_to_vcpu_hash;
207
208 uint64_t non_vcpu_faults;
209
210 uint64_t non_vcpu_blocktime_total;
211
212
213
214
215
216 Notifier exit_notifier;
217} PostcopyBlocktimeContext;
218
/*
 * One outstanding page fault, stored in the per-address lists of the
 * blocktime context.
 */
typedef struct {
    /* Time (ns) at which the fault was recorded */
    uint64_t fault_time;
    /* vCPU index of the faulting thread, or negative for a non-vCPU thread */
    int cpu;
} BlocktimeVCPUEntry;
228
229
230static BlocktimeVCPUEntry *
231blocktime_vcpu_entry_alloc(int cpu, uint64_t fault_time)
232{
233 BlocktimeVCPUEntry *entry = g_new(BlocktimeVCPUEntry, 1);
234
235 entry->fault_time = fault_time;
236 entry->cpu = cpu;
237
238 return entry;
239}
240
241
242static void blocktime_vcpu_list_free(gpointer data)
243{
244 g_list_free_full(data, g_free);
245}
246
247static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
248{
249 g_hash_table_destroy(ctx->tid_to_vcpu_hash);
250 g_hash_table_destroy(ctx->vcpu_addr_hash);
251 g_free(ctx->vcpu_blocktime_total);
252 g_free(ctx->vcpu_faults_count);
253 g_free(ctx->vcpu_faults_current);
254 g_free(ctx);
255}
256
257static void migration_exit_cb(Notifier *n, void *data)
258{
259 PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
260 exit_notifier);
261 destroy_blocktime_context(ctx);
262}
263
264static GHashTable *blocktime_init_tid_to_vcpu_hash(void)
265{
266
267
268
269
270
271
272 GHashTable *table = g_hash_table_new_full(g_direct_hash, g_direct_equal,
273 NULL, g_free);
274 CPUState *cpu;
275
276
277
278
279
280
281 CPU_FOREACH(cpu) {
282 int *value = g_new(int, 1);
283
284 *value = cpu->cpu_index;
285 g_hash_table_insert(table,
286 GUINT_TO_POINTER((uint32_t)cpu->thread_id),
287 value);
288 trace_postcopy_blocktime_tid_cpu_map(cpu->cpu_index, cpu->thread_id);
289 }
290
291 return table;
292}
293
294static struct PostcopyBlocktimeContext *blocktime_context_new(void)
295{
296 MachineState *ms = MACHINE(qdev_get_machine());
297 unsigned int smp_cpus = ms->smp.cpus;
298 PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
299
300
301 memset(ctx->latency_buckets, 0, sizeof(ctx->latency_buckets));
302
303 ctx->vcpu_blocktime_total = g_new0(uint64_t, smp_cpus);
304 ctx->vcpu_faults_count = g_new0(uint64_t, smp_cpus);
305 ctx->vcpu_faults_current = g_new0(uint8_t, smp_cpus);
306 ctx->tid_to_vcpu_hash = blocktime_init_tid_to_vcpu_hash();
307
308
309
310
311
312
313
314 ctx->vcpu_addr_hash = g_hash_table_new_full(g_direct_hash,
315 g_direct_equal,
316 NULL,
317 blocktime_vcpu_list_free);
318
319 ctx->exit_notifier.notify = migration_exit_cb;
320 qemu_add_exit_notifier(&ctx->exit_notifier);
321
322 return ctx;
323}
324
325
326
327
328
329
330
331
332void fill_destination_postcopy_migration_info(MigrationInfo *info)
333{
334 MigrationIncomingState *mis = migration_incoming_get_current();
335 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
336 MachineState *ms = MACHINE(qdev_get_machine());
337 uint64_t latency_total = 0, faults = 0;
338 uint32List *list_blocktime = NULL;
339 uint64List *list_latency = NULL;
340 uint64List *latency_buckets = NULL;
341 int i;
342
343 if (!bc) {
344 return;
345 }
346
347 for (i = ms->smp.cpus - 1; i >= 0; i--) {
348 uint64_t latency, total, count;
349
350
351 QAPI_LIST_PREPEND(list_blocktime,
352 (uint32_t)(bc->vcpu_blocktime_total[i] / SCALE_MS));
353
354
355 total = bc->vcpu_blocktime_total[i];
356 latency_total += total;
357 count = bc->vcpu_faults_count[i];
358 faults += count;
359
360 if (count) {
361 latency = total / count;
362 } else {
363
364 latency = 0;
365 }
366
367 QAPI_LIST_PREPEND(list_latency, latency);
368 }
369
370 for (i = BLOCKTIME_LATENCY_BUCKET_N - 1; i >= 0; i--) {
371 QAPI_LIST_PREPEND(latency_buckets, bc->latency_buckets[i]);
372 }
373
374 latency_total += bc->non_vcpu_blocktime_total;
375 faults += bc->non_vcpu_faults;
376
377 info->has_postcopy_non_vcpu_latency = true;
378 info->postcopy_non_vcpu_latency = bc->non_vcpu_faults ?
379 (bc->non_vcpu_blocktime_total / bc->non_vcpu_faults) : 0;
380 info->has_postcopy_blocktime = true;
381
382 info->postcopy_blocktime = (uint32_t)(bc->total_blocktime / SCALE_MS);
383 info->has_postcopy_vcpu_blocktime = true;
384 info->postcopy_vcpu_blocktime = list_blocktime;
385 info->has_postcopy_latency = true;
386 info->postcopy_latency = faults ? (latency_total / faults) : 0;
387 info->has_postcopy_vcpu_latency = true;
388 info->postcopy_vcpu_latency = list_latency;
389 info->has_postcopy_latency_dist = true;
390 info->postcopy_latency_dist = latency_buckets;
391}
392
393static uint64_t get_postcopy_total_blocktime(void)
394{
395 MigrationIncomingState *mis = migration_incoming_get_current();
396 PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
397
398 if (!bc) {
399 return 0;
400 }
401
402 return bc->total_blocktime;
403}
404
405
406
407
408
409
410
411
412
413
414
415static bool receive_ufd_features(uint64_t *features)
416{
417 struct uffdio_api api_struct = {0};
418 int ufd;
419 bool ret = true;
420
421 ufd = uffd_open(O_CLOEXEC);
422 if (ufd == -1) {
423 error_report("%s: uffd_open() failed: %s", __func__, strerror(errno));
424 return false;
425 }
426
427
428 api_struct.api = UFFD_API;
429 api_struct.features = 0;
430 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
431 error_report("%s: UFFDIO_API failed: %s", __func__,
432 strerror(errno));
433 ret = false;
434 goto release_ufd;
435 }
436
437 *features = api_struct.features;
438
439release_ufd:
440 close(ufd);
441 return ret;
442}
443
444
445
446
447
448
449
450
451
452
453static bool request_ufd_features(int ufd, uint64_t features)
454{
455 struct uffdio_api api_struct = {0};
456 uint64_t ioctl_mask;
457
458 api_struct.api = UFFD_API;
459 api_struct.features = features;
460 if (ioctl(ufd, UFFDIO_API, &api_struct)) {
461 error_report("%s failed: UFFDIO_API failed: %s", __func__,
462 strerror(errno));
463 return false;
464 }
465
466 ioctl_mask = 1ULL << _UFFDIO_REGISTER |
467 1ULL << _UFFDIO_UNREGISTER;
468 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
469 error_report("Missing userfault features: %" PRIx64,
470 (uint64_t)(~api_struct.ioctls & ioctl_mask));
471 return false;
472 }
473
474 return true;
475}
476
477static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis,
478 Error **errp)
479{
480 ERRP_GUARD();
481 uint64_t asked_features = 0;
482 static uint64_t supported_features;
483
484
485
486
487
488
489 if (!supported_features) {
490 if (!receive_ufd_features(&supported_features)) {
491 error_setg(errp, "Userfault feature detection failed");
492 return false;
493 }
494 }
495
496#ifdef UFFD_FEATURE_THREAD_ID
497
498
499
500
501
502 if (UFFD_FEATURE_THREAD_ID & supported_features) {
503 asked_features |= UFFD_FEATURE_THREAD_ID;
504 }
505#endif
506
507
508
509
510
511
512 if (!request_ufd_features(ufd, asked_features)) {
513 error_setg(errp, "Failed features %" PRIu64, asked_features);
514 return false;
515 }
516
517 if (qemu_real_host_page_size() != ram_pagesize_summary()) {
518 bool have_hp = false;
519
520#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
521 have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
522#endif
523 if (!have_hp) {
524 error_setg(errp,
525 "Userfault on this host does not support huge pages");
526 return false;
527 }
528 }
529 return true;
530}
531
532
533
534static int test_ramblock_postcopiable(RAMBlock *rb, Error **errp)
535{
536 const char *block_name = qemu_ram_get_idstr(rb);
537 ram_addr_t length = qemu_ram_get_used_length(rb);
538 size_t pagesize = qemu_ram_pagesize(rb);
539 QemuFsType fs;
540
541 if (length % pagesize) {
542 error_setg(errp,
543 "Postcopy requires RAM blocks to be a page size multiple,"
544 " block %s is 0x" RAM_ADDR_FMT " bytes with a "
545 "page size of 0x%zx", block_name, length, pagesize);
546 return 1;
547 }
548
549 if (rb->fd >= 0) {
550 fs = qemu_fd_getfs(rb->fd);
551 if (fs != QEMU_FS_TYPE_TMPFS && fs != QEMU_FS_TYPE_HUGETLBFS) {
552 error_setg(errp,
553 "Host backend files need to be TMPFS or HUGETLBFS only");
554 return 1;
555 }
556 }
557
558 return 0;
559}
560
561
562
563
564
565
566bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
567{
568 ERRP_GUARD();
569 long pagesize = qemu_real_host_page_size();
570 int ufd = -1;
571 bool ret = false;
572 void *testarea = NULL;
573 struct uffdio_register reg_struct;
574 struct uffdio_range range_struct;
575 uint64_t feature_mask;
576 RAMBlock *block;
577
578 if (qemu_target_page_size() > pagesize) {
579 error_setg(errp, "Target page size bigger than host page size");
580 goto out;
581 }
582
583 ufd = uffd_open(O_CLOEXEC);
584 if (ufd == -1) {
585 error_setg(errp, "Userfaultfd not available: %s", strerror(errno));
586 goto out;
587 }
588
589
590 if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, errp)) {
591 goto out;
592 }
593
594
595 if (!ufd_check_and_apply(ufd, mis, errp)) {
596 goto out;
597 }
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612 RAMBLOCK_FOREACH(block) {
613 if (test_ramblock_postcopiable(block, errp)) {
614 goto out;
615 }
616 }
617
618
619
620
621
622 if (munlockall()) {
623 error_setg(errp, "munlockall() failed: %s", strerror(errno));
624 goto out;
625 }
626
627
628
629
630
631
632 testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
633 MAP_ANONYMOUS, -1, 0);
634 if (testarea == MAP_FAILED) {
635 error_setg(errp, "Failed to map test area: %s", strerror(errno));
636 goto out;
637 }
638 g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
639
640 reg_struct.range.start = (uintptr_t)testarea;
641 reg_struct.range.len = pagesize;
642 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
643
644 if (ioctl(ufd, UFFDIO_REGISTER, ®_struct)) {
645 error_setg(errp, "UFFDIO_REGISTER failed: %s", strerror(errno));
646 goto out;
647 }
648
649 range_struct.start = (uintptr_t)testarea;
650 range_struct.len = pagesize;
651 if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
652 error_setg(errp, "UFFDIO_UNREGISTER failed: %s", strerror(errno));
653 goto out;
654 }
655
656 feature_mask = 1ULL << _UFFDIO_WAKE |
657 1ULL << _UFFDIO_COPY |
658 1ULL << _UFFDIO_ZEROPAGE;
659 if ((reg_struct.ioctls & feature_mask) != feature_mask) {
660 error_setg(errp, "Missing userfault map features: %" PRIx64,
661 (uint64_t)(~reg_struct.ioctls & feature_mask));
662 goto out;
663 }
664
665
666 ret = true;
667out:
668 if (testarea) {
669 munmap(testarea, pagesize);
670 }
671 if (ufd != -1) {
672 close(ufd);
673 }
674 return ret;
675}
676
677
678
679
680
681
682static int init_range(RAMBlock *rb, void *opaque)
683{
684 const char *block_name = qemu_ram_get_idstr(rb);
685 void *host_addr = qemu_ram_get_host_addr(rb);
686 ram_addr_t offset = qemu_ram_get_offset(rb);
687 ram_addr_t length = qemu_ram_get_used_length(rb);
688 trace_postcopy_init_range(block_name, host_addr, offset, length);
689
690
691
692
693
694
695 rb->postcopy_length = length;
696
697
698
699
700
701
702
703 if (ram_discard_range(block_name, 0, length)) {
704 return -1;
705 }
706
707 return 0;
708}
709
710
711
712
713
714static int cleanup_range(RAMBlock *rb, void *opaque)
715{
716 const char *block_name = qemu_ram_get_idstr(rb);
717 void *host_addr = qemu_ram_get_host_addr(rb);
718 ram_addr_t offset = qemu_ram_get_offset(rb);
719 ram_addr_t length = rb->postcopy_length;
720 MigrationIncomingState *mis = opaque;
721 struct uffdio_range range_struct;
722 trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
723
724
725
726
727
728 qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
729
730
731
732
733
734
735 range_struct.start = (uintptr_t)host_addr;
736 range_struct.len = length;
737
738 if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
739 error_report("%s: userfault unregister %s", __func__, strerror(errno));
740
741 return -1;
742 }
743
744 return 0;
745}
746
747
748
749
750
751
752int postcopy_ram_incoming_init(MigrationIncomingState *mis)
753{
754 if (foreach_not_ignored_block(init_range, NULL)) {
755 return -1;
756 }
757
758 return 0;
759}
760
761static void postcopy_temp_pages_cleanup(MigrationIncomingState *mis)
762{
763 int i;
764
765 if (mis->postcopy_tmp_pages) {
766 for (i = 0; i < mis->postcopy_channels; i++) {
767 if (mis->postcopy_tmp_pages[i].tmp_huge_page) {
768 munmap(mis->postcopy_tmp_pages[i].tmp_huge_page,
769 mis->largest_page_size);
770 mis->postcopy_tmp_pages[i].tmp_huge_page = NULL;
771 }
772 }
773 g_free(mis->postcopy_tmp_pages);
774 mis->postcopy_tmp_pages = NULL;
775 }
776
777 if (mis->postcopy_tmp_zero_page) {
778 munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
779 mis->postcopy_tmp_zero_page = NULL;
780 }
781}
782
783
784
785
786int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
787{
788 trace_postcopy_ram_incoming_cleanup_entry();
789
790 if (mis->preempt_thread_status == PREEMPT_THREAD_CREATED) {
791
792 mis->preempt_thread_status = PREEMPT_THREAD_QUIT;
793
794
795
796
797
798 smp_mb();
799
800
801
802
803
804
805 WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
806 if (qatomic_read(&mis->page_requested_count)) {
807
808
809
810
811
812 qemu_cond_wait(&mis->page_request_cond,
813 &mis->page_request_mutex);
814 }
815 }
816
817 if (mis->postcopy_qemufile_dst) {
818 qemu_file_shutdown(mis->postcopy_qemufile_dst);
819 }
820 qemu_thread_join(&mis->postcopy_prio_thread);
821 mis->preempt_thread_status = PREEMPT_THREAD_NONE;
822 }
823
824 if (mis->have_fault_thread) {
825 Error *local_err = NULL;
826
827
828 qatomic_set(&mis->fault_thread_quit, 1);
829 postcopy_fault_thread_notify(mis);
830 trace_postcopy_ram_incoming_cleanup_join();
831 qemu_thread_join(&mis->fault_thread);
832
833 if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
834 error_report_err(local_err);
835 return -1;
836 }
837
838 if (foreach_not_ignored_block(cleanup_range, mis)) {
839 return -1;
840 }
841
842 trace_postcopy_ram_incoming_cleanup_closeuf();
843 close(mis->userfault_fd);
844 close(mis->userfault_event_fd);
845 mis->have_fault_thread = false;
846 }
847
848 if (should_mlock(mlock_state)) {
849 if (os_mlock(is_mlock_on_fault(mlock_state)) < 0) {
850 error_report("mlock: %s", strerror(errno));
851
852
853
854
855 }
856 }
857
858 postcopy_temp_pages_cleanup(mis);
859
860 trace_postcopy_ram_incoming_cleanup_blocktime(
861 get_postcopy_total_blocktime());
862
863 trace_postcopy_ram_incoming_cleanup_exit();
864 return 0;
865}
866
867
868
869
870static int nhp_range(RAMBlock *rb, void *opaque)
871{
872 const char *block_name = qemu_ram_get_idstr(rb);
873 void *host_addr = qemu_ram_get_host_addr(rb);
874 ram_addr_t offset = qemu_ram_get_offset(rb);
875 ram_addr_t length = rb->postcopy_length;
876 trace_postcopy_nhp_range(block_name, host_addr, offset, length);
877
878
879
880
881
882
883 qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
884
885 return 0;
886}
887
888
889
890
891
892
893int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
894{
895 if (foreach_not_ignored_block(nhp_range, mis)) {
896 return -1;
897 }
898
899 postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
900
901 return 0;
902}
903
904
905
906
907
908
909
910
911
912
913static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
914{
915 MigrationIncomingState *mis = opaque;
916 struct uffdio_register reg_struct;
917
918 reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
919 reg_struct.range.len = rb->postcopy_length;
920 reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
921
922
923 if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, ®_struct)) {
924 error_report("%s userfault register: %s", __func__, strerror(errno));
925 return -1;
926 }
927 if (!(reg_struct.ioctls & (1ULL << _UFFDIO_COPY))) {
928 error_report("%s userfault: Region doesn't support COPY", __func__);
929 return -1;
930 }
931 if (reg_struct.ioctls & (1ULL << _UFFDIO_ZEROPAGE)) {
932 qemu_ram_set_uf_zeroable(rb);
933 }
934
935 return 0;
936}
937
938int postcopy_wake_shared(struct PostCopyFD *pcfd,
939 uint64_t client_addr,
940 RAMBlock *rb)
941{
942 size_t pagesize = qemu_ram_pagesize(rb);
943 trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
944 return uffd_wakeup(pcfd->fd,
945 (void *)(uintptr_t)ROUND_DOWN(client_addr, pagesize),
946 pagesize);
947}
948
949
950
951
952
953static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
954 ram_addr_t start, uint64_t haddr, uint32_t tid)
955{
956 void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
957
958
959
960
961
962
963
964
965
966 assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
967 if (ramblock_page_is_discarded(rb, start)) {
968 bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);
969
970 return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
971 }
972
973 return migrate_send_rp_req_pages(mis, rb, start, haddr, tid);
974}
975
976
977
978
979
980
981int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
982 uint64_t client_addr, uint64_t rb_offset)
983{
984 uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
985 MigrationIncomingState *mis = migration_incoming_get_current();
986
987 trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
988 rb_offset);
989 if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
990 trace_postcopy_request_shared_page_present(pcfd->idstr,
991 qemu_ram_get_idstr(rb), rb_offset);
992 return postcopy_wake_shared(pcfd, client_addr, rb);
993 }
994
995 postcopy_request_page(mis, rb, aligned_rbo, client_addr, 0);
996 return 0;
997}
998
999static int blocktime_get_vcpu(PostcopyBlocktimeContext *ctx, uint32_t tid)
1000{
1001 int *found;
1002
1003 found = g_hash_table_lookup(ctx->tid_to_vcpu_hash, GUINT_TO_POINTER(tid));
1004 if (!found) {
1005
1006
1007
1008
1009
1010 return -1;
1011 }
1012
1013 return *found;
1014}
1015
1016static uint64_t get_current_ns(void)
1017{
1018 return (uint64_t)qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1019}
1020
1021
1022
1023
1024
1025static void blocktime_fault_inject(PostcopyBlocktimeContext *ctx,
1026 uintptr_t addr, int cpu, uint64_t time)
1027{
1028 BlocktimeVCPUEntry *entry = blocktime_vcpu_entry_alloc(cpu, time);
1029 GHashTable *table = ctx->vcpu_addr_hash;
1030 gpointer key = (gpointer)addr;
1031 GList *head, *list;
1032 gboolean result;
1033
1034 head = g_hash_table_lookup(table, key);
1035 if (head) {
1036
1037
1038
1039
1040 result = g_hash_table_steal(table, key);
1041 assert(result == TRUE);
1042 }
1043
1044
1045
1046
1047
1048
1049
1050
1051 list = g_list_prepend(head, entry);
1052 g_hash_table_insert(table, key, list);
1053
1054 trace_postcopy_blocktime_begin(addr, time, cpu, !!head);
1055}
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
1066 RAMBlock *rb)
1067{
1068 int cpu;
1069 MigrationIncomingState *mis = migration_incoming_get_current();
1070 PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
1071 uint64_t current;
1072
1073 if (!dc || ptid == 0) {
1074 return;
1075 }
1076
1077
1078
1079
1080
1081 assert(!ramblock_recv_bitmap_test(rb, (void *)addr));
1082
1083 current = get_current_ns();
1084 cpu = blocktime_get_vcpu(dc, ptid);
1085
1086 if (cpu >= 0) {
1087
1088 dc->vcpu_faults_count[cpu]++;
1089
1090
1091
1092
1093
1094 if (dc->vcpu_faults_current[cpu]++ == 0) {
1095 dc->smp_cpus_down++;
1096
1097
1098
1099
1100
1101 dc->last_begin = current;
1102 }
1103
1104
1105 assert(dc->vcpu_faults_current[cpu] <= 255);
1106 } else {
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117 dc->non_vcpu_faults++;
1118 }
1119
1120 blocktime_fault_inject(dc, addr, cpu, current);
1121}
1122
1123static void blocktime_latency_account(PostcopyBlocktimeContext *ctx,
1124 uint64_t time_us)
1125{
1126
1127
1128
1129
1130 int index = time_us ? (63 - clz64(time_us)) : 0;
1131
1132 assert(index >= 0);
1133
1134
1135 if (index >= BLOCKTIME_LATENCY_BUCKET_N) {
1136 index = BLOCKTIME_LATENCY_BUCKET_N - 1;
1137 }
1138
1139 ctx->latency_buckets[index]++;
1140}
1141
1142typedef struct {
1143 PostcopyBlocktimeContext *ctx;
1144 uint64_t current;
1145 int affected_cpus;
1146 int affected_non_cpus;
1147} BlockTimeVCPUIter;
1148
1149static void blocktime_cpu_list_iter_fn(gpointer data, gpointer user_data)
1150{
1151 BlockTimeVCPUIter *iter = user_data;
1152 PostcopyBlocktimeContext *ctx = iter->ctx;
1153 BlocktimeVCPUEntry *entry = data;
1154 uint64_t time_passed;
1155 int cpu = entry->cpu;
1156
1157
1158
1159
1160
1161 assert(iter->current >= entry->fault_time);
1162 time_passed = iter->current - entry->fault_time;
1163
1164
1165 blocktime_latency_account(ctx, time_passed / SCALE_US);
1166
1167 if (cpu >= 0) {
1168
1169
1170
1171
1172 if (--ctx->vcpu_faults_current[cpu] == 0) {
1173 ctx->vcpu_blocktime_total[cpu] += time_passed;
1174 iter->affected_cpus += 1;
1175 }
1176 trace_postcopy_blocktime_end_one(cpu, ctx->vcpu_faults_current[cpu]);
1177 } else {
1178 iter->affected_non_cpus++;
1179 ctx->non_vcpu_blocktime_total += time_passed;
1180
1181
1182
1183
1184 trace_postcopy_blocktime_end_one(-1, 0);
1185 }
1186}
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215static void mark_postcopy_blocktime_end(uintptr_t addr)
1216{
1217 MigrationIncomingState *mis = migration_incoming_get_current();
1218 PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
1219 MachineState *ms = MACHINE(qdev_get_machine());
1220 unsigned int smp_cpus = ms->smp.cpus;
1221 BlockTimeVCPUIter iter = {
1222 .current = get_current_ns(),
1223 .affected_cpus = 0,
1224 .affected_non_cpus = 0,
1225 .ctx = dc,
1226 };
1227 gpointer key = (gpointer)addr;
1228 GHashTable *table;
1229 GList *list;
1230
1231 if (!dc) {
1232 return;
1233 }
1234
1235 table = dc->vcpu_addr_hash;
1236
1237 list = g_hash_table_lookup(table, key);
1238 if (!list) {
1239 return;
1240 }
1241
1242
1243
1244
1245
1246 g_list_foreach(list, blocktime_cpu_list_iter_fn, &iter);
1247 g_hash_table_remove(table, key);
1248
1249
1250
1251
1252
1253 if (dc->smp_cpus_down == smp_cpus && iter.affected_cpus) {
1254 dc->total_blocktime += iter.current - dc->last_begin;
1255 }
1256 dc->smp_cpus_down -= iter.affected_cpus;
1257
1258 trace_postcopy_blocktime_end(addr, iter.current, iter.affected_cpus,
1259 iter.affected_non_cpus);
1260}
1261
1262static void postcopy_pause_fault_thread(MigrationIncomingState *mis)
1263{
1264 trace_postcopy_pause_fault_thread();
1265 qemu_sem_wait(&mis->postcopy_pause_sem_fault);
1266 trace_postcopy_pause_fault_thread_continued();
1267}
1268
1269
1270
1271
1272static void *postcopy_ram_fault_thread(void *opaque)
1273{
1274 MigrationIncomingState *mis = opaque;
1275 struct uffd_msg msg;
1276 int ret;
1277 size_t index;
1278 RAMBlock *rb = NULL;
1279
1280 trace_postcopy_ram_fault_thread_entry();
1281 rcu_register_thread();
1282 mis->last_rb = NULL;
1283 qemu_event_set(&mis->thread_sync_event);
1284
1285 struct pollfd *pfd;
1286 size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
1287
1288 pfd = g_new0(struct pollfd, pfd_len);
1289
1290 pfd[0].fd = mis->userfault_fd;
1291 pfd[0].events = POLLIN;
1292 pfd[1].fd = mis->userfault_event_fd;
1293 pfd[1].events = POLLIN;
1294 trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
1295 for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
1296 struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
1297 struct PostCopyFD, index);
1298 pfd[2 + index].fd = pcfd->fd;
1299 pfd[2 + index].events = POLLIN;
1300 trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
1301 pcfd->fd);
1302 }
1303
1304 while (true) {
1305 ram_addr_t rb_offset;
1306 int poll_result;
1307
1308
1309
1310
1311
1312
1313
1314 poll_result = poll(pfd, pfd_len, -1 );
1315 if (poll_result == -1) {
1316 error_report("%s: userfault poll: %s", __func__, strerror(errno));
1317 break;
1318 }
1319
1320 if (!mis->to_src_file) {
1321
1322
1323
1324
1325
1326 postcopy_pause_fault_thread(mis);
1327 }
1328
1329 if (pfd[1].revents) {
1330 uint64_t tmp64 = 0;
1331
1332
1333 if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
1334
1335 error_report("%s: read() failed", __func__);
1336 }
1337
1338 if (qatomic_read(&mis->fault_thread_quit)) {
1339 trace_postcopy_ram_fault_thread_quit();
1340 break;
1341 }
1342 }
1343
1344 if (pfd[0].revents) {
1345 poll_result--;
1346 ret = read(mis->userfault_fd, &msg, sizeof(msg));
1347 if (ret != sizeof(msg)) {
1348 if (errno == EAGAIN) {
1349
1350
1351
1352
1353 continue;
1354 }
1355 if (ret < 0) {
1356 error_report("%s: Failed to read full userfault "
1357 "message: %s",
1358 __func__, strerror(errno));
1359 break;
1360 } else {
1361 error_report("%s: Read %d bytes from userfaultfd "
1362 "expected %zd",
1363 __func__, ret, sizeof(msg));
1364 break;
1365 }
1366 }
1367 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1368 error_report("%s: Read unexpected event %ud from userfaultfd",
1369 __func__, msg.event);
1370 continue;
1371 }
1372
1373 rb = qemu_ram_block_from_host(
1374 (void *)(uintptr_t)msg.arg.pagefault.address,
1375 true, &rb_offset);
1376 if (!rb) {
1377 error_report("postcopy_ram_fault_thread: Fault outside guest: %"
1378 PRIx64, (uint64_t)msg.arg.pagefault.address);
1379 break;
1380 }
1381
1382 rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
1383 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
1384 qemu_ram_get_idstr(rb),
1385 rb_offset,
1386 msg.arg.pagefault.feat.ptid);
1387retry:
1388
1389
1390
1391
1392 ret = postcopy_request_page(mis, rb, rb_offset,
1393 msg.arg.pagefault.address,
1394 msg.arg.pagefault.feat.ptid);
1395 if (ret) {
1396
1397 postcopy_pause_fault_thread(mis);
1398 goto retry;
1399 }
1400 }
1401
1402
1403
1404 for (index = 2; index < pfd_len && poll_result; index++) {
1405 if (pfd[index].revents) {
1406 struct PostCopyFD *pcfd =
1407 &g_array_index(mis->postcopy_remote_fds,
1408 struct PostCopyFD, index - 2);
1409
1410 poll_result--;
1411 if (pfd[index].revents & POLLERR) {
1412 error_report("%s: POLLERR on poll %zd fd=%d",
1413 __func__, index, pcfd->fd);
1414 pfd[index].events = 0;
1415 continue;
1416 }
1417
1418 ret = read(pcfd->fd, &msg, sizeof(msg));
1419 if (ret != sizeof(msg)) {
1420 if (errno == EAGAIN) {
1421
1422
1423
1424
1425 continue;
1426 }
1427 if (ret < 0) {
1428 error_report("%s: Failed to read full userfault "
1429 "message: %s (shared) revents=%d",
1430 __func__, strerror(errno),
1431 pfd[index].revents);
1432
1433 break;
1434 } else {
1435 error_report("%s: Read %d bytes from userfaultfd "
1436 "expected %zd (shared)",
1437 __func__, ret, sizeof(msg));
1438
1439 break;
1440 }
1441 }
1442 if (msg.event != UFFD_EVENT_PAGEFAULT) {
1443 error_report("%s: Read unexpected event %ud "
1444 "from userfaultfd (shared)",
1445 __func__, msg.event);
1446 continue;
1447 }
1448
1449 ret = pcfd->handler(pcfd, &msg);
1450 if (ret) {
1451 error_report("%s: Failed to resolve shared fault on %zd/%s",
1452 __func__, index, pcfd->idstr);
1453
1454 }
1455 }
1456 }
1457 }
1458 rcu_unregister_thread();
1459 trace_postcopy_ram_fault_thread_exit();
1460 g_free(pfd);
1461 return NULL;
1462}
1463
1464static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
1465{
1466 PostcopyTmpPage *tmp_page;
1467 int err, i, channels;
1468 void *temp_page;
1469
1470 if (migrate_postcopy_preempt()) {
1471
1472 mis->postcopy_channels = RAM_CHANNEL_MAX;
1473 } else {
1474
1475 mis->postcopy_channels = 1;
1476 }
1477
1478 channels = mis->postcopy_channels;
1479 mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels);
1480
1481 for (i = 0; i < channels; i++) {
1482 tmp_page = &mis->postcopy_tmp_pages[i];
1483 temp_page = mmap(NULL, mis->largest_page_size, PROT_READ | PROT_WRITE,
1484 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1485 if (temp_page == MAP_FAILED) {
1486 err = errno;
1487 error_report("%s: Failed to map postcopy_tmp_pages[%d]: %s",
1488 __func__, i, strerror(err));
1489
1490 return -err;
1491 }
1492 tmp_page->tmp_huge_page = temp_page;
1493
1494 postcopy_temp_page_reset(tmp_page);
1495 }
1496
1497
1498
1499
1500 mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1501 PROT_READ | PROT_WRITE,
1502 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
1503 if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1504 err = errno;
1505 mis->postcopy_tmp_zero_page = NULL;
1506 error_report("%s: Failed to map large zero page %s",
1507 __func__, strerror(err));
1508 return -err;
1509 }
1510
1511 memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1512
1513 return 0;
1514}
1515
1516int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1517{
1518 Error *local_err = NULL;
1519
1520
1521 mis->userfault_fd = uffd_open(O_CLOEXEC | O_NONBLOCK);
1522 if (mis->userfault_fd == -1) {
1523 error_report("%s: Failed to open userfault fd: %s", __func__,
1524 strerror(errno));
1525 return -1;
1526 }
1527
1528
1529
1530
1531
1532 if (!ufd_check_and_apply(mis->userfault_fd, mis, &local_err)) {
1533 error_report_err(local_err);
1534 return -1;
1535 }
1536
1537 if (migrate_postcopy_blocktime()) {
1538 assert(mis->blocktime_ctx == NULL);
1539 mis->blocktime_ctx = blocktime_context_new();
1540 }
1541
1542
1543 mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1544 if (mis->userfault_event_fd == -1) {
1545 error_report("%s: Opening userfault_event_fd: %s", __func__,
1546 strerror(errno));
1547 close(mis->userfault_fd);
1548 return -1;
1549 }
1550
1551 postcopy_thread_create(mis, &mis->fault_thread,
1552 MIGRATION_THREAD_DST_FAULT,
1553 postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
1554 mis->have_fault_thread = true;
1555
1556
1557 if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
1558 error_report("ram_block_enable_notify failed");
1559 return -1;
1560 }
1561
1562 if (postcopy_temp_pages_setup(mis)) {
1563
1564 return -1;
1565 }
1566
1567 if (migrate_postcopy_preempt()) {
1568
1569
1570
1571
1572 postcopy_thread_create(mis, &mis->postcopy_prio_thread,
1573 MIGRATION_THREAD_DST_PREEMPT,
1574 postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
1575 mis->preempt_thread_status = PREEMPT_THREAD_CREATED;
1576 }
1577
1578 trace_postcopy_ram_enable_notify();
1579
1580 return 0;
1581}
1582
1583static int qemu_ufd_copy_ioctl(MigrationIncomingState *mis, void *host_addr,
1584 void *from_addr, uint64_t pagesize, RAMBlock *rb)
1585{
1586 int userfault_fd = mis->userfault_fd;
1587 int ret;
1588
1589 if (from_addr) {
1590 ret = uffd_copy_page(userfault_fd, host_addr, from_addr, pagesize,
1591 false);
1592 } else {
1593 ret = uffd_zero_page(userfault_fd, host_addr, pagesize, false);
1594 }
1595 if (!ret) {
1596 qemu_mutex_lock(&mis->page_request_mutex);
1597 ramblock_recv_bitmap_set_range(rb, host_addr,
1598 pagesize / qemu_target_page_size());
1599
1600
1601
1602
1603 if (g_tree_lookup(mis->page_requested, host_addr)) {
1604 g_tree_remove(mis->page_requested, host_addr);
1605 int left_pages = qatomic_dec_fetch(&mis->page_requested_count);
1606
1607 trace_postcopy_page_req_del(host_addr, mis->page_requested_count);
1608
1609 smp_mb();
1610 if (mis->preempt_thread_status == PREEMPT_THREAD_QUIT &&
1611 left_pages == 0) {
1612
1613
1614
1615
1616
1617 qemu_cond_signal(&mis->page_request_cond);
1618 }
1619 }
1620 mark_postcopy_blocktime_end((uintptr_t)host_addr);
1621 qemu_mutex_unlock(&mis->page_request_mutex);
1622 }
1623 return ret;
1624}
1625
1626int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1627{
1628 int i;
1629 MigrationIncomingState *mis = migration_incoming_get_current();
1630 GArray *pcrfds = mis->postcopy_remote_fds;
1631
1632 for (i = 0; i < pcrfds->len; i++) {
1633 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1634 int ret = cur->waker(cur, rb, offset);
1635 if (ret) {
1636 return ret;
1637 }
1638 }
1639 return 0;
1640}
1641
1642
1643
1644
1645
1646int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1647 RAMBlock *rb)
1648{
1649 size_t pagesize = qemu_ram_pagesize(rb);
1650 int e;
1651
1652
1653
1654
1655
1656
1657 e = qemu_ufd_copy_ioctl(mis, host, from, pagesize, rb);
1658 if (e) {
1659 return e;
1660 }
1661
1662 trace_postcopy_place_page(host);
1663 return postcopy_notify_shared_wake(rb,
1664 qemu_ram_block_host_offset(rb, host));
1665}
1666
1667
1668
1669
1670
1671int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1672 RAMBlock *rb)
1673{
1674 size_t pagesize = qemu_ram_pagesize(rb);
1675 trace_postcopy_place_page_zero(host);
1676
1677
1678
1679
1680 if (qemu_ram_is_uf_zeroable(rb)) {
1681 int e;
1682 e = qemu_ufd_copy_ioctl(mis, host, NULL, pagesize, rb);
1683 if (e) {
1684 return e;
1685 }
1686 return postcopy_notify_shared_wake(rb,
1687 qemu_ram_block_host_offset(rb,
1688 host));
1689 } else {
1690 return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
1691 }
1692}
1693
1694#else
1695
1696void fill_destination_postcopy_migration_info(MigrationInfo *info)
1697{
1698}
1699
1700bool postcopy_ram_supported_by_host(MigrationIncomingState *mis, Error **errp)
1701{
1702 error_report("%s: No OS support", __func__);
1703 return false;
1704}
1705
1706int postcopy_ram_incoming_init(MigrationIncomingState *mis)
1707{
1708 error_report("postcopy_ram_incoming_init: No OS support");
1709 return -1;
1710}
1711
1712int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
1713{
1714 g_assert_not_reached();
1715}
1716
1717int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
1718{
1719 g_assert_not_reached();
1720}
1721
1722int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
1723 uint64_t client_addr, uint64_t rb_offset)
1724{
1725 g_assert_not_reached();
1726}
1727
1728int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1729{
1730 g_assert_not_reached();
1731}
1732
1733int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1734 RAMBlock *rb)
1735{
1736 g_assert_not_reached();
1737}
1738
1739int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1740 RAMBlock *rb)
1741{
1742 g_assert_not_reached();
1743}
1744
1745int postcopy_wake_shared(struct PostCopyFD *pcfd,
1746 uint64_t client_addr,
1747 RAMBlock *rb)
1748{
1749 g_assert_not_reached();
1750}
1751
1752void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
1753 RAMBlock *rb)
1754{
1755}
1756#endif
1757
1758
1759void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page)
1760{
1761 tmp_page->target_pages = 0;
1762 tmp_page->host_addr = NULL;
1763
1764
1765
1766
1767 tmp_page->all_zero = true;
1768}
1769
1770void postcopy_fault_thread_notify(MigrationIncomingState *mis)
1771{
1772 uint64_t tmp64 = 1;
1773
1774
1775
1776
1777
1778 if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
1779
1780 error_report("%s: incrementing failed: %s", __func__,
1781 strerror(errno));
1782 }
1783}
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793static PostcopyDiscardState pds = {0};
1794void postcopy_discard_send_init(MigrationState *ms, const char *name)
1795{
1796 pds.ramblock_name = name;
1797 pds.cur_entry = 0;
1798 pds.nsentwords = 0;
1799 pds.nsentcmds = 0;
1800}
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
1812 unsigned long length)
1813{
1814 size_t tp_size = qemu_target_page_size();
1815
1816 pds.start_list[pds.cur_entry] = start * tp_size;
1817 pds.length_list[pds.cur_entry] = length * tp_size;
1818 trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
1819 pds.cur_entry++;
1820 pds.nsentwords++;
1821
1822 if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
1823
1824 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1825 pds.ramblock_name,
1826 pds.cur_entry,
1827 pds.start_list,
1828 pds.length_list);
1829 pds.nsentcmds++;
1830 pds.cur_entry = 0;
1831 }
1832}
1833
1834
1835
1836
1837
1838
1839
1840void postcopy_discard_send_finish(MigrationState *ms)
1841{
1842
1843 if (pds.cur_entry) {
1844 qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1845 pds.ramblock_name,
1846 pds.cur_entry,
1847 pds.start_list,
1848 pds.length_list);
1849 pds.nsentcmds++;
1850 }
1851
1852 trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
1853 pds.nsentcmds);
1854}
1855
1856
1857
1858
1859
1860
1861static PostcopyState incoming_postcopy_state;
1862
1863PostcopyState postcopy_state_get(void)
1864{
1865 return qatomic_load_acquire(&incoming_postcopy_state);
1866}
1867
1868
1869PostcopyState postcopy_state_set(PostcopyState new_state)
1870{
1871 return qatomic_xchg(&incoming_postcopy_state, new_state);
1872}
1873
1874
1875
1876
1877void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
1878{
1879 MigrationIncomingState *mis = migration_incoming_get_current();
1880
1881 mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
1882 *pcfd);
1883}
1884
1885
1886
1887void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1888{
1889 guint i;
1890 MigrationIncomingState *mis = migration_incoming_get_current();
1891 GArray *pcrfds = mis->postcopy_remote_fds;
1892
1893 if (!pcrfds) {
1894
1895 return;
1896 }
1897 for (i = 0; i < pcrfds->len; i++) {
1898 struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1899 if (cur->fd == pcfd->fd) {
1900 mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1901 return;
1902 }
1903 }
1904}
1905
1906void postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
1907{
1908
1909
1910
1911
1912 qemu_file_set_blocking(file, true);
1913 mis->postcopy_qemufile_dst = file;
1914 qemu_sem_post(&mis->postcopy_qemufile_dst_done);
1915 trace_postcopy_preempt_new_channel();
1916}
1917
1918
1919
1920
1921
1922static void
1923postcopy_preempt_send_channel_done(MigrationState *s,
1924 QIOChannel *ioc, Error *local_err)
1925{
1926 if (local_err) {
1927 migrate_set_error(s, local_err);
1928 error_free(local_err);
1929 } else {
1930 migration_ioc_register_yank(ioc);
1931 s->postcopy_qemufile_src = qemu_file_new_output(ioc);
1932 trace_postcopy_preempt_new_channel();
1933 }
1934
1935
1936
1937
1938
1939 qemu_sem_post(&s->postcopy_qemufile_src_sem);
1940}
1941
1942static void
1943postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque)
1944{
1945 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1946 MigrationState *s = opaque;
1947 Error *local_err = NULL;
1948
1949 qio_task_propagate_error(task, &local_err);
1950 postcopy_preempt_send_channel_done(s, ioc, local_err);
1951}
1952
1953static void
1954postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
1955{
1956 g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
1957 MigrationState *s = opaque;
1958 QIOChannelTLS *tioc;
1959 Error *local_err = NULL;
1960
1961 if (qio_task_propagate_error(task, &local_err)) {
1962 goto out;
1963 }
1964
1965 if (migrate_channel_requires_tls_upgrade(ioc)) {
1966 tioc = migration_tls_client_create(ioc, s->hostname, &local_err);
1967 if (!tioc) {
1968 goto out;
1969 }
1970 trace_postcopy_preempt_tls_handshake();
1971 qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt");
1972 qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake,
1973 s, NULL, NULL);
1974
1975 return;
1976 }
1977
1978out:
1979
1980 postcopy_preempt_send_channel_done(s, ioc, local_err);
1981}
1982
1983
1984
1985
1986
1987
1988int postcopy_preempt_establish_channel(MigrationState *s)
1989{
1990
1991 if (!migrate_postcopy_preempt()) {
1992 return 0;
1993 }
1994
1995
1996
1997
1998
1999
2000 if (!s->preempt_pre_7_2) {
2001 postcopy_preempt_setup(s);
2002 }
2003
2004
2005
2006
2007
2008 qemu_sem_wait(&s->postcopy_qemufile_src_sem);
2009
2010 return s->postcopy_qemufile_src ? 0 : -1;
2011}
2012
2013void postcopy_preempt_setup(MigrationState *s)
2014{
2015
2016 socket_send_channel_create(postcopy_preempt_send_channel_new, s);
2017}
2018
2019static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
2020{
2021 trace_postcopy_pause_fast_load();
2022 qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
2023 qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
2024 qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
2025 trace_postcopy_pause_fast_load_continued();
2026}
2027
2028static bool preempt_thread_should_run(MigrationIncomingState *mis)
2029{
2030 return mis->preempt_thread_status != PREEMPT_THREAD_QUIT;
2031}
2032
2033void *postcopy_preempt_thread(void *opaque)
2034{
2035 MigrationIncomingState *mis = opaque;
2036 int ret;
2037
2038 trace_postcopy_preempt_thread_entry();
2039
2040 rcu_register_thread();
2041
2042 qemu_event_set(&mis->thread_sync_event);
2043
2044
2045
2046
2047
2048 qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
2049
2050
2051 qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
2052 while (preempt_thread_should_run(mis)) {
2053 ret = ram_load_postcopy(mis->postcopy_qemufile_dst,
2054 RAM_CHANNEL_POSTCOPY);
2055
2056 if (ret && preempt_thread_should_run(mis)) {
2057 postcopy_pause_ram_fast_load(mis);
2058 } else {
2059
2060 break;
2061 }
2062 }
2063 qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
2064
2065 rcu_unregister_thread();
2066
2067 trace_postcopy_preempt_thread_exit();
2068
2069 return NULL;
2070}
2071
2072bool postcopy_is_paused(MigrationStatus status)
2073{
2074 return status == MIGRATION_STATUS_POSTCOPY_PAUSED ||
2075 status == MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP;
2076}
2077