/*
 * RDMA protocol and interfaces
 *
 * Copyright IBM, Corp. 2010-2013
 * Copyright Red Hat, Inc. 2015-2016
 *
 * Authors:
 *  Michael R. Hines <mrhines@us.ibm.com>
 *  Jiuxing Liu <jl@us.ibm.com>
 *  Daniel P. Berrange <berrange@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 *
 */
17#include "qemu/osdep.h"
18#include "qapi/error.h"
19#include "qemu/cutils.h"
20#include "rdma.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "ram.h"
24#include "qemu-file-channel.h"
25#include "qemu/error-report.h"
26#include "qemu/main-loop.h"
27#include "qemu/module.h"
28#include "qemu/rcu.h"
29#include "qemu/sockets.h"
30#include "qemu/bitmap.h"
31#include "qemu/coroutine.h"
32#include "exec/memory.h"
33#include <sys/socket.h>
34#include <netdb.h>
35#include <arpa/inet.h>
36#include <rdma/rdma_cma.h>
37#include "trace.h"
38#include "qom/object.h"
39
40
41
42
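
/*
 * Error reporting convention: print the message to stderr immediately
 * and, when the caller supplied an Error ** that is still unset, record
 * the same message in it so the error can be propagated.
 */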
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)

#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */

/*
 * This is only for non-live state being migrated.
 * Instead of RDMA_WRITE messages, we use RDMA_SEND
 * messages for that state, which requires a different
 * delivery design than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size infiniband SEND message
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1

/*
 * Capabilities for negotiation.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Add the other flags above and then this will show up
 * as RDMA_CAPABILITY_NEXT_FEATURE = 0x02 and so on.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;

#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                error_report("RDMA is in an error state waiting for" \
                             " migration to abort!"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)
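
/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes,
 * in order to track their completion and potentially
 * also track unregistration status of the message.
 */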
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
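
/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */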
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

static const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};
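
/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands).
 *
 * We could use more WRs, but we have enough for now.
 */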
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};
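
/*
 * SEND/RECV IB Control Messages.
 */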
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,               /* ready to receive */
    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};
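
/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 */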
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;
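
/*
 * Negotiate RDMA capabilities during connection-setup time.
 */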
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;

static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}
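
/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structs cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 */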
typedef struct RDMALocalBlock {
    char          *block_name;
    uint8_t       *local_host_addr;  /* local virtual address */
    uint64_t       remote_host_addr; /* remote virtual address */
    uint64_t       offset;
    uint64_t       length;
    struct         ibv_mr **pmr;     /* MRs for chunk-level registration */
    struct         ibv_mr *mr;       /* MR for non-chunk-level registration */
    uint32_t      *remote_keys;      /* rkeys for chunk-level registration */
    uint32_t       remote_rkey;      /* rkey for non-chunk-level registration */
    int            index;            /* which block are we */
    unsigned int   src_index;        /* (Only used on dest) */
    bool           is_ram_block;
    int            nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;
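
/*
 * Also represents a RAMblock, but only on the dest.
 * This gets transmitted by the dest during connection-time
 * to the source VM and then is used to populate the
 * corresponding RDMALocalBlock with
 * the information needed to perform the actual RDMA.
 */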
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMADestBlock;

static const char *control_desc(unsigned int rdma_control)
{
    static const char *strs[] = {
        [RDMA_CONTROL_NONE] = "NONE",
        [RDMA_CONTROL_ERROR] = "ERROR",
        [RDMA_CONTROL_READY] = "READY",
        [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
        [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
        [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
        [RDMA_CONTROL_COMPRESS] = "COMPRESS",
        [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
        [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
        [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
        [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
        [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
    };

    if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
        return "??BAD CONTROL VALUE??";
    }

    return strs[rdma_control];
}

static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}

static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}

static void dest_block_to_network(RDMADestBlock *db)
{
    db->remote_host_addr = htonll(db->remote_host_addr);
    db->offset = htonll(db->offset);
    db->length = htonll(db->length);
    db->remote_rkey = htonl(db->remote_rkey);
}

static void network_to_dest_block(RDMADestBlock *db)
{
    db->remote_host_addr = ntohll(db->remote_host_addr);
    db->offset = ntohll(db->offset);
    db->length = ntohll(db->length);
    db->remote_rkey = ntohl(db->remote_rkey);
}
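
/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 */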
typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool init;                 /* main memory init complete */
    RDMALocalBlock *block;
} RDMALocalBlocks;
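
/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 */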
typedef struct RDMAContext {
    char *host;
    int port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *comp_channel;  /* completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *cq;                      /* completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     */
    int error_state;
    int error_reported;
    int received_error;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock  *dest_blocks;

    /* Index of the next RAMBlock received during block registration */
    unsigned int    next_src_index;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;

    /* the RDMAContext for return path */
    struct RDMAContext *return_path;
    bool is_return_path;
} RDMAContext;

#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)

struct QIOChannelRDMA {
    QIOChannel parent;
    RDMAContext *rdmain;
    RDMAContext *rdmaout;
    QEMUFile *file;
    bool blocking; /* XXX we don't actually honour this yet */
};
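
/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 */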
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;

static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}
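
/*
 * Register a single Chunk.
 * Used by the dest VM to register a single chunk of memory before
 * we transmit.
 */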
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ram_addr_t space */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index;     /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;            /* how many sequential chunks to register */
} RDMARegister;

static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
    RDMALocalBlock *local_block;
    local_block = &rdma->local_ram_blocks.block[reg->current_index];

    if (local_block->is_ram_block) {
        /*
         * current_addr as passed in is an address in the local ram_addr_t
         * space, we need to translate this for the destination
         */
        reg->key.current_addr -= local_block->offset;
        reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
    }
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}

typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* Address in remote ram_addr_t space */
    uint64_t length;    /* length of the chunk */
} RDMACompress;

static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
    comp->value = htonl(comp->value);
    /*
     * comp->offset as passed in is an address in the local ram_addr_t
     * space, we need to translate this for the destination
     */
    comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
    comp->offset += rdma->dest_blocks[comp->block_idx].offset;
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}
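
/*
 * The result of the dest's memory registration produces an "rkey"
 * which the source VM must reference in order to perform
 * the RDMA operation.
 */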
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;

static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}

const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));

static inline uint64_t ram_chunk_index(const uint8_t *start,
                                       const uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                       uint64_t i)
{
    return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
                                  (i << RDMA_REG_CHUNK_SHIFT));
}

static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                     uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
                      (1UL << RDMA_REG_CHUNK_SHIFT);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}
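
/*
 * Grow the local RAMBlock description array by one entry and keep the
 * offset->RDMALocalBlock hash table (when present) pointing at the
 * reallocated entries, since they are used for lookups during RDMA writes.
 */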
static int rdma_add_block(RDMAContext *rdma, const char *block_name,
                          void *host_addr,
                          ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block;
    RDMALocalBlock *old = local->block;

    local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);

    if (local->nb_blocks) {
        int x;

        if (rdma->blockmap) {
            for (x = 0; x < local->nb_blocks; x++) {
                g_hash_table_remove(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset);
                g_hash_table_insert(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset,
                                    &local->block[x]);
            }
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->block_name = g_strdup(block_name);
    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->src_index = ~0U; /* Filled in by the receipt of the block list */
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_new0(uint32_t, block->nb_chunks);

    block->is_ram_block = local->init ? false : true;

    if (rdma->blockmap) {
        g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
    }

    trace_rdma_add_block(block_name, local->nb_blocks,
                         (uintptr_t) block->local_host_addr,
                         block->offset, block->length,
                         (uintptr_t) (block->local_host_addr + block->length),
                         BITS_TO_LONGS(block->nb_chunks) *
                             sizeof(unsigned long) * 8,
                         block->nb_chunks);

    local->nb_blocks++;

    return 0;
}

/*
 * Memory regions need to be registered with the device and queue pairs
 * set up in advance before the migration starts. This tells us where the
 * RAM blocks are so that we can register them individually.
 */
static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t block_offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
}

/*
 * Identify the RAMBlocks and their quantity. They will be references to
 * identify chunk boundaries inside each RAMBlock and also be referenced
 * during dynamic page registration.
 */
static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    int ret;

    assert(rdma->blockmap == NULL);
    memset(local, 0, sizeof *local);
    ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
    if (ret) {
        return ret;
    }
    trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
    rdma->dest_blocks = g_new0(RDMADestBlock,
                               rdma->local_ram_blocks.nb_blocks);
    local->init = true;
    return 0;
}

/*
 * Note: If used outside of cleanup, the caller must ensure that the
 * destination block structures are also updated.
 */
static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *old = local->block;
    int x;

    if (rdma->blockmap) {
        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
    }
    if (block->pmr) {
        int j;

        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    g_free(block->block_name);
    block->block_name = NULL;

    if (rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap,
                                (void *)(uintptr_t)old[x].offset);
        }
    }

    if (local->nb_blocks > 1) {

        local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                   sizeof(RDMALocalBlock) *
                       (local->nb_blocks - (block->index + 1)));
            for (x = block->index; x < local->nb_blocks - 1; x++) {
                local->block[x].index--;
            }
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
                            block->offset, block->length,
                            (uintptr_t)(block->local_host_addr + block->length),
                            BITS_TO_LONGS(block->nb_chunks) *
                                sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks && rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)(uintptr_t)local->block[x].offset,
                                &local->block[x]);
        }
    }

    return 0;
}

/*
 * Put in the log file which RDMA device was opened and the details
 * associated with that device.
 */
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        error_report("Failed to query port information");
        return;
    }

    printf("%s RDMA Device opened: kernel name %s "
           "uverbs device name %s, "
           "infiniband_verbs class device path %s, "
           "infiniband class device path %s, "
           "transport: (%d) %s\n",
           who,
           verbs->device->name,
           verbs->device->dev_name,
           verbs->device->dev_path,
           verbs->device->ibdev_path,
           port.link_layer,
           (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
            ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
                ? "Ethernet" : "Unknown"));
}

/*
 * Put in the log file the RDMA gid addressing information,
 * useful for folks who have trouble understanding the
 * RDMA device hierarchy in the kernel.
 */
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    trace_qemu_rdma_dump_gid(who, sgid, dgid);
}

/*
 * As of now, IPv6 over RoCE / iWARP is not supported by linux.
 * We will try the next addrinfo struct, and fail if there are
 * no other addresses to try.
 *
 * If user is listening on '[::]', then we will not have opened a device
 * yet and have no way of verifying if the device is RoCE or not.
 *
 * In this case, the source VM will throw an error for ALL types of
 * connections (both IPv4 and IPv6) if the destination machine does not have
 * a regular infiniband network available for use.
 *
 * The only way to guarantee that an error is thrown for broken kernels is
 * for the management software to choose a *specific* interface at bind time
 * and validate what type of hardware it is.
 *
 * Unfortunately, this puts the user in a fix:
 *
 *  If the source VM connects with an IPv4 address without knowing that the
 *  destination has bound to '[::]' the migration will unconditionally fail
 *  unless the management software is explicitly listening on the IPv4
 *  address while using a RoCE-based device.
 *
 *  If the source VM connects with an IPv6 address, then we're OK because we
 *  can throw an error on the source (and similarly on the destination).
 *
 *  But in mixed environments, this will be broken for a while until it is
 *  fixed inside linux.
 *
 * We do provide a *tiny* bit of help in this function: We can list all of the
 * devices in the system and check to see if all the devices are RoCE or
 * Infiniband.
 *
 * If we detect that we have a *pure* RoCE environment, then we can safely
 * throw an error even if the management software has specified '[::]' as the
 * bind address.
 *
 * However, if there are multiple heterogeneous devices, then we cannot make
 * this assumption and the user just has to be sure they know what they are
 * doing.
 *
 * Patches are being reviewed on linux-rdma.
 */
static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
{
    /* This bug only exists in linux, to our knowledge. */
#ifdef CONFIG_LINUX
    struct ibv_port_attr port_attr;

    /*
     * Verbs are only NULL if management has bound to '[::]'.
     *
     * Let's iterate through all the devices and see if there any pure IB
     * devices (non-ethernet).
     *
     * If not, then we can safely proceed with the migration.
     * Otherwise, there are no guarantees until the bug is fixed in linux.
     */
    if (!verbs) {
        int num_devices, x;
        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
        bool roce_found = false;
        bool ib_found = false;

        for (x = 0; x < num_devices; x++) {
            verbs = ibv_open_device(dev_list[x]);
            if (!verbs) {
                if (errno == EPERM) {
                    continue;
                } else {
                    return -EINVAL;
                }
            }

            if (ibv_query_port(verbs, 1, &port_attr)) {
                ibv_close_device(verbs);
                ERROR(errp, "Could not query initial IB port");
                return -EINVAL;
            }

            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
                ib_found = true;
            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
                roce_found = true;
            }

            ibv_close_device(verbs);
        }

        if (roce_found) {
            if (ib_found) {
                fprintf(stderr, "WARN: migrations may fail:"
                                " IPv6 over RoCE / iWARP in linux"
                                " is broken. But since you appear to have a"
                                " mixed RoCE / IB environment, be sure to only"
                                " migrate over the IB fabric until the kernel "
                                " fixes the bug.\n");
            } else {
                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
                            " and your management software has specified '[::]'"
                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
                return -ENONET;
            }
        }

        return 0;
    }

    /*
     * If we have a verbs context, that means that something other than
     * '[::]' was used by the management software for binding. In which case
     * we can actually warn the user about a potentially broken kernel.
     */

    /* IB ports start with 1, not 0 */
    if (ibv_query_port(verbs, 1, &port_attr)) {
        ERROR(errp, "Could not query initial IB port");
        return -EINVAL;
    }

    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
                    "(but patches on linux-rdma in progress)");
        return -ENONET;
    }

#endif

    return 0;
}

/*
 * Figure out which RDMA device corresponds to the requested IP hostname.
 * Also create the initial connection manager identifiers for opening
 * the connection.
 */
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    struct rdma_addrinfo *e;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        ERROR(errp, "RDMA hostname has not been set");
        return -EINVAL;
    }

    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create CM channel");
        return -EINVAL;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_resolve_get_addr;
    }

    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        trace_qemu_rdma_resolve_host_trying(rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                RDMA_RESOLVE_TIMEOUT_MS);
        if (!ret) {
            if (e->ai_family == AF_INET6) {
                ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
                if (ret) {
                    continue;
                }
            }
            goto route;
        }
    }

    ERROR(errp, "could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        ERROR(errp, "result not equal to event_addr_resolved %s",
                rdma_event_str(cm_event->event));
        perror("rdma_resolve_addr");
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* resolve route */
    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        ERROR(errp, "could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        ERROR(errp, "result not equal to event_route_resolved: %s",
                rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return ret;
}

/*
 * Create protection domain and completion queues.
 */
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
{
    /* allocate pd */
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        error_report("failed to allocate protection domain");
        return -1;
    }

    /* create completion channel */
    rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->comp_channel) {
        error_report("failed to allocate completion channel");
        goto err_alloc_pd_cq;
    }

    /*
     * Completion queue can be filled by both read and write work requests,
     * so must reflect the sum of both possible queue sizes.
     */
    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                             NULL, rdma->comp_channel, 0);
    if (!rdma->cq) {
        error_report("failed to allocate completion queue");
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
    }
    rdma->pd = NULL;
    rdma->comp_channel = NULL;
    return -1;
}

/*
 * Create queue pairs.
 */
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
{
    struct ibv_qp_init_attr attr = { 0 };
    int ret;

    attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
    attr.cap.max_recv_wr = 3;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.send_cq = rdma->cq;
    attr.recv_cq = rdma->cq;
    attr.qp_type = IBV_QPT_RC;

    ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
    if (ret) {
        return -1;
    }

    rdma->qp = rdma->cm_id->qp;
    return 0;
}

static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
    int i;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    for (i = 0; i < local->nb_blocks; i++) {
        local->block[i].mr =
            ibv_reg_mr(rdma->pd,
                       local->block[i].local_host_addr,
                       local->block[i].length,
                       IBV_ACCESS_LOCAL_WRITE |
                       IBV_ACCESS_REMOTE_WRITE
                       );
        if (!local->block[i].mr) {
            perror("Failed to register local dest ram block!");
            break;
        }
        rdma->total_registrations++;
    }

    if (i >= local->nb_blocks) {
        return 0;
    }

    /* Registration failed partway: undo the registrations made so far. */
    for (i--; i >= 0; i--) {
        ibv_dereg_mr(local->block[i].mr);
        rdma->total_registrations--;
    }

    return -1;
}

/*
 * Find the ram block that corresponds to the page requested to be
 * transmitted by QEMU.
 *
 * Once the block is found, also identify which 'chunk' within that
 * block that the page belongs to.
 *
 * This search cannot fail or the migration will fail.
 */
static int qemu_rdma_search_ram_block(RDMAContext *rdma,
                                      uintptr_t block_offset,
                                      uint64_t offset,
                                      uint64_t length,
                                      uint64_t *block_index,
                                      uint64_t *chunk_index)
{
    uint64_t current_addr = block_offset + offset;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    assert(block);
    assert(current_addr >= block->offset);
    assert((current_addr + length) <= (block->offset + block->length));

    *block_index = block->index;
    *chunk_index = ram_chunk_index(block->local_host_addr,
                block->local_host_addr + (current_addr - block->offset));

    return 0;
}

/*
 * Register a chunk with IB. If the chunk was already registered
 * previously, then skip.
 *
 * Also return the keys associated with the registration needed
 * to perform the actual RDMA operation.
 */
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
        RDMALocalBlock *block, uintptr_t host_addr,
        uint32_t *lkey, uint32_t *rkey, int chunk,
        uint8_t *chunk_start, uint8_t *chunk_end)
{
    if (block->mr) {
        if (lkey) {
            *lkey = block->mr->lkey;
        }
        if (rkey) {
            *rkey = block->mr->rkey;
        }
        return 0;
    }

    /* allocate memory to store chunk MRs */
    if (!block->pmr) {
        block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
    }

    /*
     * If 'rkey', then we're the destination, so grant access to the source.
     *
     * If 'lkey', then we're the source VM, so grant access only to ourselves.
     */
    if (!block->pmr[chunk]) {
        uint64_t len = chunk_end - chunk_start;

        trace_qemu_rdma_register_and_get_keys(len, chunk_start);

        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
                chunk_start, len,
                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
                         IBV_ACCESS_REMOTE_WRITE) : 0));

        if (!block->pmr[chunk]) {
            perror("Failed to register chunk!");
            fprintf(stderr, "Chunk details: block: %d chunk index %d"
                            " start %" PRIuPTR " end %" PRIuPTR
                            " host %" PRIuPTR
                            " local %" PRIuPTR " registrations: %d\n",
                            block->index, chunk, (uintptr_t)chunk_start,
                            (uintptr_t)chunk_end, host_addr,
                            (uintptr_t)block->local_host_addr,
                            rdma->total_registrations);
            return -1;
        }
        rdma->total_registrations++;
    }

    if (lkey) {
        *lkey = block->pmr[chunk]->lkey;
    }
    if (rkey) {
        *rkey = block->pmr[chunk]->rkey;
    }
    return 0;
}

/*
 * Register (at connection time) the memory used for control
 * channel messages.
 */
static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
{
    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    if (rdma->wr_data[idx].control_mr) {
        rdma->total_registrations++;
        return 0;
    }
    error_report("qemu_rdma_reg_control failed");
    return -1;
}

const char *print_wrid(int wrid)
{
    if (wrid >= RDMA_WRID_RECV_CONTROL) {
        return wrid_desc[RDMA_WRID_RECV_CONTROL];
    }
    return wrid_desc[wrid];
}

/*
 * RDMA requires memory registration (mlock/pinning), but this is not good for
 * overcommitment.
 *
 * In preparation for the future where LRU information or workload-specific
 * writable working set memory access behavior is available to QEMU,
 * it would be nice to have in place the ability to UN-register/UN-pin
 * particular memory regions from the RDMA hardware when it is determined that
 * those regions of memory will likely not be accessed again in the near
 * future.
 *
 * While we do not yet have such information right now, the following
 * compile-time option allows us to perform a non-optimized version of this
 * behavior.
 *
 * By uncommenting this option, you will cause *all* RDMA transfers to be
 * unregistered immediately after the transfer completes on both sides of the
 * migration.
 *
 * This will have a terrible impact on migration performance, so until future
 * workload information or LRU information is available, do not attempt to use
 * this feature except for basic testing.
 */
/* #define RDMA_UNREGISTRATION_EXAMPLE */

/*
 * Perform a non-optimized memory unregistration after every transfer
 * for demonstration purposes, only if pin-all is not requested.
 *
 * Potential optimizations:
 * 1. Start a new thread to run this function continuously
 *      - for bit clearing
 *      - and for receipt of unregister messages
 * 2. Use an LRU.
 * 3. Use workload hints.
 */
static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
{
    while (rdma->unregistrations[rdma->unregister_current]) {
        int ret;
        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
        uint64_t chunk =
            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block =
            &(rdma->local_ram_blocks.block[index]);
        RDMARegister reg = { .current_index = index };
        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                 };
        RDMAControlHeader head = { .len = sizeof(RDMARegister),
                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
                                   .repeat = 1,
                                 };

        trace_qemu_rdma_unregister_waiting_proc(chunk,
                                                rdma->unregister_current);

        rdma->unregistrations[rdma->unregister_current] = 0;
        rdma->unregister_current++;

        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
            rdma->unregister_current = 0;
        }

        /*
         * Unregistration is speculative (because migration is single-threaded
         * and we cannot break the protocol's infiniband message ordering).
         * Thus, if the memory is currently being used for transmission,
         * then abort the attempt to unregister and try again
         * later the next time a completion is received for this memory.
         */
        clear_bit(chunk, block->unregister_bitmap);

        if (test_bit(chunk, block->transit_bitmap)) {
            trace_qemu_rdma_unregister_waiting_inflight(chunk);
            continue;
        }

        trace_qemu_rdma_unregister_waiting_send(chunk);

        ret = ibv_dereg_mr(block->pmr[chunk]);
        block->pmr[chunk] = NULL;
        block->remote_keys[chunk] = 0;

        if (ret != 0) {
            perror("unregistration chunk failed");
            return -ret;
        }
        rdma->total_registrations--;

        reg.key.chunk = chunk;
        register_to_network(rdma, &reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                      &resp, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        trace_qemu_rdma_unregister_waiting_complete(chunk);
    }

    return 0;
}

static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
                                    uint64_t chunk)
{
    uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;

    result |= (index << RDMA_WRID_BLOCK_SHIFT);
    result |= (chunk << RDMA_WRID_CHUNK_SHIFT);

    return result;
}

/*
 * Set bit for unregistration in the next iteration.
 * We cannot transmit right here, but will unpin later.
 */
static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
                                        uint64_t chunk, uint64_t wr_id)
{
    if (rdma->unregistrations[rdma->unregister_next] != 0) {
        error_report("rdma migration: queue is full");
    } else {
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
            trace_qemu_rdma_signal_unregister_append(chunk,
                                                     rdma->unregister_next);

            rdma->unregistrations[rdma->unregister_next++] =
                    qemu_rdma_make_wrid(wr_id, index, chunk);

            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
                rdma->unregister_next = 0;
            }
        } else {
            trace_qemu_rdma_signal_unregister_already(chunk);
        }
    }
}

/*
 * Consult the connection manager to see if a work request
 * (of any kind) has completed.
 * Return the work request ID that completed.
 */
static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
                               uint32_t *byte_len)
{
    int ret;
    struct ibv_wc wc;
    uint64_t wr_id;

    ret = ibv_poll_cq(rdma->cq, 1, &wc);

    if (!ret) {
        *wr_id_out = RDMA_WRID_NONE;
        return 0;
    }

    if (ret < 0) {
        error_report("ibv_poll_cq return %d", ret);
        return ret;
    }

    wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;

    if (wc.status != IBV_WC_SUCCESS) {
        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
                        wc.status, ibv_wc_status_str(wc.status));
        fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);

        return -1;
    }

    if (rdma->control_ready_expected &&
        (wr_id >= RDMA_WRID_RECV_CONTROL)) {
        trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
                  wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
        rdma->control_ready_expected = 0;
    }

    if (wr_id == RDMA_WRID_RDMA_WRITE) {
        uint64_t chunk =
            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
                                   index, chunk, block->local_host_addr,
                                   (void *)(uintptr_t)block->remote_host_addr);

        clear_bit(chunk, block->transit_bitmap);

        if (rdma->nb_sent > 0) {
            rdma->nb_sent--;
        }

        if (!rdma->pin_all) {
            /*
             * FYI: If one wanted to signal a specific chunk to be unregistered
             * using LRU or workload-specific information, this is the function
             * you would call to do so. That chunk would then get asynchronously
             * unregistered later.
             */
#ifdef RDMA_UNREGISTRATION_EXAMPLE
            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
#endif
        }
    } else {
        trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
    }

    *wr_id_out = wc.wr_id;
    if (byte_len) {
        *byte_len = wc.byte_len;
    }

    return 0;
}

/* Wait for activity on the completion channel.
 * Returns 0 on success, non-zero on error.
 */
static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
{
    struct rdma_cm_event *cm_event;
    int ret = -1;

    /*
     * Coroutine doesn't start until migration_fd_process_incoming()
     * so don't yield unless we know we're running inside of a coroutine.
     */
    if (rdma->migration_started_on_destination &&
        migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
        yield_until_fd_readable(rdma->comp_channel->fd);
    } else {
        /* This is the source side, we're in a separate thread
         * or destination prior to migration_fd_process_incoming()
         * after postcopy, the destination also in a separate thread.
         * we can't yield; so we have to poll the fd.
         * But we need to be able to handle 'cancel' or an error
         * without hanging forever.
         */
        while (!rdma->error_state && !rdma->received_error) {
            GPollFD pfds[2];
            pfds[0].fd = rdma->comp_channel->fd;
            pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[0].revents = 0;

            pfds[1].fd = rdma->channel->fd;
            pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[1].revents = 0;

            /* 0.1s timeout, should be fine for a 'cancel' */
            switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
            case 2:
            case 1: /* fd active */
                if (pfds[0].revents) {
                    return 0;
                }

                if (pfds[1].revents) {
                    ret = rdma_get_cm_event(rdma->channel, &cm_event);
                    if (ret) {
                        error_report("failed to get cm event while waiting "
                                     "for completion channel");
                        return -EPIPE;
                    }

                    error_report("receive cm event while waiting for "
                                 "completion channel, cm event is %d",
                                 cm_event->event);
                    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
                        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
                        rdma_ack_cm_event(cm_event);
                        return -EPIPE;
                    }
                    rdma_ack_cm_event(cm_event);
                }
                break;

            case 0: /* Timeout, go around again */
                break;

            default: /* Error of some type */
                error_report("%s: poll failed", __func__);
                return -EPIPE;
            }

            if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
                /* Bail out and let the cancellation happen */
                return -EPIPE;
            }
        }
    }

    if (rdma->received_error) {
        return -EPIPE;
    }
    return rdma->error_state;
}

/*
 * Block until the next work request has completed.
 *
 * First poll to see if a work request has already completed,
 * otherwise block.
 *
 * If we encounter completed work requests for IDs other than
 * the one we're interested in, then that's generally an error.
 *
 * The only exception is actual RDMA Write completions. These
 * completions only need to be recorded, but do not actually
 * need further processing.
 */
static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
                                    uint32_t *byte_len)
{
    int num_cq_events = 0, ret = 0;
    struct ibv_cq *cq;
    void *cq_ctx;
    uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

    if (ibv_req_notify_cq(rdma->cq, 0)) {
        return -1;
    }
    /* poll cq first */
    while (wr_id != wrid_requested) {
        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
        if (ret < 0) {
            return ret;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
        if (wr_id != wrid_requested) {
            trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
                       wrid_requested, print_wrid(wr_id), wr_id);
        }
    }

    if (wr_id == wrid_requested) {
        return 0;
    }

    while (1) {
        ret = qemu_rdma_wait_comp_channel(rdma);
        if (ret) {
            goto err_block_for_wrid;
        }

        ret = ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx);
        if (ret) {
            perror("ibv_get_cq_event");
            goto err_block_for_wrid;
        }

        num_cq_events++;

        ret = -ibv_req_notify_cq(cq, 0);
        if (ret) {
            goto err_block_for_wrid;
        }

        while (wr_id != wrid_requested) {
            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
            if (ret < 0) {
                goto err_block_for_wrid;
            }

            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

            if (wr_id == RDMA_WRID_NONE) {
                break;
            }
            if (wr_id != wrid_requested) {
                trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
                                   wrid_requested, print_wrid(wr_id), wr_id);
            }
        }

        if (wr_id == wrid_requested) {
            goto success_block_for_wrid;
        }
    }

success_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return 0;

err_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }

    rdma->error_state = ret;
    return ret;
}

/*
 * Post a SEND message work request for the control channel
 * containing some data and block until the post completes.
 */
static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
                                       RDMAControlHeader *head)
{
    int ret = 0;
    RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
    struct ibv_send_wr *bad_wr;
    struct ibv_sge sge = {
        .addr = (uintptr_t)(wr->control),
        .length = head->len + sizeof(RDMAControlHeader),
        .lkey = wr->control_mr->lkey,
    };
    struct ibv_send_wr send_wr = {
        .wr_id = RDMA_WRID_SEND_CONTROL,
        .opcode = IBV_WR_SEND,
        .send_flags = IBV_SEND_SIGNALED,
        .sg_list = &sge,
        .num_sge = 1,
    };

    trace_qemu_rdma_post_send_control(control_desc(head->type));

    /*
     * We don't actually need to do a memcpy() in here if we used
     * the "sge" properly, but since we're only sending control messages
     * (not RAM in a performance-critical path), then it's OK for now.
     *
     * The copy makes the RDMAControlHeader simpler to manipulate
     * for the time being.
     */
    assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
    memcpy(wr->control, head, sizeof(RDMAControlHeader));
    control_to_network((void *) wr->control);

    if (buf) {
        memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
    }

    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret > 0) {
        error_report("Failed to use post IB SEND for control");
        return -ret;
    }

    ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
    if (ret < 0) {
        error_report("rdma migration: send polling control error");
    }

    return ret;
}

/*
 * Post a RECV work request in anticipation of some future receipt
 * of data on the control channel.
 */
static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
{
    struct ibv_recv_wr *bad_wr;
    struct ibv_sge sge = {
        .addr = (uintptr_t)(rdma->wr_data[idx].control),
        .length = RDMA_CONTROL_MAX_BUFFER,
        .lkey = rdma->wr_data[idx].control_mr->lkey,
    };

    struct ibv_recv_wr recv_wr = {
        .wr_id = RDMA_WRID_RECV_CONTROL + idx,
        .sg_list = &sge,
        .num_sge = 1,
    };

    if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
        return -1;
    }

    return 0;
}

/*
 * Block and wait for a RECV control channel message to arrive.
 */
static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
                RDMAControlHeader *head, int expecting, int idx)
{
    uint32_t byte_len;
    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
                                       &byte_len);

    if (ret < 0) {
        error_report("rdma migration: recv polling control error!");
        return ret;
    }

    network_to_control((void *) rdma->wr_data[idx].control);
    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));

    trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));

    if (expecting == RDMA_CONTROL_NONE) {
        trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
                                                   head->type);
    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
        error_report("Was expecting a %s (%d) control message"
                ", but got: %s (%d), length: %d",
                control_desc(expecting), expecting,
                control_desc(head->type), head->type, head->len);
        if (head->type == RDMA_CONTROL_ERROR) {
            rdma->received_error = true;
        }
        return -EIO;
    }
    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
        error_report("too long length: %d", head->len);
        return -EINVAL;
    }
    if (sizeof(*head) + head->len != byte_len) {
        error_report("Malformed length: %d byte_len %d", head->len, byte_len);
        return -EINVAL;
    }

    return 0;
}

/*
 * When a RECV work request has completed, the work request's
 * buffer is pointed at the header.
 *
 * This will advance the pointer to the data portion
 * of the control message of the work request's buffer that
 * was populated after the work request finished.
 */
static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
                                  RDMAControlHeader *head)
{
    rdma->wr_data[idx].control_len = head->len;
    rdma->wr_data[idx].control_curr =
        rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
}

/*
 * This is an 'atomic' high-level operation to deliver a single, unified
 * control-channel message.
 *
 * Additionally, if the user is expecting some kind of reply to this message,
 * they can request a 'resp' response message be filled in by posting an
 * additional work request on behalf of the user and waiting for an additional
 * completion.
 *
 * The extra (optional) response is used during registration to save us from
 * having to perform an *additional* exchange of messages just to provide a
 * response by instead piggy-backing on the acknowledgement.
 */
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma))
{
    int ret = 0;

    /*
     * Wait until the dest is ready before attempting to deliver the message
     * by waiting for a READY message.
     */
    if (rdma->control_ready_expected) {
        RDMAControlHeader resp;
        ret = qemu_rdma_exchange_get_response(rdma,
                                    &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
        if (ret < 0) {
            return ret;
        }
    }

    /*
     * If the user is expecting a response, post a WR in anticipation of it.
     */
    if (resp) {
        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
        if (ret) {
            error_report("rdma migration: error posting"
                    " extra control recv for anticipated result!");
            return ret;
        }
    }

    /*
     * Post a WR to replace the one we just consumed for the READY message.
     */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting first control recv!");
        return ret;
    }

    /*
     * Deliver the control message that was requested.
     */
    ret = qemu_rdma_post_send_control(rdma, data, head);

    if (ret < 0) {
        error_report("Failed to send control buffer!");
        return ret;
    }

    /*
     * If we're expecting a response, block and wait for it.
     */
    if (resp) {
        if (callback) {
            trace_qemu_rdma_exchange_send_issue_callback();
            ret = callback(rdma);
            if (ret < 0) {
                return ret;
            }
        }

        trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
        ret = qemu_rdma_exchange_get_response(rdma, resp,
                                              resp->type, RDMA_WRID_DATA);

        if (ret < 0) {
            return ret;
        }

        qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
        if (resp_idx) {
            *resp_idx = RDMA_WRID_DATA;
        }
        trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
    }

    rdma->control_ready_expected = 1;

    return 0;
}

/*
 * This is an 'atomic' high-level operation to receive a single, unified
 * control-channel message.
 */
static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
                                   int expecting)
{
    RDMAControlHeader ready = {
        .len = 0,
        .type = RDMA_CONTROL_READY,
        .repeat = 1,
    };
    int ret;

    /*
     * Inform the source that we're ready to receive a message.
     */
    ret = qemu_rdma_post_send_control(rdma, NULL, &ready);

    if (ret < 0) {
        error_report("Failed to send control buffer!");
        return ret;
    }

    /*
     * Block and wait for the message.
     */
    ret = qemu_rdma_exchange_get_response(rdma, head,
                                          expecting, RDMA_WRID_READY);

    if (ret < 0) {
        return ret;
    }

    qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);

    /*
     * Post a new RECV work request to replace the one we just consumed.
     */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting second control recv!");
        return ret;
    }

    return 0;
}

/*
 * Write an actual chunk of memory using RDMA.
 *
 * If we're using dynamic registration on the dest-side, we have to
 * send a registration command first.
 */
static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                               int current_index, uint64_t current_addr,
                               uint64_t length)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;
    int reg_result_idx, ret, count = 0;
    uint64_t chunk, chunks;
    uint8_t *chunk_start, *chunk_end;
    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
    RDMARegister reg;
    RDMARegisterResult *reg_result;
    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
    RDMAControlHeader head = { .len = sizeof(RDMARegister),
                               .type = RDMA_CONTROL_REGISTER_REQUEST,
                               .repeat = 1,
                             };

retry:
    sge.addr = (uintptr_t)(block->local_host_addr +
                            (current_addr - block->offset));
    sge.length = length;

    chunk = ram_chunk_index(block->local_host_addr,
                            (uint8_t *)(uintptr_t)sge.addr);
    chunk_start = ram_chunk_start(block, chunk);

    if (block->is_ram_block) {
        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    } else {
        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    }

    trace_qemu_rdma_write_one_top(chunks + 1,
                                  (chunks + 1) *
                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);

    chunk_end = ram_chunk_end(block, chunk + chunks);

    if (!rdma->pin_all) {
#ifdef RDMA_UNREGISTRATION_EXAMPLE
        qemu_rdma_unregister_waiting(rdma);
#endif
    }

    while (test_bit(chunk, block->transit_bitmap)) {
        (void)count;
        trace_qemu_rdma_write_one_block(count++, current_index, chunk,
                sge.addr, length, rdma->nb_sent, block->nb_chunks);

        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);

        if (ret < 0) {
            error_report("Failed to wait for previous write to complete "
                    "block %d chunk %" PRIu64
                    " current %" PRIu64 " len %" PRIu64 " %d",
                    current_index, chunk, sge.addr, length, rdma->nb_sent);
            return ret;
        }
    }

    if (!rdma->pin_all || !block->is_ram_block) {
        if (!block->remote_keys[chunk]) {
            /*
             * This chunk has not yet been registered, so first check to see
             * if the entire chunk is zero. If so, tell the other side to
             * memset() + madvise() the entire chunk without RDMA.
             */
            if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
                RDMACompress comp = {
                    .offset = current_addr,
                    .value = 0,
                    .block_idx = current_index,
                    .length = length,
                };

                head.len = sizeof(comp);
                head.type = RDMA_CONTROL_COMPRESS;

                trace_qemu_rdma_write_one_zero(chunk, sge.length,
                                               current_index, current_addr);

                compress_to_network(rdma, &comp);
                ret = qemu_rdma_exchange_send(rdma, &head,
                                (uint8_t *) &comp, NULL, NULL, NULL);

                if (ret < 0) {
                    return -EIO;
                }

                acct_update_position(f, sge.length, true);

                return 1;
            }

            /*
             * Otherwise, tell other side to register.
             */
            reg.current_index = current_index;
            if (block->is_ram_block) {
                reg.key.current_addr = current_addr;
            } else {
                reg.key.chunk = chunk;
            }
            reg.chunks = chunks;

            trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
                                              current_addr);

            register_to_network(rdma, &reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                    &resp, &reg_result_idx, NULL);
            if (ret < 0) {
                return ret;
            }

            /* try to overlap this single registration with the rest: */
            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                error_report("cannot get lkey");
                return -EINVAL;
            }

            reg_result = (RDMARegisterResult *)
                    rdma->wr_data[reg_result_idx].control_curr;

            network_to_result(reg_result);

            trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
                                                 reg_result->rkey, chunk);

            block->remote_keys[chunk] = reg_result->rkey;
            block->remote_host_addr = reg_result->host_addr;
        } else {
            /* already registered before */
            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                error_report("cannot get lkey!");
                return -EINVAL;
            }
        }

        send_wr.wr.rdma.rkey = block->remote_keys[chunk];
    } else {
        send_wr.wr.rdma.rkey = block->remote_rkey;

        if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                            &sge.lkey, NULL, chunk,
                                            chunk_start, chunk_end)) {
            error_report("cannot get lkey!");
            return -EINVAL;
        }
    }

    /*
     * Encode the ram block index and chunk within this wrid.
     * We will use this information at the time of completion
     * to figure out which bitmap to check against and then which
     * chunk in the bitmap to look for.
     */
    send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
                                        current_index, chunk);

    send_wr.opcode = IBV_WR_RDMA_WRITE;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;
    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
                                  (current_addr - block->offset);

    trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
                                   sge.length);

    /*
     * ibv_post_send() does not return negative error numbers,
     * per the specification they are positive - no idea why.
     */
    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret == ENOMEM) {
        trace_qemu_rdma_write_one_queue_full();
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            error_report("rdma migration: failed to make "
                         "room in full send queue! %d", ret);
            return ret;
        }

        goto retry;

    } else if (ret > 0) {
        perror("rdma migration: post rdma write failed");
        return -ret;
    }

    set_bit(chunk, block->transit_bitmap);
    acct_update_position(f, sge.length, false);
    rdma->total_writes++;

    return 0;
}

/*
 * Push out any unwritten RDMA operations.
 *
 * We support sending out multiple chunks at the same time.
 * Not all of them need to get signaled in the completion queue.
 */
static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (!rdma->current_length) {
        return 0;
    }

    ret = qemu_rdma_write_one(f, rdma,
            rdma->current_index, rdma->current_addr, rdma->current_length);

    if (ret < 0) {
        return ret;
    }

    if (ret == 0) {
        rdma->nb_sent++;
        trace_qemu_rdma_write_flush(rdma->nb_sent);
    }

    rdma->current_length = 0;
    rdma->current_addr = 0;

    return 0;
}

static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
                    uint64_t offset, uint64_t len)
{
    RDMALocalBlock *block;
    uint8_t *host_addr;
    uint8_t *chunk_end;

    if (rdma->current_index < 0) {
        return 0;
    }

    if (rdma->current_chunk < 0) {
        return 0;
    }

    block = &(rdma->local_ram_blocks.block[rdma->current_index]);
    host_addr = block->local_host_addr + (offset - block->offset);
    chunk_end = ram_chunk_end(block, rdma->current_chunk);

    if (rdma->current_length == 0) {
        return 0;
    }

    /*
     * Only merge into chunk sequentially.
     */
    if (offset != (rdma->current_addr + rdma->current_length)) {
        return 0;
    }

    if (offset < block->offset) {
        return 0;
    }

    if ((offset + len) > (block->offset + block->length)) {
        return 0;
    }

    if ((host_addr + len) > chunk_end) {
        return 0;
    }

    return 1;
}

/*
 * We're not actually writing here, but doing three things:
 *
 * 1. Identify the chunk the buffer belongs to.
 * 2. If the chunk is full (or the buffer doesn't belong to the current
 *    chunk), then start a new chunk and flush() the old chunk.
 * 3. To keep the hardware busy, we also group chunks into batches
 *    and only require that a batch gets acknowledged in the completion
 *    queue instead of each individual chunk.
 */
static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
                           uint64_t block_offset, uint64_t offset,
                           uint64_t len)
{
    uint64_t current_addr = block_offset + offset;
    uint64_t index = rdma->current_index;
    uint64_t chunk = rdma->current_chunk;
    int ret;

    /* If we cannot merge it, we flush the current buffer first. */
    if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
        ret = qemu_rdma_write_flush(f, rdma);
        if (ret) {
            return ret;
        }
        rdma->current_length = 0;
        rdma->current_addr = current_addr;

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, len, &index, &chunk);
        if (ret) {
            error_report("ram block search failed");
            return ret;
        }
        rdma->current_index = index;
        rdma->current_chunk = chunk;
    }

    /* merge it */
    rdma->current_length += len;

    /* flush it if buffer is too large */
    if (rdma->current_length >= RDMA_MERGE_MAX) {
        return qemu_rdma_write_flush(f, rdma);
    }

    return 0;
}

static void qemu_rdma_cleanup(RDMAContext *rdma)
{
    int idx;

    if (rdma->cm_id && rdma->connected) {
        if ((rdma->error_state ||
             migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
            !rdma->received_error) {
            RDMAControlHeader head = { .len = 0,
                                       .type = RDMA_CONTROL_ERROR,
                                       .repeat = 1,
                                     };
            error_report("Early error. Sending error.");
            qemu_rdma_post_send_control(rdma, NULL, &head);
        }

        rdma_disconnect(rdma->cm_id);
        trace_qemu_rdma_cleanup_disconnect();
        rdma->connected = false;
    }

    if (rdma->channel) {
        qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
    }
    g_free(rdma->dest_blocks);
    rdma->dest_blocks = NULL;

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        if (rdma->wr_data[idx].control_mr) {
            rdma->total_registrations--;
            ibv_dereg_mr(rdma->wr_data[idx].control_mr);
        }
        rdma->wr_data[idx].control_mr = NULL;
    }

    if (rdma->local_ram_blocks.block) {
        while (rdma->local_ram_blocks.nb_blocks) {
            rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
        }
    }

    if (rdma->qp) {
        rdma_destroy_qp(rdma->cm_id);
        rdma->qp = NULL;
    }
    if (rdma->cq) {
        ibv_destroy_cq(rdma->cq);
        rdma->cq = NULL;
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
        rdma->comp_channel = NULL;
    }
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
        rdma->pd = NULL;
    }
    if (rdma->cm_id) {
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
    }

    /* On the destination side, listen_id and channel are shared */
    if (rdma->listen_id) {
        if (!rdma->is_return_path) {
            rdma_destroy_id(rdma->listen_id);
        }
        rdma->listen_id = NULL;

        if (rdma->channel) {
            if (!rdma->is_return_path) {
                rdma_destroy_event_channel(rdma->channel);
            }
            rdma->channel = NULL;
        }
    }

    if (rdma->channel) {
        rdma_destroy_event_channel(rdma->channel);
        rdma->channel = NULL;
    }
    g_free(rdma->host);
    rdma->host = NULL;
}

static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
{
    int ret, idx;
    Error *local_err = NULL, **temp = &local_err;

    /*
     * Will be validated against destination's actual capabilities
     * after the connect() completes.
     */
    rdma->pin_all = pin_all;

    ret = qemu_rdma_resolve_host(rdma, temp);
    if (ret) {
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
                    " limits may be too low. Please check $ ulimit -a # and "
                    "search for 'ulimit -l' in the output");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating qp!");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error initializing ram blocks!");
        goto err_rdma_source_init;
    }

    /* Build the hash that maps from offset to RAMBlock */
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
        g_hash_table_insert(rdma->blockmap,
                (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
                &rdma->local_ram_blocks.block[idx]);
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            ERROR(temp, "rdma migration: error registering %d control!",
                    idx);
            goto err_rdma_source_init;
        }
    }

    return 0;

err_rdma_source_init:
    error_propagate(errp, local_err);
    qemu_rdma_cleanup(rdma);
    return -1;
}

static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
{
    RDMACapabilities cap = {
        .version = RDMA_CONTROL_VERSION_CURRENT,
        .flags = 0,
    };
    struct rdma_conn_param conn_param = { .initiator_depth = 2,
                                          .retry_count = 5,
                                          .private_data = &cap,
                                          .private_data_len = sizeof(cap),
                                        };
    struct rdma_cm_event *cm_event;
    int ret;

    /*
     * Only negotiate the capability with destination if the user
     * on the source first requested the capability.
     */
    if (rdma->pin_all) {
        trace_qemu_rdma_connect_pin_all_requested();
        cap.flags |= RDMA_CAPABILITY_PIN_ALL;
    }

    caps_to_network(&cap);

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        ERROR(errp, "posting second control recv");
        goto err_rdma_source_connect;
    }

    ret = rdma_connect(rdma->cm_id, &conn_param);
    if (ret) {
        perror("rdma_connect");
        ERROR(errp, "connecting to destination!");
        goto err_rdma_source_connect;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        perror("rdma_get_cm_event after rdma_connect");
        ERROR(errp, "connecting to destination!");
        goto err_rdma_source_connect;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_source_connect;
    }
    rdma->connected = true;

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
    network_to_caps(&cap);

    /*
     * Verify that the *requested* capabilities are supported by the
     * destination and disable them otherwise.
     */
    if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
        ERROR(errp, "Server cannot support pinning all memory. "
                    "Will register memory dynamically.");
        rdma->pin_all = false;
    }

    trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);

    rdma_ack_cm_event(cm_event);

    rdma->control_ready_expected = 1;
    rdma->nb_sent = 0;
    return 0;

err_rdma_source_connect:
    qemu_rdma_cleanup(rdma);
    return -1;
}
2536
2537static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2538{
2539 int ret, idx;
2540 struct rdma_cm_id *listen_id;
2541 char ip[40] = "unknown";
2542 struct rdma_addrinfo *res, *e;
2543 char port_str[16];
2544
2545 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2546 rdma->wr_data[idx].control_len = 0;
2547 rdma->wr_data[idx].control_curr = NULL;
2548 }
2549
2550 if (!rdma->host || !rdma->host[0]) {
2551 ERROR(errp, "RDMA host is not set!");
2552 rdma->error_state = -EINVAL;
2553 return -1;
2554 }
2555
2556 rdma->channel = rdma_create_event_channel();
2557 if (!rdma->channel) {
2558 ERROR(errp, "could not create rdma event channel");
2559 rdma->error_state = -EINVAL;
2560 return -1;
2561 }
2562
2563
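    /* create CM id */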
2564 ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2565 if (ret) {
2566 ERROR(errp, "could not create cm_id!");
2567 goto err_dest_init_create_listen_id;
2568 }
2569
2570 snprintf(port_str, 16, "%d", rdma->port);
2571 port_str[15] = '\0';
2572
2573 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2574 if (ret < 0) {
2575 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
2576 goto err_dest_init_bind_addr;
2577 }
2578
2579 for (e = res; e != NULL; e = e->ai_next) {
2580 inet_ntop(e->ai_family,
2581 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2582 trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2583 ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2584 if (ret) {
2585 continue;
2586 }
2587 if (e->ai_family == AF_INET6) {
2588 ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
2589 if (ret) {
2590 continue;
2591 }
2592 }
2593 break;
2594 }
2595
2596 if (!e) {
        ERROR(errp, "could not rdma_bind_addr!");
2598 goto err_dest_init_bind_addr;
2599 }
2600
2601 rdma->listen_id = listen_id;
2602 qemu_rdma_dump_gid("dest_init", listen_id);
2603 return 0;
2604
2605err_dest_init_bind_addr:
2606 rdma_destroy_id(listen_id);
2607err_dest_init_create_listen_id:
2608 rdma_destroy_event_channel(rdma->channel);
2609 rdma->channel = NULL;
2610 rdma->error_state = ret;
2611 return ret;
}
2614
2615static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
2616 RDMAContext *rdma)
2617{
2618 int idx;
2619
2620 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2621 rdma_return_path->wr_data[idx].control_len = 0;
2622 rdma_return_path->wr_data[idx].control_curr = NULL;
2623 }
2624
2625
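    /* The CM channel and listen id are shared with the main path */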
2626 rdma_return_path->channel = rdma->channel;
2627 rdma_return_path->listen_id = rdma->listen_id;
2628
2629 rdma->return_path = rdma_return_path;
2630 rdma_return_path->return_path = rdma;
2631 rdma_return_path->is_return_path = true;
2632}
2633
2634static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2635{
2636 RDMAContext *rdma = NULL;
2637 InetSocketAddress *addr;
2638
2639 if (host_port) {
2640 rdma = g_new0(RDMAContext, 1);
2641 rdma->current_index = -1;
2642 rdma->current_chunk = -1;
2643
2644 addr = g_new(InetSocketAddress, 1);
2645 if (!inet_parse(addr, host_port, NULL)) {
2646 rdma->port = atoi(addr->port);
2647 rdma->host = g_strdup(addr->host);
2648 } else {
2649 ERROR(errp, "bad RDMA migration address '%s'", host_port);
2650 g_free(rdma);
2651 rdma = NULL;
2652 }
2653
2654 qapi_free_InetSocketAddress(addr);
2655 }
2656
2657 return rdma;
2658}
2659
2660
2661
2662
2663
2664
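/*
 * QEMUFile interface to the control channel.
 * SEND messages for control only.
 * VM's ram is handled with regular RDMA messages.
 */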
2665static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
2666 const struct iovec *iov,
2667 size_t niov,
2668 int *fds,
2669 size_t nfds,
2670 Error **errp)
2671{
2672 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2673 QEMUFile *f = rioc->file;
2674 RDMAContext *rdma;
2675 int ret;
2676 ssize_t done = 0;
2677 size_t i;
2678 size_t len = 0;
2679
2680 RCU_READ_LOCK_GUARD();
2681 rdma = qatomic_rcu_read(&rioc->rdmaout);
2682
2683 if (!rdma) {
2684 return -EIO;
2685 }
2686
2687 CHECK_ERROR_STATE();
2688
2689
2690
2691
2692
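    /*
     * Push out any writes that
     * we're queued up for VM's ram.
     */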
2693 ret = qemu_rdma_write_flush(f, rdma);
2694 if (ret < 0) {
2695 rdma->error_state = ret;
2696 return ret;
2697 }
2698
2699 for (i = 0; i < niov; i++) {
2700 size_t remaining = iov[i].iov_len;
        uint8_t *data = (void *)iov[i].iov_base;
2702 while (remaining) {
2703 RDMAControlHeader head;
2704
2705 len = MIN(remaining, RDMA_SEND_INCREMENT);
2706 remaining -= len;
2707
2708 head.len = len;
2709 head.type = RDMA_CONTROL_QEMU_FILE;
2710
2711 ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2712
2713 if (ret < 0) {
2714 rdma->error_state = ret;
2715 return ret;
2716 }
2717
2718 data += len;
2719 done += len;
2720 }
2721 }
2722
2723 return done;
2724}
2725
2726static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2727 size_t size, int idx)
2728{
2729 size_t len = 0;
2730
2731 if (rdma->wr_data[idx].control_len) {
2732 trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2733
2734 len = MIN(size, rdma->wr_data[idx].control_len);
2735 memcpy(buf, rdma->wr_data[idx].control_curr, len);
2736 rdma->wr_data[idx].control_curr += len;
2737 rdma->wr_data[idx].control_len -= len;
2738 }
2739
2740 return len;
2741}
2742
2743
2744
2745
2746
2747
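/*
 * QEMUFile interface to the control channel.
 * RDMA links don't use bytestreams, so we have to
 * return bytes to QEMUFile opportunistically.
 */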
2748static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
2749 const struct iovec *iov,
2750 size_t niov,
2751 int **fds,
2752 size_t *nfds,
2753 Error **errp)
2754{
2755 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2756 RDMAContext *rdma;
2757 RDMAControlHeader head;
2758 int ret = 0;
2759 ssize_t i;
2760 size_t done = 0;
2761
2762 RCU_READ_LOCK_GUARD();
2763 rdma = qatomic_rcu_read(&rioc->rdmain);
2764
2765 if (!rdma) {
2766 return -EIO;
2767 }
2768
2769 CHECK_ERROR_STATE();
2770
2771 for (i = 0; i < niov; i++) {
2772 size_t want = iov[i].iov_len;
2773 uint8_t *data = (void *)iov[i].iov_base;
2774
2775
2776
2777
2778
2779
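        /*
         * First, we hold on to the last SEND message we
         * were given and dish out the bytes until we run
         * out of bytes.
         */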
2780 ret = qemu_rdma_fill(rdma, data, want, 0);
2781 done += ret;
2782 want -= ret;
2783
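        /* Got what we needed, so go to next iovec */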
2784 if (want == 0) {
2785 continue;
2786 }
2787
2788
2789
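        /*
         * If we got any data so far, then don't wait
         * for more, just return what we have.
         */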
2790 if (done > 0) {
2791 break;
2792 }
2793
2794
2795
2796
2797
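        /*
         * We've got nothing at all, so wait for
         * more to arrive.
         */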
2798 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2799
2800 if (ret < 0) {
2801 rdma->error_state = ret;
2802 return ret;
2803 }
2804
2805
2806
2807
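        /*
         * SEND was received with new bytes, now try again.
         */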
2808 ret = qemu_rdma_fill(rdma, data, want, 0);
2809 done += ret;
2810 want -= ret;
2811
2812
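        /* Still didn't get enough, so just return what we have */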
2813 if (want) {
2814 if (done == 0) {
2815 return QIO_CHANNEL_ERR_BLOCK;
2816 } else {
2817 break;
2818 }
2819 }
2820 }
2821 return done;
2822}
2823
2824
2825
2826
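/*
 * Block until all the outstanding chunks have been delivered by the hardware.
 */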
2827static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2828{
2829 int ret;
2830
2831 if (qemu_rdma_write_flush(f, rdma) < 0) {
2832 return -EIO;
2833 }
2834
2835 while (rdma->nb_sent) {
2836 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2837 if (ret < 0) {
2838 error_report("rdma migration: complete polling error!");
2839 return -EIO;
2840 }
2841 }
2842
2843 qemu_rdma_unregister_waiting(rdma);
2844
2845 return 0;
2846}
2847
2848
2849static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
2850 bool blocking,
2851 Error **errp)
2852{
2853 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2854
2855 rioc->blocking = blocking;
2856 return 0;
2857}
2858
2859
2860typedef struct QIOChannelRDMASource QIOChannelRDMASource;
2861struct QIOChannelRDMASource {
2862 GSource parent;
2863 QIOChannelRDMA *rioc;
2864 GIOCondition condition;
2865};
2866
2867static gboolean
2868qio_channel_rdma_source_prepare(GSource *source,
2869 gint *timeout)
2870{
2871 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2872 RDMAContext *rdma;
2873 GIOCondition cond = 0;
2874 *timeout = -1;
2875
2876 RCU_READ_LOCK_GUARD();
2877 if (rsource->condition == G_IO_IN) {
2878 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
2879 } else {
2880 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
2881 }
2882
2883 if (!rdma) {
        error_report("RDMAContext is NULL when preparing GSource");
2885 return FALSE;
2886 }
2887
2888 if (rdma->wr_data[0].control_len) {
2889 cond |= G_IO_IN;
2890 }
2891 cond |= G_IO_OUT;
2892
2893 return cond & rsource->condition;
2894}
2895
2896static gboolean
2897qio_channel_rdma_source_check(GSource *source)
2898{
2899 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2900 RDMAContext *rdma;
2901 GIOCondition cond = 0;
2902
2903 RCU_READ_LOCK_GUARD();
2904 if (rsource->condition == G_IO_IN) {
2905 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
2906 } else {
2907 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
2908 }
2909
2910 if (!rdma) {
        error_report("RDMAContext is NULL when checking GSource");
2912 return FALSE;
2913 }
2914
2915 if (rdma->wr_data[0].control_len) {
2916 cond |= G_IO_IN;
2917 }
2918 cond |= G_IO_OUT;
2919
2920 return cond & rsource->condition;
2921}
2922
2923static gboolean
2924qio_channel_rdma_source_dispatch(GSource *source,
2925 GSourceFunc callback,
2926 gpointer user_data)
2927{
2928 QIOChannelFunc func = (QIOChannelFunc)callback;
2929 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2930 RDMAContext *rdma;
2931 GIOCondition cond = 0;
2932
2933 RCU_READ_LOCK_GUARD();
2934 if (rsource->condition == G_IO_IN) {
2935 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
2936 } else {
2937 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
2938 }
2939
2940 if (!rdma) {
        error_report("RDMAContext is NULL when dispatching GSource");
2942 return FALSE;
2943 }
2944
2945 if (rdma->wr_data[0].control_len) {
2946 cond |= G_IO_IN;
2947 }
2948 cond |= G_IO_OUT;
2949
2950 return (*func)(QIO_CHANNEL(rsource->rioc),
2951 (cond & rsource->condition),
2952 user_data);
2953}
2954
2955static void
2956qio_channel_rdma_source_finalize(GSource *source)
2957{
2958 QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
2959
2960 object_unref(OBJECT(ssource->rioc));
2961}
2962
2963GSourceFuncs qio_channel_rdma_source_funcs = {
2964 qio_channel_rdma_source_prepare,
2965 qio_channel_rdma_source_check,
2966 qio_channel_rdma_source_dispatch,
2967 qio_channel_rdma_source_finalize
2968};
2969
2970static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
2971 GIOCondition condition)
2972{
2973 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2974 QIOChannelRDMASource *ssource;
2975 GSource *source;
2976
2977 source = g_source_new(&qio_channel_rdma_source_funcs,
2978 sizeof(QIOChannelRDMASource));
2979 ssource = (QIOChannelRDMASource *)source;
2980
2981 ssource->rioc = rioc;
2982 object_ref(OBJECT(rioc));
2983
2984 ssource->condition = condition;
2985
2986 return source;
2987}
2988
2989static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
2990 AioContext *ctx,
2991 IOHandler *io_read,
2992 IOHandler *io_write,
2993 void *opaque)
2994{
2995 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2996 if (io_read) {
2997 aio_set_fd_handler(ctx, rioc->rdmain->comp_channel->fd,
2998 false, io_read, io_write, NULL, opaque);
2999 } else {
3000 aio_set_fd_handler(ctx, rioc->rdmaout->comp_channel->fd,
3001 false, io_read, io_write, NULL, opaque);
3002 }
3003}
3004
3005struct rdma_close_rcu {
3006 struct rcu_head rcu;
3007 RDMAContext *rdmain;
3008 RDMAContext *rdmaout;
3009};
3010
3011
3012static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3013{
3014 if (rcu->rdmain) {
3015 qemu_rdma_cleanup(rcu->rdmain);
3016 }
3017
3018 if (rcu->rdmaout) {
3019 qemu_rdma_cleanup(rcu->rdmaout);
3020 }
3021
3022 g_free(rcu->rdmain);
3023 g_free(rcu->rdmaout);
3024 g_free(rcu);
3025}
3026
3027static int qio_channel_rdma_close(QIOChannel *ioc,
3028 Error **errp)
3029{
3030 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3031 RDMAContext *rdmain, *rdmaout;
3032 struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3033
3034 trace_qemu_rdma_close();
3035
3036 rdmain = rioc->rdmain;
3037 if (rdmain) {
3038 qatomic_rcu_set(&rioc->rdmain, NULL);
3039 }
3040
3041 rdmaout = rioc->rdmaout;
3042 if (rdmaout) {
3043 qatomic_rcu_set(&rioc->rdmaout, NULL);
3044 }
3045
3046 rcu->rdmain = rdmain;
3047 rcu->rdmaout = rdmaout;
3048 call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
3049
3050 return 0;
3051}
3052
3053static int
3054qio_channel_rdma_shutdown(QIOChannel *ioc,
3055 QIOChannelShutdown how,
3056 Error **errp)
3057{
3058 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3059 RDMAContext *rdmain, *rdmaout;
3060
3061 RCU_READ_LOCK_GUARD();
3062
3063 rdmain = qatomic_rcu_read(&rioc->rdmain);
    rdmaout = qatomic_rcu_read(&rioc->rdmaout);
3065
3066 switch (how) {
3067 case QIO_CHANNEL_SHUTDOWN_READ:
3068 if (rdmain) {
3069 rdmain->error_state = -1;
3070 }
3071 break;
3072 case QIO_CHANNEL_SHUTDOWN_WRITE:
3073 if (rdmaout) {
3074 rdmaout->error_state = -1;
3075 }
3076 break;
3077 case QIO_CHANNEL_SHUTDOWN_BOTH:
3078 default:
3079 if (rdmain) {
3080 rdmain->error_state = -1;
3081 }
3082 if (rdmaout) {
3083 rdmaout->error_state = -1;
3084 }
3085 break;
3086 }
3087
3088 return 0;
3089}
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
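/*
 * Parameters:
 *    @offset == 0 :
 *        This means that 'block_offset' is a full virtual address that does
 *        not belong to a RAMBlock of the virtual machine and instead
 *        represents a private malloc'd memory area that the caller wishes to
 *        transfer.
 *
 *    @offset != 0 :
 *        Offset is an offset to be added to block_offset. And
 *        the whole block_offset is a ram block address.
 *
 *    @size > 0 :
 *        Initiate a transfer of this size.
 *
 *    @size == 0 :
 *        A 'hint' or 'advice' that means that we wish to speculatively
 *        and asynchronously unregister this memory. In this case, there is no
 *        guarantee that the unregister will actually happen, for example,
 *        if the memory is being actively transmitted. Additionally, the memory
 *        may be re-registered at any future time if a write within the same
 *        chunk was requested again, even if you attempted to unregister it
 *        here.
 *
 *    @bytes_sent : User-specified pointer to indicate how many bytes were
 *                  sent. Usually, this will not be more than a few bytes of
 *                  the protocol because most transfers are sent
 *                  asynchronously.
 */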
3125static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
3126 ram_addr_t block_offset, ram_addr_t offset,
3127 size_t size, uint64_t *bytes_sent)
3128{
3129 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3130 RDMAContext *rdma;
3131 int ret;
3132
3133 RCU_READ_LOCK_GUARD();
3134 rdma = qatomic_rcu_read(&rioc->rdmaout);
3135
3136 if (!rdma) {
3137 return -EIO;
3138 }
3139
3140 CHECK_ERROR_STATE();
3141
3142 if (migration_in_postcopy()) {
3143 return RAM_SAVE_CONTROL_NOT_SUPP;
3144 }
3145
3146 qemu_fflush(f);
3147
3148 if (size > 0) {
3149
3150
3151
3152
3153
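        /*
         * Add this page to the current 'chunk'. If the chunk
         * is full, or the page doesn't belong to the current chunk,
         * an actual RDMA write will occur and a new chunk will be formed.
         */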
3154 ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
3155 if (ret < 0) {
3156 error_report("rdma migration: write error! %d", ret);
3157 goto err;
3158 }
3159
3160
3161
3162
3163
3164
3165
3166
3167
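        /*
         * We always return 1 byte because the RDMA
         * protocol is completely asynchronous. We do not yet know
         * whether an identified chunk is zero or not because we're
         * waiting for other pages to potentially be merged with
         * the current chunk.
         */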
3168 if (bytes_sent) {
3169 *bytes_sent = 1;
3170 }
3171 } else {
3172 uint64_t index, chunk;
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
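        /*
         * size == 0 is a hint that the caller wants to speculatively
         * unregister this chunk; look the chunk up and signal the
         * unregistration request.
         */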
3185 ret = qemu_rdma_search_ram_block(rdma, block_offset,
3186 offset, size, &index, &chunk);
3187
3188 if (ret) {
3189 error_report("ram block search failed");
3190 goto err;
3191 }
3192
3193 qemu_rdma_signal_unregister(rdma, index, chunk, 0);
3194
3195
3196
3197
3198
3199
3200
3201
3202
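        /*
         * TODO: Synchronous, guaranteed unregistration (should not occur
         * during fast-path). Otherwise, unregisters will be processed on
         * the next call to qemu_rdma_drain_cq().
         */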
3203 }
3204
3205
3206
3207
3208
3209
3210
3211
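    /*
     * Drain the Completion Queue if possible, but do not block,
     * just poll.
     *
     * If nothing to poll, the end of the iteration will do this
     * again to make sure we don't overflow the request queue.
     */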
3212 while (1) {
3213 uint64_t wr_id, wr_id_in;
        ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
3215 if (ret < 0) {
3216 error_report("rdma migration: polling error! %d", ret);
3217 goto err;
3218 }
3219
3220 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3221
3222 if (wr_id == RDMA_WRID_NONE) {
3223 break;
3224 }
3225 }
3226
3227 return RAM_SAVE_CONTROL_DELAYED;
3228err:
3229 rdma->error_state = ret;
3230 return ret;
3231}
3232
3233static void rdma_accept_incoming_migration(void *opaque);
3234
3235static void rdma_cm_poll_handler(void *opaque)
3236{
3237 RDMAContext *rdma = opaque;
3238 int ret;
3239 struct rdma_cm_event *cm_event;
3240 MigrationIncomingState *mis = migration_incoming_get_current();
3241
3242 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3243 if (ret) {
3244 error_report("get_cm_event failed %d", errno);
3245 return;
3246 }
    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
        if (!rdma->error_state &&
            mis->state != MIGRATION_STATUS_COMPLETED) {
            error_report("receive cm event, cm event is %d", cm_event->event);
            rdma->error_state = -EPIPE;
            if (rdma->return_path) {
                rdma->return_path->error_state = -EPIPE;
            }
        }
        /* The event must not be acked until we are done reading its fields */
        rdma_ack_cm_event(cm_event);

        if (mis->migration_incoming_co) {
            qemu_coroutine_enter(mis->migration_incoming_co);
        }
        return;
    }
    rdma_ack_cm_event(cm_event);
3266}
3267
3268static int qemu_rdma_accept(RDMAContext *rdma)
3269{
3270 RDMACapabilities cap;
3271 struct rdma_conn_param conn_param = {
3272 .responder_resources = 2,
3273 .private_data = &cap,
3274 .private_data_len = sizeof(cap),
3275 };
3276 struct rdma_cm_event *cm_event;
3277 struct ibv_context *verbs;
3278 int ret = -EINVAL;
3279 int idx;
3280
3281 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3282 if (ret) {
3283 goto err_rdma_dest_wait;
3284 }
3285
3286 if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3287 rdma_ack_cm_event(cm_event);
3288 goto err_rdma_dest_wait;
3289 }
3290
3291 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3292
3293 network_to_caps(&cap);
3294
3295 if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3296 error_report("Unknown source RDMA version: %d, bailing...",
3297 cap.version);
3298 rdma_ack_cm_event(cm_event);
3299 goto err_rdma_dest_wait;
3300 }
3301
3302
3303
3304
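    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */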
3305 cap.flags &= known_capabilities;
3306
3307
3308
3309
3310
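    /*
     * Enable the ones that we do know about.
     * Add other checks here as new ones are introduced.
     */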
3311 if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3312 rdma->pin_all = true;
3313 }
3314
3315 rdma->cm_id = cm_event->id;
3316 verbs = cm_event->id->verbs;
3317
3318 rdma_ack_cm_event(cm_event);
3319
3320 trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3321
3322 caps_to_network(&cap);
3323
3324 trace_qemu_rdma_accept_pin_verbsc(verbs);
3325
3326 if (!rdma->verbs) {
3327 rdma->verbs = verbs;
3328 } else if (rdma->verbs != verbs) {
3329 error_report("ibv context not matching %p, %p!", rdma->verbs,
3330 verbs);
3331 goto err_rdma_dest_wait;
3332 }
3333
3334 qemu_rdma_dump_id("dest_init", verbs);
3335
3336 ret = qemu_rdma_alloc_pd_cq(rdma);
3337 if (ret) {
3338 error_report("rdma migration: error allocating pd and cq!");
3339 goto err_rdma_dest_wait;
3340 }
3341
3342 ret = qemu_rdma_alloc_qp(rdma);
3343 if (ret) {
3344 error_report("rdma migration: error allocating qp!");
3345 goto err_rdma_dest_wait;
3346 }
3347
3348 ret = qemu_rdma_init_ram_blocks(rdma);
3349 if (ret) {
3350 error_report("rdma migration: error initializing ram blocks!");
3351 goto err_rdma_dest_wait;
3352 }
3353
3354 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
3355 ret = qemu_rdma_reg_control(rdma, idx);
3356 if (ret) {
3357 error_report("rdma: error registering %d control", idx);
3358 goto err_rdma_dest_wait;
3359 }
3360 }
3361
3362
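    /* Accept the second connection request for the return path */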
3363 if (migrate_postcopy() && !rdma->is_return_path) {
3364 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
3365 NULL,
3366 (void *)(intptr_t)rdma->return_path);
3367 } else {
3368 qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
3369 NULL, rdma);
3370 }
3371
3372 ret = rdma_accept(rdma->cm_id, &conn_param);
3373 if (ret) {
3374 error_report("rdma_accept returns %d", ret);
3375 goto err_rdma_dest_wait;
3376 }
3377
3378 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3379 if (ret) {
3380 error_report("rdma_accept get_cm_event failed %d", ret);
3381 goto err_rdma_dest_wait;
3382 }
3383
3384 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_report("rdma_accept: did not receive ESTABLISHED event");
3386 rdma_ack_cm_event(cm_event);
3387 goto err_rdma_dest_wait;
3388 }
3389
3390 rdma_ack_cm_event(cm_event);
3391 rdma->connected = true;
3392
3393 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
3394 if (ret) {
3395 error_report("rdma migration: error posting second control recv");
3396 goto err_rdma_dest_wait;
3397 }
3398
3399 qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3400
3401 return 0;
3402
3403err_rdma_dest_wait:
3404 rdma->error_state = ret;
3405 qemu_rdma_cleanup(rdma);
3406 return ret;
3407}
3408
3409static int dest_ram_sort_func(const void *a, const void *b)
3410{
3411 unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3412 unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3413
3414 return (a_index < b_index) ? -1 : (a_index != b_index);
3415}
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
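/*
 * During each iteration of the migration, we listen for instructions
 * by the source VM to perform dynamic page registrations before they
 * can perform RDMA operations.
 *
 * We respond with the 'rkey'.
 *
 * Keep doing this until the source tells us to stop.
 */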
3426static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
3427{
3428 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3429 .type = RDMA_CONTROL_REGISTER_RESULT,
3430 .repeat = 0,
3431 };
3432 RDMAControlHeader unreg_resp = { .len = 0,
3433 .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3434 .repeat = 0,
3435 };
3436 RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3437 .repeat = 1 };
3438 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3439 RDMAContext *rdma;
3440 RDMALocalBlocks *local;
3441 RDMAControlHeader head;
3442 RDMARegister *reg, *registers;
3443 RDMACompress *comp;
3444 RDMARegisterResult *reg_result;
3445 static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3446 RDMALocalBlock *block;
3447 void *host_addr;
3448 int ret = 0;
3449 int idx = 0;
3450 int count = 0;
3451 int i = 0;
3452
3453 RCU_READ_LOCK_GUARD();
3454 rdma = qatomic_rcu_read(&rioc->rdmain);
3455
3456 if (!rdma) {
3457 return -EIO;
3458 }
3459
3460 CHECK_ERROR_STATE();
3461
3462 local = &rdma->local_ram_blocks;
3463 do {
3464 trace_qemu_rdma_registration_handle_wait();
3465
3466 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
3467
3468 if (ret < 0) {
3469 break;
3470 }
3471
3472 if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            error_report("rdma: Too many requests in this message (%d). "
                         "Bailing.", head.repeat);
3475 ret = -EIO;
3476 break;
3477 }
3478
3479 switch (head.type) {
3480 case RDMA_CONTROL_COMPRESS:
3481 comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3482 network_to_compress(comp);
3483
3484 trace_qemu_rdma_registration_handle_compress(comp->length,
3485 comp->block_idx,
3486 comp->offset);
3487 if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3488 error_report("rdma: 'compress' bad block index %u (vs %d)",
3489 (unsigned int)comp->block_idx,
3490 rdma->local_ram_blocks.nb_blocks);
3491 ret = -EIO;
3492 goto out;
3493 }
3494 block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3495
3496 host_addr = block->local_host_addr +
3497 (comp->offset - block->offset);
3498
3499 ram_handle_compressed(host_addr, comp->value, comp->length);
3500 break;
3501
3502 case RDMA_CONTROL_REGISTER_FINISHED:
3503 trace_qemu_rdma_registration_handle_finished();
3504 goto out;
3505
3506 case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3507 trace_qemu_rdma_registration_handle_ram_blocks();
3508
3509
3510
3511
3512
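            /*
             * Sort our local RAM Block list so it's the same as the source,
             * we can do this since we've filled in a src_index in the list
             * as we received the RAMBlock list earlier.
             */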
3513 qsort(rdma->local_ram_blocks.block,
3514 rdma->local_ram_blocks.nb_blocks,
3515 sizeof(RDMALocalBlock), dest_ram_sort_func);
3516 for (i = 0; i < local->nb_blocks; i++) {
3517 local->block[i].index = i;
3518 }
3519
3520 if (rdma->pin_all) {
3521 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
3522 if (ret) {
3523 error_report("rdma migration: error dest "
3524 "registering ram blocks");
3525 goto out;
3526 }
3527 }
3528
3529
3530
3531
3532
3533
3534
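            /*
             * Dest uses this to prepare to transmit the RAMBlock descriptions
             * to the source VM after connection setup.
             * Both sides use the "remote" structure to communicate and update
             * their "local" descriptions with what was sent.
             */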
3535 for (i = 0; i < local->nb_blocks; i++) {
3536 rdma->dest_blocks[i].remote_host_addr =
3537 (uintptr_t)(local->block[i].local_host_addr);
3538
3539 if (rdma->pin_all) {
3540 rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3541 }
3542
3543 rdma->dest_blocks[i].offset = local->block[i].offset;
3544 rdma->dest_blocks[i].length = local->block[i].length;
3545
3546 dest_block_to_network(&rdma->dest_blocks[i]);
3547 trace_qemu_rdma_registration_handle_ram_blocks_loop(
3548 local->block[i].block_name,
3549 local->block[i].offset,
3550 local->block[i].length,
3551 local->block[i].local_host_addr,
3552 local->block[i].src_index);
3553 }
3554
3555 blocks.len = rdma->local_ram_blocks.nb_blocks
3556 * sizeof(RDMADestBlock);
3557
3558
3559 ret = qemu_rdma_post_send_control(rdma,
3560 (uint8_t *) rdma->dest_blocks, &blocks);
3561
3562 if (ret < 0) {
3563 error_report("rdma migration: error sending remote info");
3564 goto out;
3565 }
3566
3567 break;
3568 case RDMA_CONTROL_REGISTER_REQUEST:
3569 trace_qemu_rdma_registration_handle_register(head.repeat);
3570
3571 reg_resp.repeat = head.repeat;
3572 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3573
3574 for (count = 0; count < head.repeat; count++) {
3575 uint64_t chunk;
3576 uint8_t *chunk_start, *chunk_end;
3577
                reg = &registers[count];
3579 network_to_register(reg);
3580
3581 reg_result = &results[count];
3582
3583 trace_qemu_rdma_registration_handle_register_loop(count,
3584 reg->current_index, reg->key.current_addr, reg->chunks);
3585
3586 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3587 error_report("rdma: 'register' bad block index %u (vs %d)",
3588 (unsigned int)reg->current_index,
3589 rdma->local_ram_blocks.nb_blocks);
3590 ret = -ENOENT;
3591 goto out;
3592 }
3593 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3594 if (block->is_ram_block) {
3595 if (block->offset > reg->key.current_addr) {
3596 error_report("rdma: bad register address for block %s"
3597 " offset: %" PRIx64 " current_addr: %" PRIx64,
3598 block->block_name, block->offset,
3599 reg->key.current_addr);
3600 ret = -ERANGE;
3601 goto out;
3602 }
3603 host_addr = (block->local_host_addr +
3604 (reg->key.current_addr - block->offset));
3605 chunk = ram_chunk_index(block->local_host_addr,
3606 (uint8_t *) host_addr);
3607 } else {
3608 chunk = reg->key.chunk;
3609 host_addr = block->local_host_addr +
3610 (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3611
3612 if (host_addr < (void *)block->local_host_addr) {
3613 error_report("rdma: bad chunk for block %s"
3614 " chunk: %" PRIx64,
3615 block->block_name, reg->key.chunk);
3616 ret = -ERANGE;
3617 goto out;
3618 }
3619 }
3620 chunk_start = ram_chunk_start(block, chunk);
3621 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3622
3623 uint32_t tmp_rkey = 0;
3624 if (qemu_rdma_register_and_get_keys(rdma, block,
3625 (uintptr_t)host_addr, NULL, &tmp_rkey,
3626 chunk, chunk_start, chunk_end)) {
3627 error_report("cannot get rkey");
3628 ret = -EINVAL;
3629 goto out;
3630 }
3631 reg_result->rkey = tmp_rkey;
3632
3633 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3634
3635 trace_qemu_rdma_registration_handle_register_rkey(
3636 reg_result->rkey);
3637
3638 result_to_network(reg_result);
3639 }
3640
3641 ret = qemu_rdma_post_send_control(rdma,
                    (uint8_t *) results, &reg_resp);
3643
3644 if (ret < 0) {
3645 error_report("Failed to send control buffer");
3646 goto out;
3647 }
3648 break;
3649 case RDMA_CONTROL_UNREGISTER_REQUEST:
3650 trace_qemu_rdma_registration_handle_unregister(head.repeat);
3651 unreg_resp.repeat = head.repeat;
3652 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3653
3654 for (count = 0; count < head.repeat; count++) {
                reg = &registers[count];
3656 network_to_register(reg);
3657
3658 trace_qemu_rdma_registration_handle_unregister_loop(count,
3659 reg->current_index, reg->key.chunk);
3660
3661 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3662
3663 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3664 block->pmr[reg->key.chunk] = NULL;
3665
3666 if (ret != 0) {
3667 perror("rdma unregistration chunk failed");
3668 ret = -ret;
3669 goto out;
3670 }
3671
3672 rdma->total_registrations--;
3673
3674 trace_qemu_rdma_registration_handle_unregister_success(
3675 reg->key.chunk);
3676 }
3677
3678 ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
3679
3680 if (ret < 0) {
3681 error_report("Failed to send control buffer");
3682 goto out;
3683 }
3684 break;
3685 case RDMA_CONTROL_REGISTER_RESULT:
3686 error_report("Invalid RESULT message at dest.");
3687 ret = -EIO;
3688 goto out;
3689 default:
3690 error_report("Unknown control message %s", control_desc(head.type));
3691 ret = -EIO;
3692 goto out;
3693 }
3694 } while (1);
3695out:
3696 if (ret < 0) {
3697 rdma->error_state = ret;
3698 }
3699 return ret;
3700}
3701
3702
3703
3704
3705
3706
3707
3708
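/* Destination:
 * Called via a ram_control_load_hook during the initial RAM load section which
 * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks
 * on the source.
 * We've already built our local RAMBlock list, but not yet sorted it,
 * so we can simply search for the name.
 */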
3709static int
3710rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
3711{
3712 RDMAContext *rdma;
3713 int curr;
3714 int found = -1;
3715
3716 RCU_READ_LOCK_GUARD();
3717 rdma = qatomic_rcu_read(&rioc->rdmain);
3718
3719 if (!rdma) {
3720 return -EIO;
3721 }
3722
3723
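    /* Find the matching RAMBlock in our local list */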
3724 for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3725 if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3726 found = curr;
3727 break;
3728 }
3729 }
3730
3731 if (found == -1) {
3732 error_report("RAMBlock '%s' not found on destination", name);
3733 return -ENOENT;
3734 }
3735
3736 rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3737 trace_rdma_block_notification_handle(name, rdma->next_src_index);
3738 rdma->next_src_index++;
3739
3740 return 0;
3741}
3742
3743static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
3744{
3745 switch (flags) {
3746 case RAM_CONTROL_BLOCK_REG:
3747 return rdma_block_notification_handle(opaque, data);
3748
3749 case RAM_CONTROL_HOOK:
3750 return qemu_rdma_registration_handle(f, opaque);
3751
3752 default:
3753
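        /* Shouldn't happen */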
3754 abort();
3755 }
3756}
3757
3758static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
3759 uint64_t flags, void *data)
3760{
3761 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3762 RDMAContext *rdma;
3763
3764 RCU_READ_LOCK_GUARD();
3765 rdma = qatomic_rcu_read(&rioc->rdmaout);
3766 if (!rdma) {
3767 return -EIO;
3768 }
3769
3770 CHECK_ERROR_STATE();
3771
3772 if (migration_in_postcopy()) {
3773 return 0;
3774 }
3775
3776 trace_qemu_rdma_registration_start(flags);
3777 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3778 qemu_fflush(f);
3779
3780 return 0;
3781}
3782
3783
3784
3785
3786
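/*
 * Inform dest that dynamic registrations are done for now.
 * First, flush writes, if any.
 */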
3787static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
3788 uint64_t flags, void *data)
3789{
3790 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3791 RDMAContext *rdma;
3792 RDMAControlHeader head = { .len = 0, .repeat = 1 };
3793 int ret = 0;
3794
3795 RCU_READ_LOCK_GUARD();
3796 rdma = qatomic_rcu_read(&rioc->rdmaout);
3797 if (!rdma) {
3798 return -EIO;
3799 }
3800
3801 CHECK_ERROR_STATE();
3802
3803 if (migration_in_postcopy()) {
3804 return 0;
3805 }
3806
3807 qemu_fflush(f);
3808 ret = qemu_rdma_drain_cq(f, rdma);
3809
3810 if (ret < 0) {
3811 goto err;
3812 }
3813
3814 if (flags == RAM_CONTROL_SETUP) {
3815 RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3816 RDMALocalBlocks *local = &rdma->local_ram_blocks;
3817 int reg_result_idx, i, nb_dest_blocks;
3818
3819 head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3820 trace_qemu_rdma_registration_stop_ram();
3821
3822
3823
3824
3825
3826
3827
3828
3829
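        /*
         * Make sure that we parallelize the pinning on both sides.
         * For very large guests, doing this asynchronously on both
         * sides at the same time saves significant setup time.
         */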
3830 ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                                      &reg_result_idx, rdma->pin_all ?
3832 qemu_rdma_reg_whole_ram_blocks : NULL);
3833 if (ret < 0) {
            error_report("receiving remote info!");
3835 return ret;
3836 }
3837
3838 nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
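        /*
         * The protocol uses two different sets of rkeys (mutually exclusive):
         * 1. One rkey to represent the virtual address of the entire ram
         *    block. (dynamic chunk registration disabled - pin everything
         *    with one rkey.)
         * 2. One rkey per chunk within a ram block.
         *    (dynamic chunk registration enabled - pin individual chunks.)
         *
         * Once the capability is successfully negotiated, the destination
         * transmits the keys to use (or sends them later), including the
         * virtual addresses, and then propagates the remote ram block
         * descriptions to its local copy.
         */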
3852 if (local->nb_blocks != nb_dest_blocks) {
            error_report("ram blocks mismatch (Number of blocks %d vs %d). "
                         "Your QEMU command line parameters are probably "
                         "not identical on both the source and destination.",
                         local->nb_blocks, nb_dest_blocks);
3857 rdma->error_state = -EINVAL;
3858 return -EINVAL;
3859 }
3860
3861 qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3862 memcpy(rdma->dest_blocks,
3863 rdma->wr_data[reg_result_idx].control_curr, resp.len);
3864 for (i = 0; i < nb_dest_blocks; i++) {
3865 network_to_dest_block(&rdma->dest_blocks[i]);
3866
3867
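            /* We require that the blocks are in the same order */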
3868 if (rdma->dest_blocks[i].length != local->block[i].length) {
                error_report("Block %s/%d has a different length %" PRIu64
                             " vs %" PRIu64, local->block[i].block_name, i,
                             local->block[i].length,
                             rdma->dest_blocks[i].length);
3873 rdma->error_state = -EINVAL;
3874 return -EINVAL;
3875 }
3876 local->block[i].remote_host_addr =
3877 rdma->dest_blocks[i].remote_host_addr;
3878 local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3879 }
3880 }
3881
3882 trace_qemu_rdma_registration_stop(flags);
3883
3884 head.type = RDMA_CONTROL_REGISTER_FINISHED;
3885 ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3886
3887 if (ret < 0) {
3888 goto err;
3889 }
3890
3891 return 0;
3892err:
3893 rdma->error_state = ret;
3894 return ret;
3895}
3896
3897static const QEMUFileHooks rdma_read_hooks = {
3898 .hook_ram_load = rdma_load_hook,
3899};
3900
3901static const QEMUFileHooks rdma_write_hooks = {
3902 .before_ram_iterate = qemu_rdma_registration_start,
3903 .after_ram_iterate = qemu_rdma_registration_stop,
3904 .save_page = qemu_rdma_save_page,
3905};
3906
3907
3908static void qio_channel_rdma_finalize(Object *obj)
3909{
3910 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
3911 if (rioc->rdmain) {
3912 qemu_rdma_cleanup(rioc->rdmain);
3913 g_free(rioc->rdmain);
3914 rioc->rdmain = NULL;
3915 }
3916 if (rioc->rdmaout) {
3917 qemu_rdma_cleanup(rioc->rdmaout);
3918 g_free(rioc->rdmaout);
3919 rioc->rdmaout = NULL;
3920 }
3921}
3922
3923static void qio_channel_rdma_class_init(ObjectClass *klass,
3924 void *class_data G_GNUC_UNUSED)
3925{
3926 QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
3927
3928 ioc_klass->io_writev = qio_channel_rdma_writev;
3929 ioc_klass->io_readv = qio_channel_rdma_readv;
3930 ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
3931 ioc_klass->io_close = qio_channel_rdma_close;
3932 ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
3933 ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
3934 ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
3935}
3936
3937static const TypeInfo qio_channel_rdma_info = {
3938 .parent = TYPE_QIO_CHANNEL,
3939 .name = TYPE_QIO_CHANNEL_RDMA,
3940 .instance_size = sizeof(QIOChannelRDMA),
3941 .instance_finalize = qio_channel_rdma_finalize,
3942 .class_init = qio_channel_rdma_class_init,
3943};
3944
3945static void qio_channel_rdma_register_types(void)
3946{
3947 type_register_static(&qio_channel_rdma_info);
3948}
3949
3950type_init(qio_channel_rdma_register_types);
3951
3952static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
3953{
3954 QIOChannelRDMA *rioc;
3955
3956 if (qemu_file_mode_is_not_valid(mode)) {
3957 return NULL;
3958 }
3959
3960 rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
3961
3962 if (mode[0] == 'w') {
3963 rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc));
3964 rioc->rdmaout = rdma;
3965 rioc->rdmain = rdma->return_path;
3966 qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
3967 } else {
3968 rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc));
3969 rioc->rdmain = rdma;
3970 rioc->rdmaout = rdma->return_path;
3971 qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
3972 }
3973
3974 return rioc->file;
3975}
3976
3977static void rdma_accept_incoming_migration(void *opaque)
3978{
3979 RDMAContext *rdma = opaque;
3980 int ret;
3981 QEMUFile *f;
3982 Error *local_err = NULL;
3983
3984 trace_qemu_rdma_accept_incoming_migration();
3985 ret = qemu_rdma_accept(rdma);
3986
3987 if (ret) {
3988 fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
3989 return;
3990 }
3991
3992 trace_qemu_rdma_accept_incoming_migration_accepted();
3993
3994 if (rdma->is_return_path) {
3995 return;
3996 }
3997
3998 f = qemu_fopen_rdma(rdma, "rb");
3999 if (f == NULL) {
4000 fprintf(stderr, "RDMA ERROR: could not qemu_fopen_rdma\n");
4001 qemu_rdma_cleanup(rdma);
4002 return;
4003 }
4004
4005 rdma->migration_started_on_destination = 1;
4006 migration_fd_process_incoming(f, &local_err);
4007 if (local_err) {
4008 error_reportf_err(local_err, "RDMA ERROR:");
4009 }
4010}
4011
4012void rdma_start_incoming_migration(const char *host_port, Error **errp)
4013{
4014 int ret;
4015 RDMAContext *rdma, *rdma_return_path = NULL;
4016 Error *local_err = NULL;
4017
4018 trace_rdma_start_incoming_migration();
4019
4020
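    /* Avoid ram_block_discard_disable(), cannot change during migration. */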
4021 if (ram_block_discard_is_required()) {
4022 error_setg(errp, "RDMA: cannot disable RAM discard");
4023 return;
4024 }
4025
4026 rdma = qemu_rdma_data_init(host_port, &local_err);
4027 if (rdma == NULL) {
4028 goto err;
4029 }
4030
4031 ret = qemu_rdma_dest_init(rdma, &local_err);
4032
4033 if (ret) {
4034 goto err;
4035 }
4036
4037 trace_rdma_start_incoming_migration_after_dest_init();
4038
4039 ret = rdma_listen(rdma->listen_id, 5);
4040
4041 if (ret) {
4042 ERROR(errp, "listening on socket!");
4043 goto err;
4044 }
4045
4046 trace_rdma_start_incoming_migration_after_rdma_listen();
4047
4048
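    /* initialize the RDMAContext for the return path */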
4049 if (migrate_postcopy()) {
4050 rdma_return_path = qemu_rdma_data_init(host_port, &local_err);
4051
4052 if (rdma_return_path == NULL) {
4053 goto err;
4054 }
4055
4056 qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
4057 }
4058
4059 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
4060 NULL, (void *)(intptr_t)rdma);
4061 return;
4062err:
4063 error_propagate(errp, local_err);
4064 if (rdma) {
4065 g_free(rdma->host);
4066 }
4067 g_free(rdma);
4068 g_free(rdma_return_path);
4069}
4070
4071void rdma_start_outgoing_migration(void *opaque,
4072 const char *host_port, Error **errp)
4073{
4074 MigrationState *s = opaque;
4075 RDMAContext *rdma_return_path = NULL;
4076 RDMAContext *rdma;
4077 int ret = 0;
4078
4079
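    /* Avoid ram_block_discard_disable(), cannot change during migration. */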
4080 if (ram_block_discard_is_required()) {
4081 error_setg(errp, "RDMA: cannot disable RAM discard");
4082 return;
4083 }
4084
4085 rdma = qemu_rdma_data_init(host_port, errp);
4086 if (rdma == NULL) {
4087 goto err;
4088 }
4089
4090 ret = qemu_rdma_source_init(rdma,
4091 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4092
4093 if (ret) {
4094 goto err;
4095 }
4096
4097 trace_rdma_start_outgoing_migration_after_rdma_source_init();
4098 ret = qemu_rdma_connect(rdma, errp);
4099
4100 if (ret) {
4101 goto err;
4102 }
4103
4104
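    /* RDMA postcopy needs a separate queue pair for the return path */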
4105 if (migrate_postcopy()) {
4106 rdma_return_path = qemu_rdma_data_init(host_port, errp);
4107
4108 if (rdma_return_path == NULL) {
4109 goto return_path_err;
4110 }
4111
4112 ret = qemu_rdma_source_init(rdma_return_path,
4113 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4114
4115 if (ret) {
4116 goto return_path_err;
4117 }
4118
4119 ret = qemu_rdma_connect(rdma_return_path, errp);
4120
4121 if (ret) {
4122 goto return_path_err;
4123 }
4124
4125 rdma->return_path = rdma_return_path;
4126 rdma_return_path->return_path = rdma;
4127 rdma_return_path->is_return_path = true;
4128 }
4129
4130 trace_rdma_start_outgoing_migration_after_rdma_connect();
4131
4132 s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
4133 migrate_fd_connect(s, NULL);
4134 return;
4135return_path_err:
4136 qemu_rdma_cleanup(rdma);
4137err:
4138 g_free(rdma);
4139 g_free(rdma_return_path);
4140}
4141