1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include "qemu/osdep.h"
18#include "qapi/error.h"
19#include "qemu/cutils.h"
20#include "rdma.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "ram.h"
24#include "qemu-file-channel.h"
25#include "qemu/error-report.h"
26#include "qemu/main-loop.h"
27#include "qemu/module.h"
28#include "qemu/sockets.h"
29#include "qemu/bitmap.h"
30#include "qemu/coroutine.h"
31#include <sys/socket.h>
32#include <netdb.h>
33#include <arpa/inet.h>
34#include <rdma/rdma_cma.h>
35#include "trace.h"
36
37
38
39
/*
 * Report an RDMA failure.
 *
 * Always prints "RDMA ERROR: <message>" on stderr.  In addition, when
 * @errp is non-NULL and does not already hold an error, the same message
 * is stored in *@errp through error_setg().
 *
 * @fmt must be a string literal (it is pasted onto the prefix at compile
 * time).  @errp is evaluated more than once, so it must be free of side
 * effects.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if ((errp) && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)
47
/* Timeout, in milliseconds, handed to rdma_resolve_addr()/rdma_resolve_route(). */
#define RDMA_RESOLVE_TIMEOUT_MS 10000

/*
 * RDMA_MERGE_MAX is 2 MiB.  RDMA_SIGNALED_SEND_MAX (512) is derived from it
 * assuming 4 KiB units and sizes the send queue, the completion queue
 * (three times this value) and the pending-unregistration ring.
 */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

/* log2 of the registration chunk size: RAM blocks are split into 1 MiB chunks. */
#define RDMA_REG_CHUNK_SHIFT 20

/* NOTE(review): not referenced in the visible part of the file. */
#define RDMA_SEND_INCREMENT 32768

/* Size in bytes of each control buffer in RDMAWorkRequestData. */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
/* NOTE(review): presumably bounds RDMAControlHeader.repeat - confirm at the users. */
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1

/* Capability flag bits carried in RDMACapabilities.flags. */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/* Bitmask of every capability flag this implementation knows about. */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
81
/*
 * Bail out of the calling function if the connection is already broken.
 *
 * Requires a local variable named `rdma` (RDMAContext *) in scope.  When
 * rdma->error_state is non-zero the macro reports the condition once
 * (tracked through rdma->error_reported), calls rcu_read_unlock() and
 * makes the enclosing function return rdma->error_state.  Because of the
 * unconditional rcu_read_unlock(), it is only suitable for callers that
 * hold the RCU read lock at the point of use.
 */
#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                error_report("RDMA is in an error state waiting migration" \
                             " to abort!"); \
                rdma->error_reported = 1; \
            } \
            rcu_read_unlock(); \
            return rdma->error_state; \
        } \
    } while (0)
94
95
96
97
98
99
100
101
102
103
104
105
106
/*
 * Layout of a 64-bit work request id:
 *
 *   bits  0..15  request type (RDMA_WRID_*)
 *   bits 16..29  index of the RAM block
 *   bits 30..63  index of the chunk inside that block
 *
 * The masks are built from 64-bit constants (ULL) so that the chunk field
 * is not truncated on hosts where `unsigned long` is only 32 bits wide.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1ULL << RDMA_WRID_BLOCK_SHIFT) - 1ULL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1ULL << RDMA_WRID_CHUNK_SHIFT) - 1ULL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
118
119
120
121
122
123
/*
 * Work request types, stored in the low (type) bits of a work request id.
 * Ids at or above RDMA_WRID_RECV_CONTROL are treated as control receives
 * (see print_wrid() and qemu_rdma_poll()).
 */
enum {
    RDMA_WRID_NONE         = 0,
    RDMA_WRID_RDMA_WRITE   = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

/*
 * Human readable names for the types above.  The table is sparse: it has
 * RDMA_WRID_RECV_CONTROL + 1 slots and every slot without an initializer
 * is NULL.
 */
static const char *wrid_desc[] = {
    [RDMA_WRID_NONE]         = "NONE",
    [RDMA_WRID_RDMA_WRITE]   = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};
137
138
139
140
141
142
143
144
/*
 * Indices into RDMAContext.wr_data[]; RDMA_WRID_MAX is the number of
 * slots, not a slot itself.
 */
enum {
    RDMA_WRID_READY   = 0,
    RDMA_WRID_DATA    = 1,
    RDMA_WRID_CONTROL = 2,
    RDMA_WRID_MAX     = 3,
};
151
152
153
154
/*
 * Control message types carried in RDMAControlHeader.type.  The values are
 * spelled out because they travel over the wire; control_desc() maps them
 * to printable names.
 */
enum {
    RDMA_CONTROL_NONE                = 0,
    RDMA_CONTROL_ERROR               = 1,
    RDMA_CONTROL_READY               = 2,
    RDMA_CONTROL_QEMU_FILE           = 3,
    RDMA_CONTROL_RAM_BLOCKS_REQUEST  = 4,
    RDMA_CONTROL_RAM_BLOCKS_RESULT   = 5,
    RDMA_CONTROL_COMPRESS            = 6,
    RDMA_CONTROL_REGISTER_REQUEST    = 7,
    RDMA_CONTROL_REGISTER_RESULT     = 8,
    RDMA_CONTROL_REGISTER_FINISHED   = 9,
    RDMA_CONTROL_UNREGISTER_REQUEST  = 10,
    RDMA_CONTROL_UNREGISTER_FINISHED = 11,
};
169
170
171
172
173
174
175typedef struct {
176 uint8_t control[RDMA_CONTROL_MAX_BUFFER];
177 struct ibv_mr *control_mr;
178 size_t control_len;
179 uint8_t *control_curr;
180} RDMAWorkRequestData;
181
182
183
184
/*
 * Capability record; caps_to_network()/network_to_caps() convert both
 * fields between host and network byte order.
 */
typedef struct {
    uint32_t version;   /* protocol version, cf. RDMA_CONTROL_VERSION_CURRENT */
    uint32_t flags;     /* RDMA_CAPABILITY_* bits */
} RDMACapabilities;
189
190static void caps_to_network(RDMACapabilities *cap)
191{
192 cap->version = htonl(cap->version);
193 cap->flags = htonl(cap->flags);
194}
195
196static void network_to_caps(RDMACapabilities *cap)
197{
198 cap->version = ntohl(cap->version);
199 cap->flags = ntohl(cap->flags);
200}
201
202
203
204
205
206
207
208
/*
 * Local bookkeeping for one memory block taking part in the migration.
 * Instances live in the RDMALocalBlocks.block[] array and are created by
 * rdma_add_block() and destroyed by rdma_delete_block().
 */
typedef struct RDMALocalBlock {
    char *block_name;               /* g_strdup()ed copy of the block id */
    uint8_t *local_host_addr;       /* local virtual address of the block */
    uint64_t remote_host_addr;      /* address of the block on the peer */
    uint64_t offset;                /* block offset; also the blockmap key */
    uint64_t length;                /* size of the block in bytes */
    struct ibv_mr **pmr;            /* per-chunk registrations (nb_chunks
                                       entries), allocated on first use */
    struct ibv_mr *mr;              /* registration of the whole block;
                                       takes precedence over pmr when set */
    uint32_t *remote_keys;          /* one key per chunk (nb_chunks entries) */
    uint32_t remote_rkey;           /* NOTE(review): presumably the peer's key
                                       for a whole-block registration */
    int index;                      /* position in RDMALocalBlocks.block[] */
    unsigned int src_index;         /* ~0U until assigned (see rdma_add_block) */
    bool is_ram_block;              /* true when added before
                                       RDMALocalBlocks.init was set */
    int nb_chunks;                  /* number of 1 << RDMA_REG_CHUNK_SHIFT
                                       byte chunks tracked for this block */
    unsigned long *transit_bitmap;    /* bit per chunk; cleared when an RDMA
                                         write for the chunk completes */
    unsigned long *unregister_bitmap; /* bit per chunk; set while the chunk
                                         sits in the unregistration queue */
} RDMALocalBlock;
226
227
228
229
230
231
232
233
234typedef struct QEMU_PACKED RDMADestBlock {
235 uint64_t remote_host_addr;
236 uint64_t offset;
237 uint64_t length;
238 uint32_t remote_rkey;
239 uint32_t padding;
240} RDMADestBlock;
241
242static const char *control_desc(unsigned int rdma_control)
243{
244 static const char *strs[] = {
245 [RDMA_CONTROL_NONE] = "NONE",
246 [RDMA_CONTROL_ERROR] = "ERROR",
247 [RDMA_CONTROL_READY] = "READY",
248 [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
249 [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
250 [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
251 [RDMA_CONTROL_COMPRESS] = "COMPRESS",
252 [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
253 [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
254 [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
255 [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
256 [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
257 };
258
259 if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
260 return "??BAD CONTROL VALUE??";
261 }
262
263 return strs[rdma_control];
264}
265
/*
 * Convert a 64-bit value from host to network (big-endian) byte order.
 * The most significant 32 bits are emitted first, each half converted
 * with htonl().
 */
static uint64_t htonll(uint64_t v)
{
    uint32_t halves[2];
    uint64_t out;

    halves[0] = htonl(v >> 32);
    halves[1] = htonl(v & 0xFFFFFFFFULL);
    memcpy(&out, halves, sizeof(out));

    return out;
}
273
/*
 * Convert a 64-bit value from network (big-endian) to host byte order;
 * the inverse of htonll().
 */
static uint64_t ntohll(uint64_t v)
{
    uint32_t halves[2];
    uint64_t high, low;

    memcpy(halves, &v, sizeof(halves));
    high = ntohl(halves[0]);
    low = ntohl(halves[1]);

    return (high << 32) | low;
}
279
280static void dest_block_to_network(RDMADestBlock *db)
281{
282 db->remote_host_addr = htonll(db->remote_host_addr);
283 db->offset = htonll(db->offset);
284 db->length = htonll(db->length);
285 db->remote_rkey = htonl(db->remote_rkey);
286}
287
288static void network_to_dest_block(RDMADestBlock *db)
289{
290 db->remote_host_addr = ntohll(db->remote_host_addr);
291 db->offset = ntohll(db->offset);
292 db->length = ntohll(db->length);
293 db->remote_rkey = ntohl(db->remote_rkey);
294}
295
296
297
298
299
300
301typedef struct RDMALocalBlocks {
302 int nb_blocks;
303 bool init;
304 RDMALocalBlock *block;
305} RDMALocalBlocks;
306
307
308
309
310
311
312
313typedef struct RDMAContext {
314 char *host;
315 int port;
316
317 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
318
319
320
321
322
323
324
325
326 int control_ready_expected;
327
328
329 int nb_sent;
330
331
332
333 uint64_t current_addr;
334 uint64_t current_length;
335
336 int current_index;
337
338 int current_chunk;
339
340 bool pin_all;
341
342
343
344
345
346
347
348
349 struct rdma_cm_id *cm_id;
350 struct rdma_cm_id *listen_id;
351 bool connected;
352
353 struct ibv_context *verbs;
354 struct rdma_event_channel *channel;
355 struct ibv_qp *qp;
356 struct ibv_comp_channel *comp_channel;
357 struct ibv_pd *pd;
358 struct ibv_cq *cq;
359
360
361
362
363
364
365 int error_state;
366 int error_reported;
367 int received_error;
368
369
370
371
372 RDMALocalBlocks local_ram_blocks;
373 RDMADestBlock *dest_blocks;
374
375
376 unsigned int next_src_index;
377
378
379
380
381
382
383 int migration_started_on_destination;
384
385 int total_registrations;
386 int total_writes;
387
388 int unregister_current, unregister_next;
389 uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
390
391 GHashTable *blockmap;
392
393
394 struct RDMAContext *return_path;
395 bool is_return_path;
396} RDMAContext;
397
398#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
399#define QIO_CHANNEL_RDMA(obj) \
400 OBJECT_CHECK(QIOChannelRDMA, (obj), TYPE_QIO_CHANNEL_RDMA)
401
402typedef struct QIOChannelRDMA QIOChannelRDMA;
403
404
405struct QIOChannelRDMA {
406 QIOChannel parent;
407 RDMAContext *rdmain;
408 RDMAContext *rdmaout;
409 QEMUFile *file;
410 bool blocking;
411};
412
413
414
415
416
/*
 * Header placed in front of every control message.  Packed; the field
 * order is part of the exchanged format.
 */
typedef struct QEMU_PACKED {
    uint32_t len;       /* size of one payload record, e.g. sizeof(RDMARegister) */
    uint32_t type;      /* RDMA_CONTROL_* message type */
    uint32_t repeat;    /* number of records that follow the header */
    uint32_t padding;   /* never byte-swapped */
} RDMAControlHeader;
423
424static void control_to_network(RDMAControlHeader *control)
425{
426 control->type = htonl(control->type);
427 control->len = htonl(control->len);
428 control->repeat = htonl(control->repeat);
429}
430
431static void network_to_control(RDMAControlHeader *control)
432{
433 control->type = ntohl(control->type);
434 control->len = ntohl(control->len);
435 control->repeat = ntohl(control->repeat);
436}
437
438
439
440
441
442
443
444typedef struct QEMU_PACKED {
445 union QEMU_PACKED {
446 uint64_t current_addr;
447 uint64_t chunk;
448 } key;
449 uint32_t current_index;
450 uint32_t padding;
451 uint64_t chunks;
452} RDMARegister;
453
454static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
455{
456 RDMALocalBlock *local_block;
457 local_block = &rdma->local_ram_blocks.block[reg->current_index];
458
459 if (local_block->is_ram_block) {
460
461
462
463
464 reg->key.current_addr -= local_block->offset;
465 reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
466 }
467 reg->key.current_addr = htonll(reg->key.current_addr);
468 reg->current_index = htonl(reg->current_index);
469 reg->chunks = htonll(reg->chunks);
470}
471
472static void network_to_register(RDMARegister *reg)
473{
474 reg->key.current_addr = ntohll(reg->key.current_addr);
475 reg->current_index = ntohl(reg->current_index);
476 reg->chunks = ntohll(reg->chunks);
477}
478
/*
 * Payload of a RDMA_CONTROL_COMPRESS message.  Packed; the field order is
 * part of the exchanged format.
 */
typedef struct QEMU_PACKED {
    uint32_t value;     /* NOTE(review): presumably the fill value - confirm */
    uint32_t block_idx; /* index of the RAM block concerned */
    uint64_t offset;    /* start of the range (rebased for the peer on send) */
    uint64_t length;    /* size of the range in bytes */
} RDMACompress;
485
486static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
487{
488 comp->value = htonl(comp->value);
489
490
491
492
493 comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
494 comp->offset += rdma->dest_blocks[comp->block_idx].offset;
495 comp->block_idx = htonl(comp->block_idx);
496 comp->offset = htonll(comp->offset);
497 comp->length = htonll(comp->length);
498}
499
500static void network_to_compress(RDMACompress *comp)
501{
502 comp->value = ntohl(comp->value);
503 comp->block_idx = ntohl(comp->block_idx);
504 comp->offset = ntohll(comp->offset);
505 comp->length = ntohll(comp->length);
506}
507
508
509
510
511
512
/*
 * Reply payload for a registration request.  Packed; the field order is
 * part of the exchanged format.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;      /* remote key of the registered region */
    uint32_t padding;   /* never byte-swapped */
    uint64_t host_addr; /* address of the region on the replying side */
} RDMARegisterResult;
518
519static void result_to_network(RDMARegisterResult *result)
520{
521 result->rkey = htonl(result->rkey);
522 result->host_addr = htonll(result->host_addr);
523};
524
525static void network_to_result(RDMARegisterResult *result)
526{
527 result->rkey = ntohl(result->rkey);
528 result->host_addr = ntohll(result->host_addr);
529};
530
531const char *print_wrid(int wrid);
532static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
533 uint8_t *data, RDMAControlHeader *resp,
534 int *resp_idx,
535 int (*callback)(RDMAContext *rdma));
536
537static inline uint64_t ram_chunk_index(const uint8_t *start,
538 const uint8_t *host)
539{
540 return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
541}
542
543static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
544 uint64_t i)
545{
546 return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
547 (i << RDMA_REG_CHUNK_SHIFT));
548}
549
550static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
551 uint64_t i)
552{
553 uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
554 (1UL << RDMA_REG_CHUNK_SHIFT);
555
556 if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
557 result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
558 }
559
560 return result;
561}
562
563static int rdma_add_block(RDMAContext *rdma, const char *block_name,
564 void *host_addr,
565 ram_addr_t block_offset, uint64_t length)
566{
567 RDMALocalBlocks *local = &rdma->local_ram_blocks;
568 RDMALocalBlock *block;
569 RDMALocalBlock *old = local->block;
570
571 local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
572
573 if (local->nb_blocks) {
574 int x;
575
576 if (rdma->blockmap) {
577 for (x = 0; x < local->nb_blocks; x++) {
578 g_hash_table_remove(rdma->blockmap,
579 (void *)(uintptr_t)old[x].offset);
580 g_hash_table_insert(rdma->blockmap,
581 (void *)(uintptr_t)old[x].offset,
582 &local->block[x]);
583 }
584 }
585 memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
586 g_free(old);
587 }
588
589 block = &local->block[local->nb_blocks];
590
591 block->block_name = g_strdup(block_name);
592 block->local_host_addr = host_addr;
593 block->offset = block_offset;
594 block->length = length;
595 block->index = local->nb_blocks;
596 block->src_index = ~0U;
597 block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
598 block->transit_bitmap = bitmap_new(block->nb_chunks);
599 bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
600 block->unregister_bitmap = bitmap_new(block->nb_chunks);
601 bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
602 block->remote_keys = g_new0(uint32_t, block->nb_chunks);
603
604 block->is_ram_block = local->init ? false : true;
605
606 if (rdma->blockmap) {
607 g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
608 }
609
610 trace_rdma_add_block(block_name, local->nb_blocks,
611 (uintptr_t) block->local_host_addr,
612 block->offset, block->length,
613 (uintptr_t) (block->local_host_addr + block->length),
614 BITS_TO_LONGS(block->nb_chunks) *
615 sizeof(unsigned long) * 8,
616 block->nb_chunks);
617
618 local->nb_blocks++;
619
620 return 0;
621}
622
623
624
625
626
627
628static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
629{
630 const char *block_name = qemu_ram_get_idstr(rb);
631 void *host_addr = qemu_ram_get_host_addr(rb);
632 ram_addr_t block_offset = qemu_ram_get_offset(rb);
633 ram_addr_t length = qemu_ram_get_used_length(rb);
634 return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
635}
636
637
638
639
640
641
642static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
643{
644 RDMALocalBlocks *local = &rdma->local_ram_blocks;
645 int ret;
646
647 assert(rdma->blockmap == NULL);
648 memset(local, 0, sizeof *local);
649 ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
650 if (ret) {
651 return ret;
652 }
653 trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
654 rdma->dest_blocks = g_new0(RDMADestBlock,
655 rdma->local_ram_blocks.nb_blocks);
656 local->init = true;
657 return 0;
658}
659
660
661
662
663
664static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
665{
666 RDMALocalBlocks *local = &rdma->local_ram_blocks;
667 RDMALocalBlock *old = local->block;
668 int x;
669
670 if (rdma->blockmap) {
671 g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
672 }
673 if (block->pmr) {
674 int j;
675
676 for (j = 0; j < block->nb_chunks; j++) {
677 if (!block->pmr[j]) {
678 continue;
679 }
680 ibv_dereg_mr(block->pmr[j]);
681 rdma->total_registrations--;
682 }
683 g_free(block->pmr);
684 block->pmr = NULL;
685 }
686
687 if (block->mr) {
688 ibv_dereg_mr(block->mr);
689 rdma->total_registrations--;
690 block->mr = NULL;
691 }
692
693 g_free(block->transit_bitmap);
694 block->transit_bitmap = NULL;
695
696 g_free(block->unregister_bitmap);
697 block->unregister_bitmap = NULL;
698
699 g_free(block->remote_keys);
700 block->remote_keys = NULL;
701
702 g_free(block->block_name);
703 block->block_name = NULL;
704
705 if (rdma->blockmap) {
706 for (x = 0; x < local->nb_blocks; x++) {
707 g_hash_table_remove(rdma->blockmap,
708 (void *)(uintptr_t)old[x].offset);
709 }
710 }
711
712 if (local->nb_blocks > 1) {
713
714 local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
715
716 if (block->index) {
717 memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
718 }
719
720 if (block->index < (local->nb_blocks - 1)) {
721 memcpy(local->block + block->index, old + (block->index + 1),
722 sizeof(RDMALocalBlock) *
723 (local->nb_blocks - (block->index + 1)));
724 for (x = block->index; x < local->nb_blocks - 1; x++) {
725 local->block[x].index--;
726 }
727 }
728 } else {
729 assert(block == local->block);
730 local->block = NULL;
731 }
732
733 trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
734 block->offset, block->length,
735 (uintptr_t)(block->local_host_addr + block->length),
736 BITS_TO_LONGS(block->nb_chunks) *
737 sizeof(unsigned long) * 8, block->nb_chunks);
738
739 g_free(old);
740
741 local->nb_blocks--;
742
743 if (local->nb_blocks && rdma->blockmap) {
744 for (x = 0; x < local->nb_blocks; x++) {
745 g_hash_table_insert(rdma->blockmap,
746 (void *)(uintptr_t)local->block[x].offset,
747 &local->block[x]);
748 }
749 }
750
751 return 0;
752}
753
754
755
756
757
758static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
759{
760 struct ibv_port_attr port;
761
762 if (ibv_query_port(verbs, 1, &port)) {
763 error_report("Failed to query port information");
764 return;
765 }
766
767 printf("%s RDMA Device opened: kernel name %s "
768 "uverbs device name %s, "
769 "infiniband_verbs class device path %s, "
770 "infiniband class device path %s, "
771 "transport: (%d) %s\n",
772 who,
773 verbs->device->name,
774 verbs->device->dev_name,
775 verbs->device->dev_path,
776 verbs->device->ibdev_path,
777 port.link_layer,
778 (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
779 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
780 ? "Ethernet" : "Unknown"));
781}
782
783
784
785
786
787
788static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
789{
790 char sgid[33];
791 char dgid[33];
792 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
793 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
794 trace_qemu_rdma_dump_gid(who, sgid, dgid);
795}
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
841{
842
843#ifdef CONFIG_LINUX
844 struct ibv_port_attr port_attr;
845
846
847
848
849
850
851
852
853
854
855 if (!verbs) {
856 int num_devices, x;
857 struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
858 bool roce_found = false;
859 bool ib_found = false;
860
861 for (x = 0; x < num_devices; x++) {
862 verbs = ibv_open_device(dev_list[x]);
863 if (!verbs) {
864 if (errno == EPERM) {
865 continue;
866 } else {
867 return -EINVAL;
868 }
869 }
870
871 if (ibv_query_port(verbs, 1, &port_attr)) {
872 ibv_close_device(verbs);
873 ERROR(errp, "Could not query initial IB port");
874 return -EINVAL;
875 }
876
877 if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
878 ib_found = true;
879 } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
880 roce_found = true;
881 }
882
883 ibv_close_device(verbs);
884
885 }
886
887 if (roce_found) {
888 if (ib_found) {
889 fprintf(stderr, "WARN: migrations may fail:"
890 " IPv6 over RoCE / iWARP in linux"
891 " is broken. But since you appear to have a"
892 " mixed RoCE / IB environment, be sure to only"
893 " migrate over the IB fabric until the kernel "
894 " fixes the bug.\n");
895 } else {
896 ERROR(errp, "You only have RoCE / iWARP devices in your systems"
897 " and your management software has specified '[::]'"
898 ", but IPv6 over RoCE / iWARP is not supported in Linux.");
899 return -ENONET;
900 }
901 }
902
903 return 0;
904 }
905
906
907
908
909
910
911
912
913 if (ibv_query_port(verbs, 1, &port_attr)) {
914 ERROR(errp, "Could not query initial IB port");
915 return -EINVAL;
916 }
917
918 if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
919 ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
920 "(but patches on linux-rdma in progress)");
921 return -ENONET;
922 }
923
924#endif
925
926 return 0;
927}
928
929
930
931
932
933
934static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
935{
936 int ret;
937 struct rdma_addrinfo *res;
938 char port_str[16];
939 struct rdma_cm_event *cm_event;
940 char ip[40] = "unknown";
941 struct rdma_addrinfo *e;
942
943 if (rdma->host == NULL || !strcmp(rdma->host, "")) {
944 ERROR(errp, "RDMA hostname has not been set");
945 return -EINVAL;
946 }
947
948
949 rdma->channel = rdma_create_event_channel();
950 if (!rdma->channel) {
951 ERROR(errp, "could not create CM channel");
952 return -EINVAL;
953 }
954
955
956 ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
957 if (ret) {
958 ERROR(errp, "could not create channel id");
959 goto err_resolve_create_id;
960 }
961
962 snprintf(port_str, 16, "%d", rdma->port);
963 port_str[15] = '\0';
964
965 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
966 if (ret < 0) {
967 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
968 goto err_resolve_get_addr;
969 }
970
971 for (e = res; e != NULL; e = e->ai_next) {
972 inet_ntop(e->ai_family,
973 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
974 trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
975
976 ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
977 RDMA_RESOLVE_TIMEOUT_MS);
978 if (!ret) {
979 if (e->ai_family == AF_INET6) {
980 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
981 if (ret) {
982 continue;
983 }
984 }
985 goto route;
986 }
987 }
988
989 ERROR(errp, "could not resolve address %s", rdma->host);
990 goto err_resolve_get_addr;
991
992route:
993 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
994
995 ret = rdma_get_cm_event(rdma->channel, &cm_event);
996 if (ret) {
997 ERROR(errp, "could not perform event_addr_resolved");
998 goto err_resolve_get_addr;
999 }
1000
1001 if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
1002 ERROR(errp, "result not equal to event_addr_resolved %s",
1003 rdma_event_str(cm_event->event));
1004 perror("rdma_resolve_addr");
1005 rdma_ack_cm_event(cm_event);
1006 ret = -EINVAL;
1007 goto err_resolve_get_addr;
1008 }
1009 rdma_ack_cm_event(cm_event);
1010
1011
1012 ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
1013 if (ret) {
1014 ERROR(errp, "could not resolve rdma route");
1015 goto err_resolve_get_addr;
1016 }
1017
1018 ret = rdma_get_cm_event(rdma->channel, &cm_event);
1019 if (ret) {
1020 ERROR(errp, "could not perform event_route_resolved");
1021 goto err_resolve_get_addr;
1022 }
1023 if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
1024 ERROR(errp, "result not equal to event_route_resolved: %s",
1025 rdma_event_str(cm_event->event));
1026 rdma_ack_cm_event(cm_event);
1027 ret = -EINVAL;
1028 goto err_resolve_get_addr;
1029 }
1030 rdma_ack_cm_event(cm_event);
1031 rdma->verbs = rdma->cm_id->verbs;
1032 qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1033 qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1034 return 0;
1035
1036err_resolve_get_addr:
1037 rdma_destroy_id(rdma->cm_id);
1038 rdma->cm_id = NULL;
1039err_resolve_create_id:
1040 rdma_destroy_event_channel(rdma->channel);
1041 rdma->channel = NULL;
1042 return ret;
1043}
1044
1045
1046
1047
1048static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
1049{
1050
1051 rdma->pd = ibv_alloc_pd(rdma->verbs);
1052 if (!rdma->pd) {
1053 error_report("failed to allocate protection domain");
1054 return -1;
1055 }
1056
1057
1058 rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
1059 if (!rdma->comp_channel) {
1060 error_report("failed to allocate completion channel");
1061 goto err_alloc_pd_cq;
1062 }
1063
1064
1065
1066
1067
1068 rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1069 NULL, rdma->comp_channel, 0);
1070 if (!rdma->cq) {
1071 error_report("failed to allocate completion queue");
1072 goto err_alloc_pd_cq;
1073 }
1074
1075 return 0;
1076
1077err_alloc_pd_cq:
1078 if (rdma->pd) {
1079 ibv_dealloc_pd(rdma->pd);
1080 }
1081 if (rdma->comp_channel) {
1082 ibv_destroy_comp_channel(rdma->comp_channel);
1083 }
1084 rdma->pd = NULL;
1085 rdma->comp_channel = NULL;
1086 return -1;
1087
1088}
1089
1090
1091
1092
1093static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1094{
1095 struct ibv_qp_init_attr attr = { 0 };
1096 int ret;
1097
1098 attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1099 attr.cap.max_recv_wr = 3;
1100 attr.cap.max_send_sge = 1;
1101 attr.cap.max_recv_sge = 1;
1102 attr.send_cq = rdma->cq;
1103 attr.recv_cq = rdma->cq;
1104 attr.qp_type = IBV_QPT_RC;
1105
1106 ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1107 if (ret) {
1108 return -1;
1109 }
1110
1111 rdma->qp = rdma->cm_id->qp;
1112 return 0;
1113}
1114
1115static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
1116{
1117 int i;
1118 RDMALocalBlocks *local = &rdma->local_ram_blocks;
1119
1120 for (i = 0; i < local->nb_blocks; i++) {
1121 local->block[i].mr =
1122 ibv_reg_mr(rdma->pd,
1123 local->block[i].local_host_addr,
1124 local->block[i].length,
1125 IBV_ACCESS_LOCAL_WRITE |
1126 IBV_ACCESS_REMOTE_WRITE
1127 );
1128 if (!local->block[i].mr) {
1129 perror("Failed to register local dest ram block!\n");
1130 break;
1131 }
1132 rdma->total_registrations++;
1133 }
1134
1135 if (i >= local->nb_blocks) {
1136 return 0;
1137 }
1138
1139 for (i--; i >= 0; i--) {
1140 ibv_dereg_mr(local->block[i].mr);
1141 rdma->total_registrations--;
1142 }
1143
1144 return -1;
1145
1146}
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157static int qemu_rdma_search_ram_block(RDMAContext *rdma,
1158 uintptr_t block_offset,
1159 uint64_t offset,
1160 uint64_t length,
1161 uint64_t *block_index,
1162 uint64_t *chunk_index)
1163{
1164 uint64_t current_addr = block_offset + offset;
1165 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1166 (void *) block_offset);
1167 assert(block);
1168 assert(current_addr >= block->offset);
1169 assert((current_addr + length) <= (block->offset + block->length));
1170
1171 *block_index = block->index;
1172 *chunk_index = ram_chunk_index(block->local_host_addr,
1173 block->local_host_addr + (current_addr - block->offset));
1174
1175 return 0;
1176}
1177
1178
1179
1180
1181
1182
1183
1184
1185static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
1186 RDMALocalBlock *block, uintptr_t host_addr,
1187 uint32_t *lkey, uint32_t *rkey, int chunk,
1188 uint8_t *chunk_start, uint8_t *chunk_end)
1189{
1190 if (block->mr) {
1191 if (lkey) {
1192 *lkey = block->mr->lkey;
1193 }
1194 if (rkey) {
1195 *rkey = block->mr->rkey;
1196 }
1197 return 0;
1198 }
1199
1200
1201 if (!block->pmr) {
1202 block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1203 }
1204
1205
1206
1207
1208
1209
1210 if (!block->pmr[chunk]) {
1211 uint64_t len = chunk_end - chunk_start;
1212
1213 trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1214
1215 block->pmr[chunk] = ibv_reg_mr(rdma->pd,
1216 chunk_start, len,
1217 (rkey ? (IBV_ACCESS_LOCAL_WRITE |
1218 IBV_ACCESS_REMOTE_WRITE) : 0));
1219
1220 if (!block->pmr[chunk]) {
1221 perror("Failed to register chunk!");
1222 fprintf(stderr, "Chunk details: block: %d chunk index %d"
1223 " start %" PRIuPTR " end %" PRIuPTR
1224 " host %" PRIuPTR
1225 " local %" PRIuPTR " registrations: %d\n",
1226 block->index, chunk, (uintptr_t)chunk_start,
1227 (uintptr_t)chunk_end, host_addr,
1228 (uintptr_t)block->local_host_addr,
1229 rdma->total_registrations);
1230 return -1;
1231 }
1232 rdma->total_registrations++;
1233 }
1234
1235 if (lkey) {
1236 *lkey = block->pmr[chunk]->lkey;
1237 }
1238 if (rkey) {
1239 *rkey = block->pmr[chunk]->rkey;
1240 }
1241 return 0;
1242}
1243
1244
1245
1246
1247
1248static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1249{
1250 rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1251 rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1252 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1253 if (rdma->wr_data[idx].control_mr) {
1254 rdma->total_registrations++;
1255 return 0;
1256 }
1257 error_report("qemu_rdma_reg_control failed");
1258 return -1;
1259}
1260
1261const char *print_wrid(int wrid)
1262{
1263 if (wrid >= RDMA_WRID_RECV_CONTROL) {
1264 return wrid_desc[RDMA_WRID_RECV_CONTROL];
1265 }
1266 return wrid_desc[wrid];
1267}
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1305{
1306 while (rdma->unregistrations[rdma->unregister_current]) {
1307 int ret;
1308 uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1309 uint64_t chunk =
1310 (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1311 uint64_t index =
1312 (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1313 RDMALocalBlock *block =
1314 &(rdma->local_ram_blocks.block[index]);
1315 RDMARegister reg = { .current_index = index };
1316 RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1317 };
1318 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1319 .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1320 .repeat = 1,
1321 };
1322
1323 trace_qemu_rdma_unregister_waiting_proc(chunk,
1324 rdma->unregister_current);
1325
1326 rdma->unregistrations[rdma->unregister_current] = 0;
1327 rdma->unregister_current++;
1328
1329 if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1330 rdma->unregister_current = 0;
1331 }
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341 clear_bit(chunk, block->unregister_bitmap);
1342
1343 if (test_bit(chunk, block->transit_bitmap)) {
1344 trace_qemu_rdma_unregister_waiting_inflight(chunk);
1345 continue;
1346 }
1347
1348 trace_qemu_rdma_unregister_waiting_send(chunk);
1349
1350 ret = ibv_dereg_mr(block->pmr[chunk]);
1351 block->pmr[chunk] = NULL;
1352 block->remote_keys[chunk] = 0;
1353
1354 if (ret != 0) {
1355 perror("unregistration chunk failed");
1356 return -ret;
1357 }
1358 rdma->total_registrations--;
1359
1360 reg.key.chunk = chunk;
1361 register_to_network(rdma, ®);
1362 ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®,
1363 &resp, NULL, NULL);
1364 if (ret < 0) {
1365 return ret;
1366 }
1367
1368 trace_qemu_rdma_unregister_waiting_complete(chunk);
1369 }
1370
1371 return 0;
1372}
1373
1374static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1375 uint64_t chunk)
1376{
1377 uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1378
1379 result |= (index << RDMA_WRID_BLOCK_SHIFT);
1380 result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1381
1382 return result;
1383}
1384
1385
1386
1387
1388
1389static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1390 uint64_t chunk, uint64_t wr_id)
1391{
1392 if (rdma->unregistrations[rdma->unregister_next] != 0) {
1393 error_report("rdma migration: queue is full");
1394 } else {
1395 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1396
1397 if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1398 trace_qemu_rdma_signal_unregister_append(chunk,
1399 rdma->unregister_next);
1400
1401 rdma->unregistrations[rdma->unregister_next++] =
1402 qemu_rdma_make_wrid(wr_id, index, chunk);
1403
1404 if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1405 rdma->unregister_next = 0;
1406 }
1407 } else {
1408 trace_qemu_rdma_signal_unregister_already(chunk);
1409 }
1410 }
1411}
1412
1413
1414
1415
1416
1417
1418static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
1419 uint32_t *byte_len)
1420{
1421 int ret;
1422 struct ibv_wc wc;
1423 uint64_t wr_id;
1424
1425 ret = ibv_poll_cq(rdma->cq, 1, &wc);
1426
1427 if (!ret) {
1428 *wr_id_out = RDMA_WRID_NONE;
1429 return 0;
1430 }
1431
1432 if (ret < 0) {
1433 error_report("ibv_poll_cq return %d", ret);
1434 return ret;
1435 }
1436
1437 wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1438
1439 if (wc.status != IBV_WC_SUCCESS) {
1440 fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1441 wc.status, ibv_wc_status_str(wc.status));
1442 fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1443
1444 return -1;
1445 }
1446
1447 if (rdma->control_ready_expected &&
1448 (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1449 trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
1450 wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
1451 rdma->control_ready_expected = 0;
1452 }
1453
1454 if (wr_id == RDMA_WRID_RDMA_WRITE) {
1455 uint64_t chunk =
1456 (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1457 uint64_t index =
1458 (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1459 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1460
1461 trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
1462 index, chunk, block->local_host_addr,
1463 (void *)(uintptr_t)block->remote_host_addr);
1464
1465 clear_bit(chunk, block->transit_bitmap);
1466
1467 if (rdma->nb_sent > 0) {
1468 rdma->nb_sent--;
1469 }
1470
1471 if (!rdma->pin_all) {
1472
1473
1474
1475
1476
1477
1478#ifdef RDMA_UNREGISTRATION_EXAMPLE
1479 qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
1480#endif
1481 }
1482 } else {
1483 trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
1484 }
1485
1486 *wr_id_out = wc.wr_id;
1487 if (byte_len) {
1488 *byte_len = wc.byte_len;
1489 }
1490
1491 return 0;
1492}
1493
1494
1495
1496
1497static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
1498{
1499 struct rdma_cm_event *cm_event;
1500 int ret = -1;
1501
1502
1503
1504
1505
1506 if (rdma->migration_started_on_destination &&
1507 migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
1508 yield_until_fd_readable(rdma->comp_channel->fd);
1509 } else {
1510
1511
1512
1513
1514
1515
1516
1517 while (!rdma->error_state && !rdma->received_error) {
1518 GPollFD pfds[2];
1519 pfds[0].fd = rdma->comp_channel->fd;
1520 pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1521 pfds[0].revents = 0;
1522
1523 pfds[1].fd = rdma->channel->fd;
1524 pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1525 pfds[1].revents = 0;
1526
1527
1528 switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
1529 case 2:
1530 case 1:
1531 if (pfds[0].revents) {
1532 return 0;
1533 }
1534
1535 if (pfds[1].revents) {
1536 ret = rdma_get_cm_event(rdma->channel, &cm_event);
1537 if (!ret) {
1538 rdma_ack_cm_event(cm_event);
1539 }
1540
1541 error_report("receive cm event while wait comp channel,"
1542 "cm event is %d", cm_event->event);
1543 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
1544 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
1545 return -EPIPE;
1546 }
1547 }
1548 break;
1549
1550 case 0:
1551 break;
1552
1553 default:
1554
1555
1556 error_report("%s: poll failed", __func__);
1557 return -EPIPE;
1558 }
1559
1560 if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
1561
1562 return -EPIPE;
1563 }
1564 }
1565 }
1566
1567 if (rdma->received_error) {
1568 return -EPIPE;
1569 }
1570 return rdma->error_state;
1571}
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
1587 uint32_t *byte_len)
1588{
1589 int num_cq_events = 0, ret = 0;
1590 struct ibv_cq *cq;
1591 void *cq_ctx;
1592 uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1593
1594 if (ibv_req_notify_cq(rdma->cq, 0)) {
1595 return -1;
1596 }
1597
1598 while (wr_id != wrid_requested) {
1599 ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1600 if (ret < 0) {
1601 return ret;
1602 }
1603
1604 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1605
1606 if (wr_id == RDMA_WRID_NONE) {
1607 break;
1608 }
1609 if (wr_id != wrid_requested) {
1610 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1611 wrid_requested, print_wrid(wr_id), wr_id);
1612 }
1613 }
1614
1615 if (wr_id == wrid_requested) {
1616 return 0;
1617 }
1618
1619 while (1) {
1620 ret = qemu_rdma_wait_comp_channel(rdma);
1621 if (ret) {
1622 goto err_block_for_wrid;
1623 }
1624
1625 ret = ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx);
1626 if (ret) {
1627 perror("ibv_get_cq_event");
1628 goto err_block_for_wrid;
1629 }
1630
1631 num_cq_events++;
1632
1633 ret = -ibv_req_notify_cq(cq, 0);
1634 if (ret) {
1635 goto err_block_for_wrid;
1636 }
1637
1638 while (wr_id != wrid_requested) {
1639 ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1640 if (ret < 0) {
1641 goto err_block_for_wrid;
1642 }
1643
1644 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1645
1646 if (wr_id == RDMA_WRID_NONE) {
1647 break;
1648 }
1649 if (wr_id != wrid_requested) {
1650 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1651 wrid_requested, print_wrid(wr_id), wr_id);
1652 }
1653 }
1654
1655 if (wr_id == wrid_requested) {
1656 goto success_block_for_wrid;
1657 }
1658 }
1659
1660success_block_for_wrid:
1661 if (num_cq_events) {
1662 ibv_ack_cq_events(cq, num_cq_events);
1663 }
1664 return 0;
1665
1666err_block_for_wrid:
1667 if (num_cq_events) {
1668 ibv_ack_cq_events(cq, num_cq_events);
1669 }
1670
1671 rdma->error_state = ret;
1672 return ret;
1673}
1674
1675
1676
1677
1678
1679static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1680 RDMAControlHeader *head)
1681{
1682 int ret = 0;
1683 RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1684 struct ibv_send_wr *bad_wr;
1685 struct ibv_sge sge = {
1686 .addr = (uintptr_t)(wr->control),
1687 .length = head->len + sizeof(RDMAControlHeader),
1688 .lkey = wr->control_mr->lkey,
1689 };
1690 struct ibv_send_wr send_wr = {
1691 .wr_id = RDMA_WRID_SEND_CONTROL,
1692 .opcode = IBV_WR_SEND,
1693 .send_flags = IBV_SEND_SIGNALED,
1694 .sg_list = &sge,
1695 .num_sge = 1,
1696 };
1697
1698 trace_qemu_rdma_post_send_control(control_desc(head->type));
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708 assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1709 memcpy(wr->control, head, sizeof(RDMAControlHeader));
1710 control_to_network((void *) wr->control);
1711
1712 if (buf) {
1713 memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1714 }
1715
1716
1717 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1718
1719 if (ret > 0) {
1720 error_report("Failed to use post IB SEND for control");
1721 return -ret;
1722 }
1723
1724 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1725 if (ret < 0) {
1726 error_report("rdma migration: send polling control error");
1727 }
1728
1729 return ret;
1730}
1731
1732
1733
1734
1735
1736static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1737{
1738 struct ibv_recv_wr *bad_wr;
1739 struct ibv_sge sge = {
1740 .addr = (uintptr_t)(rdma->wr_data[idx].control),
1741 .length = RDMA_CONTROL_MAX_BUFFER,
1742 .lkey = rdma->wr_data[idx].control_mr->lkey,
1743 };
1744
1745 struct ibv_recv_wr recv_wr = {
1746 .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1747 .sg_list = &sge,
1748 .num_sge = 1,
1749 };
1750
1751
1752 if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1753 return -1;
1754 }
1755
1756 return 0;
1757}
1758
1759
1760
1761
1762static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1763 RDMAControlHeader *head, int expecting, int idx)
1764{
1765 uint32_t byte_len;
1766 int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1767 &byte_len);
1768
1769 if (ret < 0) {
1770 error_report("rdma migration: recv polling control error!");
1771 return ret;
1772 }
1773
1774 network_to_control((void *) rdma->wr_data[idx].control);
1775 memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1776
1777 trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1778
1779 if (expecting == RDMA_CONTROL_NONE) {
1780 trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1781 head->type);
1782 } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1783 error_report("Was expecting a %s (%d) control message"
1784 ", but got: %s (%d), length: %d",
1785 control_desc(expecting), expecting,
1786 control_desc(head->type), head->type, head->len);
1787 if (head->type == RDMA_CONTROL_ERROR) {
1788 rdma->received_error = true;
1789 }
1790 return -EIO;
1791 }
1792 if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
1793 error_report("too long length: %d", head->len);
1794 return -EINVAL;
1795 }
1796 if (sizeof(*head) + head->len != byte_len) {
1797 error_report("Malformed length: %d byte_len %d", head->len, byte_len);
1798 return -EINVAL;
1799 }
1800
1801 return 0;
1802}
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1813 RDMAControlHeader *head)
1814{
1815 rdma->wr_data[idx].control_len = head->len;
1816 rdma->wr_data[idx].control_curr =
1817 rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1818}
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1834 uint8_t *data, RDMAControlHeader *resp,
1835 int *resp_idx,
1836 int (*callback)(RDMAContext *rdma))
1837{
1838 int ret = 0;
1839
1840
1841
1842
1843
1844 if (rdma->control_ready_expected) {
1845 RDMAControlHeader resp;
1846 ret = qemu_rdma_exchange_get_response(rdma,
1847 &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1848 if (ret < 0) {
1849 return ret;
1850 }
1851 }
1852
1853
1854
1855
1856 if (resp) {
1857 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1858 if (ret) {
1859 error_report("rdma migration: error posting"
1860 " extra control recv for anticipated result!");
1861 return ret;
1862 }
1863 }
1864
1865
1866
1867
1868 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1869 if (ret) {
1870 error_report("rdma migration: error posting first control recv!");
1871 return ret;
1872 }
1873
1874
1875
1876
1877 ret = qemu_rdma_post_send_control(rdma, data, head);
1878
1879 if (ret < 0) {
1880 error_report("Failed to send control buffer!");
1881 return ret;
1882 }
1883
1884
1885
1886
1887 if (resp) {
1888 if (callback) {
1889 trace_qemu_rdma_exchange_send_issue_callback();
1890 ret = callback(rdma);
1891 if (ret < 0) {
1892 return ret;
1893 }
1894 }
1895
1896 trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
1897 ret = qemu_rdma_exchange_get_response(rdma, resp,
1898 resp->type, RDMA_WRID_DATA);
1899
1900 if (ret < 0) {
1901 return ret;
1902 }
1903
1904 qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1905 if (resp_idx) {
1906 *resp_idx = RDMA_WRID_DATA;
1907 }
1908 trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
1909 }
1910
1911 rdma->control_ready_expected = 1;
1912
1913 return 0;
1914}
1915
1916
1917
1918
1919
1920static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1921 int expecting)
1922{
1923 RDMAControlHeader ready = {
1924 .len = 0,
1925 .type = RDMA_CONTROL_READY,
1926 .repeat = 1,
1927 };
1928 int ret;
1929
1930
1931
1932
1933 ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
1934
1935 if (ret < 0) {
1936 error_report("Failed to send control buffer!");
1937 return ret;
1938 }
1939
1940
1941
1942
1943 ret = qemu_rdma_exchange_get_response(rdma, head,
1944 expecting, RDMA_WRID_READY);
1945
1946 if (ret < 0) {
1947 return ret;
1948 }
1949
1950 qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1951
1952
1953
1954
1955 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1956 if (ret) {
1957 error_report("rdma migration: error posting second control recv!");
1958 return ret;
1959 }
1960
1961 return 0;
1962}
1963
1964
1965
1966
1967
1968
1969
1970static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
1971 int current_index, uint64_t current_addr,
1972 uint64_t length)
1973{
1974 struct ibv_sge sge;
1975 struct ibv_send_wr send_wr = { 0 };
1976 struct ibv_send_wr *bad_wr;
1977 int reg_result_idx, ret, count = 0;
1978 uint64_t chunk, chunks;
1979 uint8_t *chunk_start, *chunk_end;
1980 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
1981 RDMARegister reg;
1982 RDMARegisterResult *reg_result;
1983 RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
1984 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1985 .type = RDMA_CONTROL_REGISTER_REQUEST,
1986 .repeat = 1,
1987 };
1988
1989retry:
1990 sge.addr = (uintptr_t)(block->local_host_addr +
1991 (current_addr - block->offset));
1992 sge.length = length;
1993
1994 chunk = ram_chunk_index(block->local_host_addr,
1995 (uint8_t *)(uintptr_t)sge.addr);
1996 chunk_start = ram_chunk_start(block, chunk);
1997
1998 if (block->is_ram_block) {
1999 chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
2000
2001 if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2002 chunks--;
2003 }
2004 } else {
2005 chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
2006
2007 if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2008 chunks--;
2009 }
2010 }
2011
2012 trace_qemu_rdma_write_one_top(chunks + 1,
2013 (chunks + 1) *
2014 (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
2015
2016 chunk_end = ram_chunk_end(block, chunk + chunks);
2017
2018 if (!rdma->pin_all) {
2019#ifdef RDMA_UNREGISTRATION_EXAMPLE
2020 qemu_rdma_unregister_waiting(rdma);
2021#endif
2022 }
2023
2024 while (test_bit(chunk, block->transit_bitmap)) {
2025 (void)count;
2026 trace_qemu_rdma_write_one_block(count++, current_index, chunk,
2027 sge.addr, length, rdma->nb_sent, block->nb_chunks);
2028
2029 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2030
2031 if (ret < 0) {
2032 error_report("Failed to Wait for previous write to complete "
2033 "block %d chunk %" PRIu64
2034 " current %" PRIu64 " len %" PRIu64 " %d",
2035 current_index, chunk, sge.addr, length, rdma->nb_sent);
2036 return ret;
2037 }
2038 }
2039
2040 if (!rdma->pin_all || !block->is_ram_block) {
2041 if (!block->remote_keys[chunk]) {
2042
2043
2044
2045
2046
2047
2048 if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2049 RDMACompress comp = {
2050 .offset = current_addr,
2051 .value = 0,
2052 .block_idx = current_index,
2053 .length = length,
2054 };
2055
2056 head.len = sizeof(comp);
2057 head.type = RDMA_CONTROL_COMPRESS;
2058
2059 trace_qemu_rdma_write_one_zero(chunk, sge.length,
2060 current_index, current_addr);
2061
2062 compress_to_network(rdma, &comp);
2063 ret = qemu_rdma_exchange_send(rdma, &head,
2064 (uint8_t *) &comp, NULL, NULL, NULL);
2065
2066 if (ret < 0) {
2067 return -EIO;
2068 }
2069
2070 acct_update_position(f, sge.length, true);
2071
2072 return 1;
2073 }
2074
2075
2076
2077
2078 reg.current_index = current_index;
2079 if (block->is_ram_block) {
2080 reg.key.current_addr = current_addr;
2081 } else {
2082 reg.key.chunk = chunk;
2083 }
2084 reg.chunks = chunks;
2085
2086 trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2087 current_addr);
2088
2089 register_to_network(rdma, ®);
2090 ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®,
2091 &resp, ®_result_idx, NULL);
2092 if (ret < 0) {
2093 return ret;
2094 }
2095
2096
2097 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2098 &sge.lkey, NULL, chunk,
2099 chunk_start, chunk_end)) {
2100 error_report("cannot get lkey");
2101 return -EINVAL;
2102 }
2103
2104 reg_result = (RDMARegisterResult *)
2105 rdma->wr_data[reg_result_idx].control_curr;
2106
2107 network_to_result(reg_result);
2108
2109 trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2110 reg_result->rkey, chunk);
2111
2112 block->remote_keys[chunk] = reg_result->rkey;
2113 block->remote_host_addr = reg_result->host_addr;
2114 } else {
2115
2116 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2117 &sge.lkey, NULL, chunk,
2118 chunk_start, chunk_end)) {
2119 error_report("cannot get lkey!");
2120 return -EINVAL;
2121 }
2122 }
2123
2124 send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2125 } else {
2126 send_wr.wr.rdma.rkey = block->remote_rkey;
2127
2128 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2129 &sge.lkey, NULL, chunk,
2130 chunk_start, chunk_end)) {
2131 error_report("cannot get lkey!");
2132 return -EINVAL;
2133 }
2134 }
2135
2136
2137
2138
2139
2140
2141
2142 send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2143 current_index, chunk);
2144
2145 send_wr.opcode = IBV_WR_RDMA_WRITE;
2146 send_wr.send_flags = IBV_SEND_SIGNALED;
2147 send_wr.sg_list = &sge;
2148 send_wr.num_sge = 1;
2149 send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2150 (current_addr - block->offset);
2151
2152 trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2153 sge.length);
2154
2155
2156
2157
2158
2159 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2160
2161 if (ret == ENOMEM) {
2162 trace_qemu_rdma_write_one_queue_full();
2163 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2164 if (ret < 0) {
2165 error_report("rdma migration: failed to make "
2166 "room in full send queue! %d", ret);
2167 return ret;
2168 }
2169
2170 goto retry;
2171
2172 } else if (ret > 0) {
2173 perror("rdma migration: post rdma write failed");
2174 return -ret;
2175 }
2176
2177 set_bit(chunk, block->transit_bitmap);
2178 acct_update_position(f, sge.length, false);
2179 rdma->total_writes++;
2180
2181 return 0;
2182}
2183
2184
2185
2186
2187
2188
2189
2190static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2191{
2192 int ret;
2193
2194 if (!rdma->current_length) {
2195 return 0;
2196 }
2197
2198 ret = qemu_rdma_write_one(f, rdma,
2199 rdma->current_index, rdma->current_addr, rdma->current_length);
2200
2201 if (ret < 0) {
2202 return ret;
2203 }
2204
2205 if (ret == 0) {
2206 rdma->nb_sent++;
2207 trace_qemu_rdma_write_flush(rdma->nb_sent);
2208 }
2209
2210 rdma->current_length = 0;
2211 rdma->current_addr = 0;
2212
2213 return 0;
2214}
2215
2216static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2217 uint64_t offset, uint64_t len)
2218{
2219 RDMALocalBlock *block;
2220 uint8_t *host_addr;
2221 uint8_t *chunk_end;
2222
2223 if (rdma->current_index < 0) {
2224 return 0;
2225 }
2226
2227 if (rdma->current_chunk < 0) {
2228 return 0;
2229 }
2230
2231 block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2232 host_addr = block->local_host_addr + (offset - block->offset);
2233 chunk_end = ram_chunk_end(block, rdma->current_chunk);
2234
2235 if (rdma->current_length == 0) {
2236 return 0;
2237 }
2238
2239
2240
2241
2242 if (offset != (rdma->current_addr + rdma->current_length)) {
2243 return 0;
2244 }
2245
2246 if (offset < block->offset) {
2247 return 0;
2248 }
2249
2250 if ((offset + len) > (block->offset + block->length)) {
2251 return 0;
2252 }
2253
2254 if ((host_addr + len) > chunk_end) {
2255 return 0;
2256 }
2257
2258 return 1;
2259}
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
2272 uint64_t block_offset, uint64_t offset,
2273 uint64_t len)
2274{
2275 uint64_t current_addr = block_offset + offset;
2276 uint64_t index = rdma->current_index;
2277 uint64_t chunk = rdma->current_chunk;
2278 int ret;
2279
2280
2281 if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
2282 ret = qemu_rdma_write_flush(f, rdma);
2283 if (ret) {
2284 return ret;
2285 }
2286 rdma->current_length = 0;
2287 rdma->current_addr = current_addr;
2288
2289 ret = qemu_rdma_search_ram_block(rdma, block_offset,
2290 offset, len, &index, &chunk);
2291 if (ret) {
2292 error_report("ram block search failed");
2293 return ret;
2294 }
2295 rdma->current_index = index;
2296 rdma->current_chunk = chunk;
2297 }
2298
2299
2300 rdma->current_length += len;
2301
2302
2303 if (rdma->current_length >= RDMA_MERGE_MAX) {
2304 return qemu_rdma_write_flush(f, rdma);
2305 }
2306
2307 return 0;
2308}
2309
2310static void qemu_rdma_cleanup(RDMAContext *rdma)
2311{
2312 int idx;
2313
2314 if (rdma->cm_id && rdma->connected) {
2315 if ((rdma->error_state ||
2316 migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
2317 !rdma->received_error) {
2318 RDMAControlHeader head = { .len = 0,
2319 .type = RDMA_CONTROL_ERROR,
2320 .repeat = 1,
2321 };
2322 error_report("Early error. Sending error.");
2323 qemu_rdma_post_send_control(rdma, NULL, &head);
2324 }
2325
2326 rdma_disconnect(rdma->cm_id);
2327 trace_qemu_rdma_cleanup_disconnect();
2328 rdma->connected = false;
2329 }
2330
2331 if (rdma->channel) {
2332 qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2333 }
2334 g_free(rdma->dest_blocks);
2335 rdma->dest_blocks = NULL;
2336
2337 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2338 if (rdma->wr_data[idx].control_mr) {
2339 rdma->total_registrations--;
2340 ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2341 }
2342 rdma->wr_data[idx].control_mr = NULL;
2343 }
2344
2345 if (rdma->local_ram_blocks.block) {
2346 while (rdma->local_ram_blocks.nb_blocks) {
2347 rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2348 }
2349 }
2350
2351 if (rdma->qp) {
2352 rdma_destroy_qp(rdma->cm_id);
2353 rdma->qp = NULL;
2354 }
2355 if (rdma->cq) {
2356 ibv_destroy_cq(rdma->cq);
2357 rdma->cq = NULL;
2358 }
2359 if (rdma->comp_channel) {
2360 ibv_destroy_comp_channel(rdma->comp_channel);
2361 rdma->comp_channel = NULL;
2362 }
2363 if (rdma->pd) {
2364 ibv_dealloc_pd(rdma->pd);
2365 rdma->pd = NULL;
2366 }
2367 if (rdma->cm_id) {
2368 rdma_destroy_id(rdma->cm_id);
2369 rdma->cm_id = NULL;
2370 }
2371
2372
2373 if (rdma->listen_id) {
2374 if (!rdma->is_return_path) {
2375 rdma_destroy_id(rdma->listen_id);
2376 }
2377 rdma->listen_id = NULL;
2378
2379 if (rdma->channel) {
2380 if (!rdma->is_return_path) {
2381 rdma_destroy_event_channel(rdma->channel);
2382 }
2383 rdma->channel = NULL;
2384 }
2385 }
2386
2387 if (rdma->channel) {
2388 rdma_destroy_event_channel(rdma->channel);
2389 rdma->channel = NULL;
2390 }
2391 g_free(rdma->host);
2392 rdma->host = NULL;
2393}
2394
2395
2396static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2397{
2398 int ret, idx;
2399 Error *local_err = NULL, **temp = &local_err;
2400
2401
2402
2403
2404
2405 rdma->pin_all = pin_all;
2406
2407 ret = qemu_rdma_resolve_host(rdma, temp);
2408 if (ret) {
2409 goto err_rdma_source_init;
2410 }
2411
2412 ret = qemu_rdma_alloc_pd_cq(rdma);
2413 if (ret) {
2414 ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
2415 " limits may be too low. Please check $ ulimit -a # and "
2416 "search for 'ulimit -l' in the output");
2417 goto err_rdma_source_init;
2418 }
2419
2420 ret = qemu_rdma_alloc_qp(rdma);
2421 if (ret) {
2422 ERROR(temp, "rdma migration: error allocating qp!");
2423 goto err_rdma_source_init;
2424 }
2425
2426 ret = qemu_rdma_init_ram_blocks(rdma);
2427 if (ret) {
2428 ERROR(temp, "rdma migration: error initializing ram blocks!");
2429 goto err_rdma_source_init;
2430 }
2431
2432
2433 rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2434 for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
2435 g_hash_table_insert(rdma->blockmap,
2436 (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
2437 &rdma->local_ram_blocks.block[idx]);
2438 }
2439
2440 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2441 ret = qemu_rdma_reg_control(rdma, idx);
2442 if (ret) {
2443 ERROR(temp, "rdma migration: error registering %d control!",
2444 idx);
2445 goto err_rdma_source_init;
2446 }
2447 }
2448
2449 return 0;
2450
2451err_rdma_source_init:
2452 error_propagate(errp, local_err);
2453 qemu_rdma_cleanup(rdma);
2454 return -1;
2455}
2456
2457static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
2458{
2459 RDMACapabilities cap = {
2460 .version = RDMA_CONTROL_VERSION_CURRENT,
2461 .flags = 0,
2462 };
2463 struct rdma_conn_param conn_param = { .initiator_depth = 2,
2464 .retry_count = 5,
2465 .private_data = &cap,
2466 .private_data_len = sizeof(cap),
2467 };
2468 struct rdma_cm_event *cm_event;
2469 int ret;
2470
2471
2472
2473
2474
2475 if (rdma->pin_all) {
2476 trace_qemu_rdma_connect_pin_all_requested();
2477 cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2478 }
2479
2480 caps_to_network(&cap);
2481
2482 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2483 if (ret) {
2484 ERROR(errp, "posting second control recv");
2485 goto err_rdma_source_connect;
2486 }
2487
2488 ret = rdma_connect(rdma->cm_id, &conn_param);
2489 if (ret) {
2490 perror("rdma_connect");
2491 ERROR(errp, "connecting to destination!");
2492 goto err_rdma_source_connect;
2493 }
2494
2495 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2496 if (ret) {
2497 perror("rdma_get_cm_event after rdma_connect");
2498 ERROR(errp, "connecting to destination!");
2499 rdma_ack_cm_event(cm_event);
2500 goto err_rdma_source_connect;
2501 }
2502
2503 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2504 perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2505 ERROR(errp, "connecting to destination!");
2506 rdma_ack_cm_event(cm_event);
2507 goto err_rdma_source_connect;
2508 }
2509 rdma->connected = true;
2510
2511 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2512 network_to_caps(&cap);
2513
2514
2515
2516
2517
2518 if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2519 ERROR(errp, "Server cannot support pinning all memory. "
2520 "Will register memory dynamically.");
2521 rdma->pin_all = false;
2522 }
2523
2524 trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2525
2526 rdma_ack_cm_event(cm_event);
2527
2528 rdma->control_ready_expected = 1;
2529 rdma->nb_sent = 0;
2530 return 0;
2531
2532err_rdma_source_connect:
2533 qemu_rdma_cleanup(rdma);
2534 return -1;
2535}
2536
2537static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2538{
2539 int ret, idx;
2540 struct rdma_cm_id *listen_id;
2541 char ip[40] = "unknown";
2542 struct rdma_addrinfo *res, *e;
2543 char port_str[16];
2544
2545 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2546 rdma->wr_data[idx].control_len = 0;
2547 rdma->wr_data[idx].control_curr = NULL;
2548 }
2549
2550 if (!rdma->host || !rdma->host[0]) {
2551 ERROR(errp, "RDMA host is not set!");
2552 rdma->error_state = -EINVAL;
2553 return -1;
2554 }
2555
2556 rdma->channel = rdma_create_event_channel();
2557 if (!rdma->channel) {
2558 ERROR(errp, "could not create rdma event channel");
2559 rdma->error_state = -EINVAL;
2560 return -1;
2561 }
2562
2563
2564 ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2565 if (ret) {
2566 ERROR(errp, "could not create cm_id!");
2567 goto err_dest_init_create_listen_id;
2568 }
2569
2570 snprintf(port_str, 16, "%d", rdma->port);
2571 port_str[15] = '\0';
2572
2573 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2574 if (ret < 0) {
2575 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
2576 goto err_dest_init_bind_addr;
2577 }
2578
2579 for (e = res; e != NULL; e = e->ai_next) {
2580 inet_ntop(e->ai_family,
2581 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2582 trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2583 ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2584 if (ret) {
2585 continue;
2586 }
2587 if (e->ai_family == AF_INET6) {
2588 ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
2589 if (ret) {
2590 continue;
2591 }
2592 }
2593 break;
2594 }
2595
2596 if (!e) {
2597 ERROR(errp, "Error: could not rdma_bind_addr!");
2598 goto err_dest_init_bind_addr;
2599 }
2600
2601 rdma->listen_id = listen_id;
2602 qemu_rdma_dump_gid("dest_init", listen_id);
2603 return 0;
2604
2605err_dest_init_bind_addr:
2606 rdma_destroy_id(listen_id);
2607err_dest_init_create_listen_id:
2608 rdma_destroy_event_channel(rdma->channel);
2609 rdma->channel = NULL;
2610 rdma->error_state = ret;
2611 return ret;
2612
2613}
2614
2615static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
2616 RDMAContext *rdma)
2617{
2618 int idx;
2619
2620 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2621 rdma_return_path->wr_data[idx].control_len = 0;
2622 rdma_return_path->wr_data[idx].control_curr = NULL;
2623 }
2624
2625
2626 rdma_return_path->channel = rdma->channel;
2627 rdma_return_path->listen_id = rdma->listen_id;
2628
2629 rdma->return_path = rdma_return_path;
2630 rdma_return_path->return_path = rdma;
2631 rdma_return_path->is_return_path = true;
2632}
2633
2634static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2635{
2636 RDMAContext *rdma = NULL;
2637 InetSocketAddress *addr;
2638
2639 if (host_port) {
2640 rdma = g_new0(RDMAContext, 1);
2641 rdma->current_index = -1;
2642 rdma->current_chunk = -1;
2643
2644 addr = g_new(InetSocketAddress, 1);
2645 if (!inet_parse(addr, host_port, NULL)) {
2646 rdma->port = atoi(addr->port);
2647 rdma->host = g_strdup(addr->host);
2648 } else {
2649 ERROR(errp, "bad RDMA migration address '%s'", host_port);
2650 g_free(rdma);
2651 rdma = NULL;
2652 }
2653
2654 qapi_free_InetSocketAddress(addr);
2655 }
2656
2657 return rdma;
2658}
2659
2660
2661
2662
2663
2664
2665static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
2666 const struct iovec *iov,
2667 size_t niov,
2668 int *fds,
2669 size_t nfds,
2670 Error **errp)
2671{
2672 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2673 QEMUFile *f = rioc->file;
2674 RDMAContext *rdma;
2675 int ret;
2676 ssize_t done = 0;
2677 size_t i;
2678 size_t len = 0;
2679
2680 rcu_read_lock();
2681 rdma = atomic_rcu_read(&rioc->rdmaout);
2682
2683 if (!rdma) {
2684 rcu_read_unlock();
2685 return -EIO;
2686 }
2687
2688 CHECK_ERROR_STATE();
2689
2690
2691
2692
2693
2694 ret = qemu_rdma_write_flush(f, rdma);
2695 if (ret < 0) {
2696 rdma->error_state = ret;
2697 rcu_read_unlock();
2698 return ret;
2699 }
2700
2701 for (i = 0; i < niov; i++) {
2702 size_t remaining = iov[i].iov_len;
2703 uint8_t * data = (void *)iov[i].iov_base;
2704 while (remaining) {
2705 RDMAControlHeader head;
2706
2707 len = MIN(remaining, RDMA_SEND_INCREMENT);
2708 remaining -= len;
2709
2710 head.len = len;
2711 head.type = RDMA_CONTROL_QEMU_FILE;
2712
2713 ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2714
2715 if (ret < 0) {
2716 rdma->error_state = ret;
2717 rcu_read_unlock();
2718 return ret;
2719 }
2720
2721 data += len;
2722 done += len;
2723 }
2724 }
2725
2726 rcu_read_unlock();
2727 return done;
2728}
2729
2730static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2731 size_t size, int idx)
2732{
2733 size_t len = 0;
2734
2735 if (rdma->wr_data[idx].control_len) {
2736 trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2737
2738 len = MIN(size, rdma->wr_data[idx].control_len);
2739 memcpy(buf, rdma->wr_data[idx].control_curr, len);
2740 rdma->wr_data[idx].control_curr += len;
2741 rdma->wr_data[idx].control_len -= len;
2742 }
2743
2744 return len;
2745}
2746
2747
2748
2749
2750
2751
2752static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
2753 const struct iovec *iov,
2754 size_t niov,
2755 int **fds,
2756 size_t *nfds,
2757 Error **errp)
2758{
2759 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2760 RDMAContext *rdma;
2761 RDMAControlHeader head;
2762 int ret = 0;
2763 ssize_t i;
2764 size_t done = 0;
2765
2766 rcu_read_lock();
2767 rdma = atomic_rcu_read(&rioc->rdmain);
2768
2769 if (!rdma) {
2770 rcu_read_unlock();
2771 return -EIO;
2772 }
2773
2774 CHECK_ERROR_STATE();
2775
2776 for (i = 0; i < niov; i++) {
2777 size_t want = iov[i].iov_len;
2778 uint8_t *data = (void *)iov[i].iov_base;
2779
2780
2781
2782
2783
2784
2785 ret = qemu_rdma_fill(rdma, data, want, 0);
2786 done += ret;
2787 want -= ret;
2788
2789 if (want == 0) {
2790 continue;
2791 }
2792
2793
2794
2795 if (done > 0) {
2796 break;
2797 }
2798
2799
2800
2801
2802
2803 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2804
2805 if (ret < 0) {
2806 rdma->error_state = ret;
2807 rcu_read_unlock();
2808 return ret;
2809 }
2810
2811
2812
2813
2814 ret = qemu_rdma_fill(rdma, data, want, 0);
2815 done += ret;
2816 want -= ret;
2817
2818
2819 if (want) {
2820 if (done == 0) {
2821 rcu_read_unlock();
2822 return QIO_CHANNEL_ERR_BLOCK;
2823 } else {
2824 break;
2825 }
2826 }
2827 }
2828 rcu_read_unlock();
2829 return done;
2830}
2831
2832
2833
2834
2835static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2836{
2837 int ret;
2838
2839 if (qemu_rdma_write_flush(f, rdma) < 0) {
2840 return -EIO;
2841 }
2842
2843 while (rdma->nb_sent) {
2844 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2845 if (ret < 0) {
2846 error_report("rdma migration: complete polling error!");
2847 return -EIO;
2848 }
2849 }
2850
2851 qemu_rdma_unregister_waiting(rdma);
2852
2853 return 0;
2854}
2855
2856
2857static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
2858 bool blocking,
2859 Error **errp)
2860{
2861 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2862
2863 rioc->blocking = blocking;
2864 return 0;
2865}
2866
2867
/*
 * GSource used to watch a QIOChannelRDMA.  Instances are allocated by
 * g_source_new() in qio_channel_rdma_create_watch() and the GSource
 * callbacks cast their GSource pointer back to this type.
 */
typedef struct QIOChannelRDMASource QIOChannelRDMASource;
struct QIOChannelRDMASource {
    GSource parent;         /* first member, so GSource * casts are valid */
    QIOChannelRDMA *rioc;   /* watched channel; unreferenced on finalize */
    GIOCondition condition; /* condition requested when the watch was made */
};
2874
2875static gboolean
2876qio_channel_rdma_source_prepare(GSource *source,
2877 gint *timeout)
2878{
2879 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2880 RDMAContext *rdma;
2881 GIOCondition cond = 0;
2882 *timeout = -1;
2883
2884 rcu_read_lock();
2885 if (rsource->condition == G_IO_IN) {
2886 rdma = atomic_rcu_read(&rsource->rioc->rdmain);
2887 } else {
2888 rdma = atomic_rcu_read(&rsource->rioc->rdmaout);
2889 }
2890
2891 if (!rdma) {
2892 error_report("RDMAContext is NULL when prepare Gsource");
2893 rcu_read_unlock();
2894 return FALSE;
2895 }
2896
2897 if (rdma->wr_data[0].control_len) {
2898 cond |= G_IO_IN;
2899 }
2900 cond |= G_IO_OUT;
2901
2902 rcu_read_unlock();
2903 return cond & rsource->condition;
2904}
2905
2906static gboolean
2907qio_channel_rdma_source_check(GSource *source)
2908{
2909 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2910 RDMAContext *rdma;
2911 GIOCondition cond = 0;
2912
2913 rcu_read_lock();
2914 if (rsource->condition == G_IO_IN) {
2915 rdma = atomic_rcu_read(&rsource->rioc->rdmain);
2916 } else {
2917 rdma = atomic_rcu_read(&rsource->rioc->rdmaout);
2918 }
2919
2920 if (!rdma) {
2921 error_report("RDMAContext is NULL when check Gsource");
2922 rcu_read_unlock();
2923 return FALSE;
2924 }
2925
2926 if (rdma->wr_data[0].control_len) {
2927 cond |= G_IO_IN;
2928 }
2929 cond |= G_IO_OUT;
2930
2931 rcu_read_unlock();
2932 return cond & rsource->condition;
2933}
2934
2935static gboolean
2936qio_channel_rdma_source_dispatch(GSource *source,
2937 GSourceFunc callback,
2938 gpointer user_data)
2939{
2940 QIOChannelFunc func = (QIOChannelFunc)callback;
2941 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2942 RDMAContext *rdma;
2943 GIOCondition cond = 0;
2944
2945 rcu_read_lock();
2946 if (rsource->condition == G_IO_IN) {
2947 rdma = atomic_rcu_read(&rsource->rioc->rdmain);
2948 } else {
2949 rdma = atomic_rcu_read(&rsource->rioc->rdmaout);
2950 }
2951
2952 if (!rdma) {
2953 error_report("RDMAContext is NULL when dispatch Gsource");
2954 rcu_read_unlock();
2955 return FALSE;
2956 }
2957
2958 if (rdma->wr_data[0].control_len) {
2959 cond |= G_IO_IN;
2960 }
2961 cond |= G_IO_OUT;
2962
2963 rcu_read_unlock();
2964 return (*func)(QIO_CHANNEL(rsource->rioc),
2965 (cond & rsource->condition),
2966 user_data);
2967}
2968
2969static void
2970qio_channel_rdma_source_finalize(GSource *source)
2971{
2972 QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
2973
2974 object_unref(OBJECT(ssource->rioc));
2975}
2976
2977GSourceFuncs qio_channel_rdma_source_funcs = {
2978 qio_channel_rdma_source_prepare,
2979 qio_channel_rdma_source_check,
2980 qio_channel_rdma_source_dispatch,
2981 qio_channel_rdma_source_finalize
2982};
2983
2984static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
2985 GIOCondition condition)
2986{
2987 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2988 QIOChannelRDMASource *ssource;
2989 GSource *source;
2990
2991 source = g_source_new(&qio_channel_rdma_source_funcs,
2992 sizeof(QIOChannelRDMASource));
2993 ssource = (QIOChannelRDMASource *)source;
2994
2995 ssource->rioc = rioc;
2996 object_ref(OBJECT(rioc));
2997
2998 ssource->condition = condition;
2999
3000 return source;
3001}
3002
3003static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
3004 AioContext *ctx,
3005 IOHandler *io_read,
3006 IOHandler *io_write,
3007 void *opaque)
3008{
3009 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3010 if (io_read) {
3011 aio_set_fd_handler(ctx, rioc->rdmain->comp_channel->fd,
3012 false, io_read, io_write, NULL, opaque);
3013 } else {
3014 aio_set_fd_handler(ctx, rioc->rdmaout->comp_channel->fd,
3015 false, io_read, io_write, NULL, opaque);
3016 }
3017}
3018
3019static int qio_channel_rdma_close(QIOChannel *ioc,
3020 Error **errp)
3021{
3022 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3023 RDMAContext *rdmain, *rdmaout;
3024 trace_qemu_rdma_close();
3025
3026 rdmain = rioc->rdmain;
3027 if (rdmain) {
3028 atomic_rcu_set(&rioc->rdmain, NULL);
3029 }
3030
3031 rdmaout = rioc->rdmaout;
3032 if (rdmaout) {
3033 atomic_rcu_set(&rioc->rdmaout, NULL);
3034 }
3035
3036 synchronize_rcu();
3037
3038 if (rdmain) {
3039 qemu_rdma_cleanup(rdmain);
3040 }
3041
3042 if (rdmaout) {
3043 qemu_rdma_cleanup(rdmaout);
3044 }
3045
3046 g_free(rdmain);
3047 g_free(rdmaout);
3048
3049 return 0;
3050}
3051
3052static int
3053qio_channel_rdma_shutdown(QIOChannel *ioc,
3054 QIOChannelShutdown how,
3055 Error **errp)
3056{
3057 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3058 RDMAContext *rdmain, *rdmaout;
3059
3060 rcu_read_lock();
3061
3062 rdmain = atomic_rcu_read(&rioc->rdmain);
3063 rdmaout = atomic_rcu_read(&rioc->rdmain);
3064
3065 switch (how) {
3066 case QIO_CHANNEL_SHUTDOWN_READ:
3067 if (rdmain) {
3068 rdmain->error_state = -1;
3069 }
3070 break;
3071 case QIO_CHANNEL_SHUTDOWN_WRITE:
3072 if (rdmaout) {
3073 rdmaout->error_state = -1;
3074 }
3075 break;
3076 case QIO_CHANNEL_SHUTDOWN_BOTH:
3077 default:
3078 if (rdmain) {
3079 rdmain->error_state = -1;
3080 }
3081 if (rdmaout) {
3082 rdmaout->error_state = -1;
3083 }
3084 break;
3085 }
3086
3087 rcu_read_unlock();
3088 return 0;
3089}
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
3126 ram_addr_t block_offset, ram_addr_t offset,
3127 size_t size, uint64_t *bytes_sent)
3128{
3129 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3130 RDMAContext *rdma;
3131 int ret;
3132
3133 rcu_read_lock();
3134 rdma = atomic_rcu_read(&rioc->rdmaout);
3135
3136 if (!rdma) {
3137 rcu_read_unlock();
3138 return -EIO;
3139 }
3140
3141 CHECK_ERROR_STATE();
3142
3143 if (migrate_get_current()->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3144 rcu_read_unlock();
3145 return RAM_SAVE_CONTROL_NOT_SUPP;
3146 }
3147
3148 qemu_fflush(f);
3149
3150 if (size > 0) {
3151
3152
3153
3154
3155
3156 ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
3157 if (ret < 0) {
3158 error_report("rdma migration: write error! %d", ret);
3159 goto err;
3160 }
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170 if (bytes_sent) {
3171 *bytes_sent = 1;
3172 }
3173 } else {
3174 uint64_t index, chunk;
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187 ret = qemu_rdma_search_ram_block(rdma, block_offset,
3188 offset, size, &index, &chunk);
3189
3190 if (ret) {
3191 error_report("ram block search failed");
3192 goto err;
3193 }
3194
3195 qemu_rdma_signal_unregister(rdma, index, chunk, 0);
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205 }
3206
3207
3208
3209
3210
3211
3212
3213
3214 while (1) {
3215 uint64_t wr_id, wr_id_in;
3216 int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
3217 if (ret < 0) {
3218 error_report("rdma migration: polling error! %d", ret);
3219 goto err;
3220 }
3221
3222 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3223
3224 if (wr_id == RDMA_WRID_NONE) {
3225 break;
3226 }
3227 }
3228
3229 rcu_read_unlock();
3230 return RAM_SAVE_CONTROL_DELAYED;
3231err:
3232 rdma->error_state = ret;
3233 rcu_read_unlock();
3234 return ret;
3235}
3236
/* Defined further down; qemu_rdma_accept() installs it as an fd handler. */
static void rdma_accept_incoming_migration(void *opaque);
3238
3239static void rdma_cm_poll_handler(void *opaque)
3240{
3241 RDMAContext *rdma = opaque;
3242 int ret;
3243 struct rdma_cm_event *cm_event;
3244 MigrationIncomingState *mis = migration_incoming_get_current();
3245
3246 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3247 if (ret) {
3248 error_report("get_cm_event failed %d", errno);
3249 return;
3250 }
3251 rdma_ack_cm_event(cm_event);
3252
3253 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
3254 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3255 error_report("receive cm event, cm event is %d", cm_event->event);
3256 rdma->error_state = -EPIPE;
3257 if (rdma->return_path) {
3258 rdma->return_path->error_state = -EPIPE;
3259 }
3260
3261 if (mis->migration_incoming_co) {
3262 qemu_coroutine_enter(mis->migration_incoming_co);
3263 }
3264 return;
3265 }
3266}
3267
3268static int qemu_rdma_accept(RDMAContext *rdma)
3269{
3270 RDMACapabilities cap;
3271 struct rdma_conn_param conn_param = {
3272 .responder_resources = 2,
3273 .private_data = &cap,
3274 .private_data_len = sizeof(cap),
3275 };
3276 struct rdma_cm_event *cm_event;
3277 struct ibv_context *verbs;
3278 int ret = -EINVAL;
3279 int idx;
3280
3281 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3282 if (ret) {
3283 goto err_rdma_dest_wait;
3284 }
3285
3286 if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3287 rdma_ack_cm_event(cm_event);
3288 goto err_rdma_dest_wait;
3289 }
3290
3291 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3292
3293 network_to_caps(&cap);
3294
3295 if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3296 error_report("Unknown source RDMA version: %d, bailing...",
3297 cap.version);
3298 rdma_ack_cm_event(cm_event);
3299 goto err_rdma_dest_wait;
3300 }
3301
3302
3303
3304
3305 cap.flags &= known_capabilities;
3306
3307
3308
3309
3310
3311 if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3312 rdma->pin_all = true;
3313 }
3314
3315 rdma->cm_id = cm_event->id;
3316 verbs = cm_event->id->verbs;
3317
3318 rdma_ack_cm_event(cm_event);
3319
3320 trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3321
3322 caps_to_network(&cap);
3323
3324 trace_qemu_rdma_accept_pin_verbsc(verbs);
3325
3326 if (!rdma->verbs) {
3327 rdma->verbs = verbs;
3328 } else if (rdma->verbs != verbs) {
3329 error_report("ibv context not matching %p, %p!", rdma->verbs,
3330 verbs);
3331 goto err_rdma_dest_wait;
3332 }
3333
3334 qemu_rdma_dump_id("dest_init", verbs);
3335
3336 ret = qemu_rdma_alloc_pd_cq(rdma);
3337 if (ret) {
3338 error_report("rdma migration: error allocating pd and cq!");
3339 goto err_rdma_dest_wait;
3340 }
3341
3342 ret = qemu_rdma_alloc_qp(rdma);
3343 if (ret) {
3344 error_report("rdma migration: error allocating qp!");
3345 goto err_rdma_dest_wait;
3346 }
3347
3348 ret = qemu_rdma_init_ram_blocks(rdma);
3349 if (ret) {
3350 error_report("rdma migration: error initializing ram blocks!");
3351 goto err_rdma_dest_wait;
3352 }
3353
3354 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
3355 ret = qemu_rdma_reg_control(rdma, idx);
3356 if (ret) {
3357 error_report("rdma: error registering %d control", idx);
3358 goto err_rdma_dest_wait;
3359 }
3360 }
3361
3362
3363 if (migrate_postcopy() && !rdma->is_return_path) {
3364 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
3365 NULL,
3366 (void *)(intptr_t)rdma->return_path);
3367 } else {
3368 qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
3369 NULL, rdma);
3370 }
3371
3372 ret = rdma_accept(rdma->cm_id, &conn_param);
3373 if (ret) {
3374 error_report("rdma_accept returns %d", ret);
3375 goto err_rdma_dest_wait;
3376 }
3377
3378 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3379 if (ret) {
3380 error_report("rdma_accept get_cm_event failed %d", ret);
3381 goto err_rdma_dest_wait;
3382 }
3383
3384 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3385 error_report("rdma_accept not event established");
3386 rdma_ack_cm_event(cm_event);
3387 goto err_rdma_dest_wait;
3388 }
3389
3390 rdma_ack_cm_event(cm_event);
3391 rdma->connected = true;
3392
3393 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
3394 if (ret) {
3395 error_report("rdma migration: error posting second control recv");
3396 goto err_rdma_dest_wait;
3397 }
3398
3399 qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3400
3401 return 0;
3402
3403err_rdma_dest_wait:
3404 rdma->error_state = ret;
3405 qemu_rdma_cleanup(rdma);
3406 return ret;
3407}
3408
3409static int dest_ram_sort_func(const void *a, const void *b)
3410{
3411 unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3412 unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3413
3414 return (a_index < b_index) ? -1 : (a_index != b_index);
3415}
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
3427{
3428 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3429 .type = RDMA_CONTROL_REGISTER_RESULT,
3430 .repeat = 0,
3431 };
3432 RDMAControlHeader unreg_resp = { .len = 0,
3433 .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3434 .repeat = 0,
3435 };
3436 RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3437 .repeat = 1 };
3438 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3439 RDMAContext *rdma;
3440 RDMALocalBlocks *local;
3441 RDMAControlHeader head;
3442 RDMARegister *reg, *registers;
3443 RDMACompress *comp;
3444 RDMARegisterResult *reg_result;
3445 static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3446 RDMALocalBlock *block;
3447 void *host_addr;
3448 int ret = 0;
3449 int idx = 0;
3450 int count = 0;
3451 int i = 0;
3452
3453 rcu_read_lock();
3454 rdma = atomic_rcu_read(&rioc->rdmain);
3455
3456 if (!rdma) {
3457 rcu_read_unlock();
3458 return -EIO;
3459 }
3460
3461 CHECK_ERROR_STATE();
3462
3463 local = &rdma->local_ram_blocks;
3464 do {
3465 trace_qemu_rdma_registration_handle_wait();
3466
3467 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
3468
3469 if (ret < 0) {
3470 break;
3471 }
3472
3473 if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3474 error_report("rdma: Too many requests in this message (%d)."
3475 "Bailing.", head.repeat);
3476 ret = -EIO;
3477 break;
3478 }
3479
3480 switch (head.type) {
3481 case RDMA_CONTROL_COMPRESS:
3482 comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3483 network_to_compress(comp);
3484
3485 trace_qemu_rdma_registration_handle_compress(comp->length,
3486 comp->block_idx,
3487 comp->offset);
3488 if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3489 error_report("rdma: 'compress' bad block index %u (vs %d)",
3490 (unsigned int)comp->block_idx,
3491 rdma->local_ram_blocks.nb_blocks);
3492 ret = -EIO;
3493 goto out;
3494 }
3495 block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3496
3497 host_addr = block->local_host_addr +
3498 (comp->offset - block->offset);
3499
3500 ram_handle_compressed(host_addr, comp->value, comp->length);
3501 break;
3502
3503 case RDMA_CONTROL_REGISTER_FINISHED:
3504 trace_qemu_rdma_registration_handle_finished();
3505 goto out;
3506
3507 case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3508 trace_qemu_rdma_registration_handle_ram_blocks();
3509
3510
3511
3512
3513
3514 qsort(rdma->local_ram_blocks.block,
3515 rdma->local_ram_blocks.nb_blocks,
3516 sizeof(RDMALocalBlock), dest_ram_sort_func);
3517 for (i = 0; i < local->nb_blocks; i++) {
3518 local->block[i].index = i;
3519 }
3520
3521 if (rdma->pin_all) {
3522 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
3523 if (ret) {
3524 error_report("rdma migration: error dest "
3525 "registering ram blocks");
3526 goto out;
3527 }
3528 }
3529
3530
3531
3532
3533
3534
3535
3536 for (i = 0; i < local->nb_blocks; i++) {
3537 rdma->dest_blocks[i].remote_host_addr =
3538 (uintptr_t)(local->block[i].local_host_addr);
3539
3540 if (rdma->pin_all) {
3541 rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3542 }
3543
3544 rdma->dest_blocks[i].offset = local->block[i].offset;
3545 rdma->dest_blocks[i].length = local->block[i].length;
3546
3547 dest_block_to_network(&rdma->dest_blocks[i]);
3548 trace_qemu_rdma_registration_handle_ram_blocks_loop(
3549 local->block[i].block_name,
3550 local->block[i].offset,
3551 local->block[i].length,
3552 local->block[i].local_host_addr,
3553 local->block[i].src_index);
3554 }
3555
3556 blocks.len = rdma->local_ram_blocks.nb_blocks
3557 * sizeof(RDMADestBlock);
3558
3559
3560 ret = qemu_rdma_post_send_control(rdma,
3561 (uint8_t *) rdma->dest_blocks, &blocks);
3562
3563 if (ret < 0) {
3564 error_report("rdma migration: error sending remote info");
3565 goto out;
3566 }
3567
3568 break;
3569 case RDMA_CONTROL_REGISTER_REQUEST:
3570 trace_qemu_rdma_registration_handle_register(head.repeat);
3571
3572 reg_resp.repeat = head.repeat;
3573 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3574
3575 for (count = 0; count < head.repeat; count++) {
3576 uint64_t chunk;
3577 uint8_t *chunk_start, *chunk_end;
3578
3579 reg = ®isters[count];
3580 network_to_register(reg);
3581
3582 reg_result = &results[count];
3583
3584 trace_qemu_rdma_registration_handle_register_loop(count,
3585 reg->current_index, reg->key.current_addr, reg->chunks);
3586
3587 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3588 error_report("rdma: 'register' bad block index %u (vs %d)",
3589 (unsigned int)reg->current_index,
3590 rdma->local_ram_blocks.nb_blocks);
3591 ret = -ENOENT;
3592 goto out;
3593 }
3594 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3595 if (block->is_ram_block) {
3596 if (block->offset > reg->key.current_addr) {
3597 error_report("rdma: bad register address for block %s"
3598 " offset: %" PRIx64 " current_addr: %" PRIx64,
3599 block->block_name, block->offset,
3600 reg->key.current_addr);
3601 ret = -ERANGE;
3602 goto out;
3603 }
3604 host_addr = (block->local_host_addr +
3605 (reg->key.current_addr - block->offset));
3606 chunk = ram_chunk_index(block->local_host_addr,
3607 (uint8_t *) host_addr);
3608 } else {
3609 chunk = reg->key.chunk;
3610 host_addr = block->local_host_addr +
3611 (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3612
3613 if (host_addr < (void *)block->local_host_addr) {
3614 error_report("rdma: bad chunk for block %s"
3615 " chunk: %" PRIx64,
3616 block->block_name, reg->key.chunk);
3617 ret = -ERANGE;
3618 goto out;
3619 }
3620 }
3621 chunk_start = ram_chunk_start(block, chunk);
3622 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3623
3624 uint32_t tmp_rkey = 0;
3625 if (qemu_rdma_register_and_get_keys(rdma, block,
3626 (uintptr_t)host_addr, NULL, &tmp_rkey,
3627 chunk, chunk_start, chunk_end)) {
3628 error_report("cannot get rkey");
3629 ret = -EINVAL;
3630 goto out;
3631 }
3632 reg_result->rkey = tmp_rkey;
3633
3634 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3635
3636 trace_qemu_rdma_registration_handle_register_rkey(
3637 reg_result->rkey);
3638
3639 result_to_network(reg_result);
3640 }
3641
3642 ret = qemu_rdma_post_send_control(rdma,
3643 (uint8_t *) results, ®_resp);
3644
3645 if (ret < 0) {
3646 error_report("Failed to send control buffer");
3647 goto out;
3648 }
3649 break;
3650 case RDMA_CONTROL_UNREGISTER_REQUEST:
3651 trace_qemu_rdma_registration_handle_unregister(head.repeat);
3652 unreg_resp.repeat = head.repeat;
3653 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3654
3655 for (count = 0; count < head.repeat; count++) {
3656 reg = ®isters[count];
3657 network_to_register(reg);
3658
3659 trace_qemu_rdma_registration_handle_unregister_loop(count,
3660 reg->current_index, reg->key.chunk);
3661
3662 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3663
3664 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3665 block->pmr[reg->key.chunk] = NULL;
3666
3667 if (ret != 0) {
3668 perror("rdma unregistration chunk failed");
3669 ret = -ret;
3670 goto out;
3671 }
3672
3673 rdma->total_registrations--;
3674
3675 trace_qemu_rdma_registration_handle_unregister_success(
3676 reg->key.chunk);
3677 }
3678
3679 ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
3680
3681 if (ret < 0) {
3682 error_report("Failed to send control buffer");
3683 goto out;
3684 }
3685 break;
3686 case RDMA_CONTROL_REGISTER_RESULT:
3687 error_report("Invalid RESULT message at dest.");
3688 ret = -EIO;
3689 goto out;
3690 default:
3691 error_report("Unknown control message %s", control_desc(head.type));
3692 ret = -EIO;
3693 goto out;
3694 }
3695 } while (1);
3696out:
3697 if (ret < 0) {
3698 rdma->error_state = ret;
3699 }
3700 rcu_read_unlock();
3701 return ret;
3702}
3703
3704
3705
3706
3707
3708
3709
3710
3711static int
3712rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
3713{
3714 RDMAContext *rdma;
3715 int curr;
3716 int found = -1;
3717
3718 rcu_read_lock();
3719 rdma = atomic_rcu_read(&rioc->rdmain);
3720
3721 if (!rdma) {
3722 rcu_read_unlock();
3723 return -EIO;
3724 }
3725
3726
3727 for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3728 if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3729 found = curr;
3730 break;
3731 }
3732 }
3733
3734 if (found == -1) {
3735 error_report("RAMBlock '%s' not found on destination", name);
3736 rcu_read_unlock();
3737 return -ENOENT;
3738 }
3739
3740 rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3741 trace_rdma_block_notification_handle(name, rdma->next_src_index);
3742 rdma->next_src_index++;
3743
3744 rcu_read_unlock();
3745 return 0;
3746}
3747
3748static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
3749{
3750 switch (flags) {
3751 case RAM_CONTROL_BLOCK_REG:
3752 return rdma_block_notification_handle(opaque, data);
3753
3754 case RAM_CONTROL_HOOK:
3755 return qemu_rdma_registration_handle(f, opaque);
3756
3757 default:
3758
3759 abort();
3760 }
3761}
3762
3763static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
3764 uint64_t flags, void *data)
3765{
3766 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3767 RDMAContext *rdma;
3768
3769 rcu_read_lock();
3770 rdma = atomic_rcu_read(&rioc->rdmaout);
3771 if (!rdma) {
3772 rcu_read_unlock();
3773 return -EIO;
3774 }
3775
3776 CHECK_ERROR_STATE();
3777
3778 if (migrate_get_current()->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3779 rcu_read_unlock();
3780 return 0;
3781 }
3782
3783 trace_qemu_rdma_registration_start(flags);
3784 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3785 qemu_fflush(f);
3786
3787 rcu_read_unlock();
3788 return 0;
3789}
3790
3791
3792
3793
3794
3795static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
3796 uint64_t flags, void *data)
3797{
3798 Error *local_err = NULL, **errp = &local_err;
3799 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3800 RDMAContext *rdma;
3801 RDMAControlHeader head = { .len = 0, .repeat = 1 };
3802 int ret = 0;
3803
3804 rcu_read_lock();
3805 rdma = atomic_rcu_read(&rioc->rdmaout);
3806 if (!rdma) {
3807 rcu_read_unlock();
3808 return -EIO;
3809 }
3810
3811 CHECK_ERROR_STATE();
3812
3813 if (migrate_get_current()->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3814 rcu_read_unlock();
3815 return 0;
3816 }
3817
3818 qemu_fflush(f);
3819 ret = qemu_rdma_drain_cq(f, rdma);
3820
3821 if (ret < 0) {
3822 goto err;
3823 }
3824
3825 if (flags == RAM_CONTROL_SETUP) {
3826 RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3827 RDMALocalBlocks *local = &rdma->local_ram_blocks;
3828 int reg_result_idx, i, nb_dest_blocks;
3829
3830 head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3831 trace_qemu_rdma_registration_stop_ram();
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841 ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3842 ®_result_idx, rdma->pin_all ?
3843 qemu_rdma_reg_whole_ram_blocks : NULL);
3844 if (ret < 0) {
3845 ERROR(errp, "receiving remote info!");
3846 rcu_read_unlock();
3847 return ret;
3848 }
3849
3850 nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864 if (local->nb_blocks != nb_dest_blocks) {
3865 ERROR(errp, "ram blocks mismatch (Number of blocks %d vs %d) "
3866 "Your QEMU command line parameters are probably "
3867 "not identical on both the source and destination.",
3868 local->nb_blocks, nb_dest_blocks);
3869 rdma->error_state = -EINVAL;
3870 rcu_read_unlock();
3871 return -EINVAL;
3872 }
3873
3874 qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3875 memcpy(rdma->dest_blocks,
3876 rdma->wr_data[reg_result_idx].control_curr, resp.len);
3877 for (i = 0; i < nb_dest_blocks; i++) {
3878 network_to_dest_block(&rdma->dest_blocks[i]);
3879
3880
3881 if (rdma->dest_blocks[i].length != local->block[i].length) {
3882 ERROR(errp, "Block %s/%d has a different length %" PRIu64
3883 "vs %" PRIu64, local->block[i].block_name, i,
3884 local->block[i].length,
3885 rdma->dest_blocks[i].length);
3886 rdma->error_state = -EINVAL;
3887 rcu_read_unlock();
3888 return -EINVAL;
3889 }
3890 local->block[i].remote_host_addr =
3891 rdma->dest_blocks[i].remote_host_addr;
3892 local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3893 }
3894 }
3895
3896 trace_qemu_rdma_registration_stop(flags);
3897
3898 head.type = RDMA_CONTROL_REGISTER_FINISHED;
3899 ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3900
3901 if (ret < 0) {
3902 goto err;
3903 }
3904
3905 rcu_read_unlock();
3906 return 0;
3907err:
3908 rdma->error_state = ret;
3909 rcu_read_unlock();
3910 return ret;
3911}
3912
/* Hooks that qemu_fopen_rdma() installs on a QEMUFile opened for reading. */
static const QEMUFileHooks rdma_read_hooks = {
    .hook_ram_load = rdma_load_hook,
};
3916
/* Hooks that qemu_fopen_rdma() installs on a QEMUFile opened for writing. */
static const QEMUFileHooks rdma_write_hooks = {
    .before_ram_iterate = qemu_rdma_registration_start,
    .after_ram_iterate = qemu_rdma_registration_stop,
    .save_page = qemu_rdma_save_page,
};
3922
3923
3924static void qio_channel_rdma_finalize(Object *obj)
3925{
3926 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
3927 if (rioc->rdmain) {
3928 qemu_rdma_cleanup(rioc->rdmain);
3929 g_free(rioc->rdmain);
3930 rioc->rdmain = NULL;
3931 }
3932 if (rioc->rdmaout) {
3933 qemu_rdma_cleanup(rioc->rdmaout);
3934 g_free(rioc->rdmaout);
3935 rioc->rdmaout = NULL;
3936 }
3937}
3938
3939static void qio_channel_rdma_class_init(ObjectClass *klass,
3940 void *class_data G_GNUC_UNUSED)
3941{
3942 QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
3943
3944 ioc_klass->io_writev = qio_channel_rdma_writev;
3945 ioc_klass->io_readv = qio_channel_rdma_readv;
3946 ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
3947 ioc_klass->io_close = qio_channel_rdma_close;
3948 ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
3949 ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
3950 ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
3951}
3952
/* QOM type description of the RDMA channel, a subtype of TYPE_QIO_CHANNEL. */
static const TypeInfo qio_channel_rdma_info = {
    .parent = TYPE_QIO_CHANNEL,
    .name = TYPE_QIO_CHANNEL_RDMA,
    .instance_size = sizeof(QIOChannelRDMA),
    .instance_finalize = qio_channel_rdma_finalize,
    .class_init = qio_channel_rdma_class_init,
};
3960
/* Register the RDMA channel type described by qio_channel_rdma_info. */
static void qio_channel_rdma_register_types(void)
{
    type_register_static(&qio_channel_rdma_info);
}

/* Hand the registration function to the type_init() mechanism. */
type_init(qio_channel_rdma_register_types);
3967
3968static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
3969{
3970 QIOChannelRDMA *rioc;
3971
3972 if (qemu_file_mode_is_not_valid(mode)) {
3973 return NULL;
3974 }
3975
3976 rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
3977
3978 if (mode[0] == 'w') {
3979 rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc));
3980 rioc->rdmaout = rdma;
3981 rioc->rdmain = rdma->return_path;
3982 qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
3983 } else {
3984 rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc));
3985 rioc->rdmain = rdma;
3986 rioc->rdmaout = rdma->return_path;
3987 qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
3988 }
3989
3990 return rioc->file;
3991}
3992
3993static void rdma_accept_incoming_migration(void *opaque)
3994{
3995 RDMAContext *rdma = opaque;
3996 int ret;
3997 QEMUFile *f;
3998 Error *local_err = NULL, **errp = &local_err;
3999
4000 trace_qemu_rdma_accept_incoming_migration();
4001 ret = qemu_rdma_accept(rdma);
4002
4003 if (ret) {
4004 ERROR(errp, "RDMA Migration initialization failed!");
4005 return;
4006 }
4007
4008 trace_qemu_rdma_accept_incoming_migration_accepted();
4009
4010 if (rdma->is_return_path) {
4011 return;
4012 }
4013
4014 f = qemu_fopen_rdma(rdma, "rb");
4015 if (f == NULL) {
4016 ERROR(errp, "could not qemu_fopen_rdma!");
4017 qemu_rdma_cleanup(rdma);
4018 return;
4019 }
4020
4021 rdma->migration_started_on_destination = 1;
4022 migration_fd_process_incoming(f);
4023}
4024
4025void rdma_start_incoming_migration(const char *host_port, Error **errp)
4026{
4027 int ret;
4028 RDMAContext *rdma, *rdma_return_path = NULL;
4029 Error *local_err = NULL;
4030
4031 trace_rdma_start_incoming_migration();
4032 rdma = qemu_rdma_data_init(host_port, &local_err);
4033
4034 if (rdma == NULL) {
4035 goto err;
4036 }
4037
4038 ret = qemu_rdma_dest_init(rdma, &local_err);
4039
4040 if (ret) {
4041 goto err;
4042 }
4043
4044 trace_rdma_start_incoming_migration_after_dest_init();
4045
4046 ret = rdma_listen(rdma->listen_id, 5);
4047
4048 if (ret) {
4049 ERROR(errp, "listening on socket!");
4050 goto err;
4051 }
4052
4053 trace_rdma_start_incoming_migration_after_rdma_listen();
4054
4055
4056 if (migrate_postcopy()) {
4057 rdma_return_path = qemu_rdma_data_init(host_port, &local_err);
4058
4059 if (rdma_return_path == NULL) {
4060 goto err;
4061 }
4062
4063 qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
4064 }
4065
4066 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
4067 NULL, (void *)(intptr_t)rdma);
4068 return;
4069err:
4070 error_propagate(errp, local_err);
4071 g_free(rdma);
4072 g_free(rdma_return_path);
4073}
4074
4075void rdma_start_outgoing_migration(void *opaque,
4076 const char *host_port, Error **errp)
4077{
4078 MigrationState *s = opaque;
4079 RDMAContext *rdma = qemu_rdma_data_init(host_port, errp);
4080 RDMAContext *rdma_return_path = NULL;
4081 int ret = 0;
4082
4083 if (rdma == NULL) {
4084 goto err;
4085 }
4086
4087 ret = qemu_rdma_source_init(rdma,
4088 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4089
4090 if (ret) {
4091 goto err;
4092 }
4093
4094 trace_rdma_start_outgoing_migration_after_rdma_source_init();
4095 ret = qemu_rdma_connect(rdma, errp);
4096
4097 if (ret) {
4098 goto err;
4099 }
4100
4101
4102 if (migrate_postcopy()) {
4103 rdma_return_path = qemu_rdma_data_init(host_port, errp);
4104
4105 if (rdma_return_path == NULL) {
4106 goto err;
4107 }
4108
4109 ret = qemu_rdma_source_init(rdma_return_path,
4110 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4111
4112 if (ret) {
4113 goto err;
4114 }
4115
4116 ret = qemu_rdma_connect(rdma_return_path, errp);
4117
4118 if (ret) {
4119 goto err;
4120 }
4121
4122 rdma->return_path = rdma_return_path;
4123 rdma_return_path->return_path = rdma;
4124 rdma_return_path->is_return_path = true;
4125 }
4126
4127 trace_rdma_start_outgoing_migration_after_rdma_connect();
4128
4129 s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
4130 migrate_fd_connect(s, NULL);
4131 return;
4132err:
4133 g_free(rdma);
4134 g_free(rdma_return_path);
4135}
4136