1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include "qemu/osdep.h"
18#include "qapi/error.h"
19#include "qemu/cutils.h"
20#include "rdma.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "ram.h"
24#include "qemu-file-channel.h"
25#include "qemu/error-report.h"
26#include "qemu/main-loop.h"
27#include "qemu/module.h"
28#include "qemu/rcu.h"
29#include "qemu/sockets.h"
30#include "qemu/bitmap.h"
31#include "qemu/coroutine.h"
32#include "exec/memory.h"
33#include <sys/socket.h>
34#include <netdb.h>
35#include <arpa/inet.h>
36#include <rdma/rdma_cma.h>
37#include "trace.h"
38
39
40
41
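/*
 * Print an RDMA error to stderr and, when an Error ** was supplied and is
 * not already set, record the same message through error_setg() so the
 * caller can propagate it.
 */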
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)
49
50#define RDMA_RESOLVE_TIMEOUT_MS 10000
51
52
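/*
 * Writes are coalesced until they reach this size or stop being contiguous;
 * RDMA_SIGNALED_SEND_MAX bounds the number of outstanding signalled work
 * requests and sizes the unregistration queue.
 */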
53#define RDMA_MERGE_MAX (2 * 1024 * 1024)
54#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
55
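/* RAM blocks are registered with the hardware in 1 MB (1 << 20) chunks. */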
56#define RDMA_REG_CHUNK_SHIFT 20
57
58
59
60
61
62
63
64#define RDMA_SEND_INCREMENT 32768
65
66
67
68
69#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
70#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096
71
72#define RDMA_CONTROL_VERSION_CURRENT 1
73
74
75
76#define RDMA_CAPABILITY_PIN_ALL 0x01
77
78
79
80
81
82static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
83
#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                error_report("RDMA is in an error state waiting for " \
                             "migration to abort!"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)
95
96
97
98
99
100
101
102
103
104
105
106
107
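/*
 * A completion's 64-bit work request ID packs three fields:
 *   bits [15:0]  - work request type (RDMA_WRID_*)
 *   bits [29:16] - RAM block index
 *   bits [63:30] - chunk index within that block
 *
 * For example (illustration only), qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
 * 2, 5) yields 1 | (2 << 16) | (5 << 30), which qemu_rdma_poll() decodes
 * again using the masks below.
 */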
108#define RDMA_WRID_TYPE_SHIFT 0UL
109#define RDMA_WRID_BLOCK_SHIFT 16UL
110#define RDMA_WRID_CHUNK_SHIFT 30UL
111
112#define RDMA_WRID_TYPE_MASK \
113 ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)
114
115#define RDMA_WRID_BLOCK_MASK \
116 (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))
117
118#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
119
120
121
122
123
124
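/*
 * Work request types placed in the low bits of a wr_id so that RDMA writes
 * can be told apart from control-channel SENDs and RECVs when polling the
 * completion queue.
 */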
125enum {
126 RDMA_WRID_NONE = 0,
127 RDMA_WRID_RDMA_WRITE = 1,
128 RDMA_WRID_SEND_CONTROL = 2000,
129 RDMA_WRID_RECV_CONTROL = 4000,
130};
131
132static const char *wrid_desc[] = {
133 [RDMA_WRID_NONE] = "NONE",
134 [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
135 [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
136 [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
137};
138
139
140
141
142
143
144
145
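/*
 * Indexes into RDMAContext.wr_data[]: one slot for the READY handshake,
 * one for anticipated response data, and one for outgoing control messages.
 */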
146enum {
147 RDMA_WRID_READY = 0,
148 RDMA_WRID_DATA,
149 RDMA_WRID_CONTROL,
150 RDMA_WRID_MAX,
151};
152
153
154
155
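/* Types of messages exchanged over the dedicated control channel. */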
156enum {
157 RDMA_CONTROL_NONE = 0,
158 RDMA_CONTROL_ERROR,
159 RDMA_CONTROL_READY,
160 RDMA_CONTROL_QEMU_FILE,
161 RDMA_CONTROL_RAM_BLOCKS_REQUEST,
162 RDMA_CONTROL_RAM_BLOCKS_RESULT,
163 RDMA_CONTROL_COMPRESS,
164 RDMA_CONTROL_REGISTER_REQUEST,
165 RDMA_CONTROL_REGISTER_RESULT,
166 RDMA_CONTROL_REGISTER_FINISHED,
167 RDMA_CONTROL_UNREGISTER_REQUEST,
168 RDMA_CONTROL_UNREGISTER_FINISHED,
169};
170
171
172
173
174
175
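/*
 * A pre-registered buffer used for a control-channel work request, together
 * with its memory region and the length/position of the data most recently
 * received into it.
 */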
176typedef struct {
177 uint8_t control[RDMA_CONTROL_MAX_BUFFER];
178 struct ibv_mr *control_mr;
179 size_t control_len;
180 uint8_t *control_curr;
181} RDMAWorkRequestData;
182
183
184
185
186typedef struct {
187 uint32_t version;
188 uint32_t flags;
189} RDMACapabilities;
190
191static void caps_to_network(RDMACapabilities *cap)
192{
193 cap->version = htonl(cap->version);
194 cap->flags = htonl(cap->flags);
195}
196
197static void network_to_caps(RDMACapabilities *cap)
198{
199 cap->version = ntohl(cap->version);
200 cap->flags = ntohl(cap->flags);
201}
202
203
204
205
206
207
208
209
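/*
 * Local view of a RAM block: its host mapping, the remote address and rkeys
 * learned from the peer, and per-chunk bitmaps tracking which chunks are in
 * transit or queued for unregistration.
 */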
210typedef struct RDMALocalBlock {
211 char *block_name;
212 uint8_t *local_host_addr;
213 uint64_t remote_host_addr;
214 uint64_t offset;
215 uint64_t length;
216 struct ibv_mr **pmr;
217 struct ibv_mr *mr;
218 uint32_t *remote_keys;
219 uint32_t remote_rkey;
220 int index;
221 unsigned int src_index;
222 bool is_ram_block;
223 int nb_chunks;
224 unsigned long *transit_bitmap;
225 unsigned long *unregister_bitmap;
226} RDMALocalBlock;
227
228
229
230
231
232
233
234
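/*
 * Wire format used to describe a RAM block to the peer; kept packed and
 * byte-swapped with dest_block_to_network()/network_to_dest_block().
 */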
235typedef struct QEMU_PACKED RDMADestBlock {
236 uint64_t remote_host_addr;
237 uint64_t offset;
238 uint64_t length;
239 uint32_t remote_rkey;
240 uint32_t padding;
241} RDMADestBlock;
242
243static const char *control_desc(unsigned int rdma_control)
244{
245 static const char *strs[] = {
246 [RDMA_CONTROL_NONE] = "NONE",
247 [RDMA_CONTROL_ERROR] = "ERROR",
248 [RDMA_CONTROL_READY] = "READY",
249 [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
250 [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
251 [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
252 [RDMA_CONTROL_COMPRESS] = "COMPRESS",
253 [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
254 [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
255 [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
256 [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
257 [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
258 };
259
260 if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
261 return "??BAD CONTROL VALUE??";
262 }
263
264 return strs[rdma_control];
265}
266
267static uint64_t htonll(uint64_t v)
268{
269 union { uint32_t lv[2]; uint64_t llv; } u;
270 u.lv[0] = htonl(v >> 32);
271 u.lv[1] = htonl(v & 0xFFFFFFFFULL);
272 return u.llv;
273}
274
static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}
280
281static void dest_block_to_network(RDMADestBlock *db)
282{
283 db->remote_host_addr = htonll(db->remote_host_addr);
284 db->offset = htonll(db->offset);
285 db->length = htonll(db->length);
286 db->remote_rkey = htonl(db->remote_rkey);
287}
288
289static void network_to_dest_block(RDMADestBlock *db)
290{
291 db->remote_host_addr = ntohll(db->remote_host_addr);
292 db->offset = ntohll(db->offset);
293 db->length = ntohll(db->length);
294 db->remote_rkey = ntohl(db->remote_rkey);
295}
296
297
298
299
300
301
302typedef struct RDMALocalBlocks {
303 int nb_blocks;
304 bool init;
305 RDMALocalBlock *block;
306} RDMALocalBlocks;
307
308
309
310
311
312
313
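/*
 * Principal state of an RDMA migration channel: connection-manager handles,
 * protection domain, queue pair and completion queue, the set of local RAM
 * blocks, the write currently being coalesced, and bookkeeping for dynamic
 * (un)registration.
 */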
314typedef struct RDMAContext {
315 char *host;
316 int port;
317
318 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
319
320
321
322
323
324
325
326
327 int control_ready_expected;
328
329
330 int nb_sent;
331
332
333
334 uint64_t current_addr;
335 uint64_t current_length;
336
337 int current_index;
338
339 int current_chunk;
340
341 bool pin_all;
342
343
344
345
346
347
348
349
350 struct rdma_cm_id *cm_id;
351 struct rdma_cm_id *listen_id;
352 bool connected;
353
354 struct ibv_context *verbs;
355 struct rdma_event_channel *channel;
356 struct ibv_qp *qp;
357 struct ibv_comp_channel *comp_channel;
358 struct ibv_pd *pd;
359 struct ibv_cq *cq;
360
361
362
363
364
365
366 int error_state;
367 int error_reported;
368 int received_error;
369
370
371
372
373 RDMALocalBlocks local_ram_blocks;
374 RDMADestBlock *dest_blocks;
375
376
377 unsigned int next_src_index;
378
379
380
381
382
383
384 int migration_started_on_destination;
385
386 int total_registrations;
387 int total_writes;
388
389 int unregister_current, unregister_next;
390 uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
391
392 GHashTable *blockmap;
393
394
395 struct RDMAContext *return_path;
396 bool is_return_path;
397} RDMAContext;
398
399#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
400#define QIO_CHANNEL_RDMA(obj) \
401 OBJECT_CHECK(QIOChannelRDMA, (obj), TYPE_QIO_CHANNEL_RDMA)
402
403typedef struct QIOChannelRDMA QIOChannelRDMA;
404
405
406struct QIOChannelRDMA {
407 QIOChannel parent;
408 RDMAContext *rdmain;
409 RDMAContext *rdmaout;
410 QEMUFile *file;
411 bool blocking;
412};
413
414
415
416
417
418typedef struct QEMU_PACKED {
419 uint32_t len;
420 uint32_t type;
421 uint32_t repeat;
422 uint32_t padding;
423} RDMAControlHeader;
424
425static void control_to_network(RDMAControlHeader *control)
426{
427 control->type = htonl(control->type);
428 control->len = htonl(control->len);
429 control->repeat = htonl(control->repeat);
430}
431
432static void network_to_control(RDMAControlHeader *control)
433{
434 control->type = ntohl(control->type);
435 control->len = ntohl(control->len);
436 control->repeat = ntohl(control->repeat);
437}
438
439
440
441
442
443
444
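/*
 * Body of a REGISTER_REQUEST or UNREGISTER_REQUEST control message: the
 * address (or chunk number) to act on, the block it belongs to, and the
 * number of chunks covered.
 */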
445typedef struct QEMU_PACKED {
446 union QEMU_PACKED {
447 uint64_t current_addr;
448 uint64_t chunk;
449 } key;
450 uint32_t current_index;
451 uint32_t padding;
452 uint64_t chunks;
453} RDMARegister;
454
455static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
456{
457 RDMALocalBlock *local_block;
458 local_block = &rdma->local_ram_blocks.block[reg->current_index];
459
460 if (local_block->is_ram_block) {
461
462
463
464
465 reg->key.current_addr -= local_block->offset;
466 reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
467 }
468 reg->key.current_addr = htonll(reg->key.current_addr);
469 reg->current_index = htonl(reg->current_index);
470 reg->chunks = htonll(reg->chunks);
471}
472
473static void network_to_register(RDMARegister *reg)
474{
475 reg->key.current_addr = ntohll(reg->key.current_addr);
476 reg->current_index = ntohl(reg->current_index);
477 reg->chunks = ntohll(reg->chunks);
478}
479
480typedef struct QEMU_PACKED {
481 uint32_t value;
482 uint32_t block_idx;
483 uint64_t offset;
484 uint64_t length;
485} RDMACompress;
486
487static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
488{
489 comp->value = htonl(comp->value);
490
491
492
493
494 comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
495 comp->offset += rdma->dest_blocks[comp->block_idx].offset;
496 comp->block_idx = htonl(comp->block_idx);
497 comp->offset = htonll(comp->offset);
498 comp->length = htonll(comp->length);
499}
500
501static void network_to_compress(RDMACompress *comp)
502{
503 comp->value = ntohl(comp->value);
504 comp->block_idx = ntohl(comp->block_idx);
505 comp->offset = ntohll(comp->offset);
506 comp->length = ntohll(comp->length);
507}
508
509
510
511
512
513
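/*
 * Body of a REGISTER_RESULT control message: the rkey and host address of
 * the chunk that the destination just registered.
 */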
514typedef struct QEMU_PACKED {
515 uint32_t rkey;
516 uint32_t padding;
517 uint64_t host_addr;
518} RDMARegisterResult;
519
520static void result_to_network(RDMARegisterResult *result)
521{
522 result->rkey = htonl(result->rkey);
523 result->host_addr = htonll(result->host_addr);
}
525
526static void network_to_result(RDMARegisterResult *result)
527{
528 result->rkey = ntohl(result->rkey);
529 result->host_addr = ntohll(result->host_addr);
}
531
532const char *print_wrid(int wrid);
533static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
534 uint8_t *data, RDMAControlHeader *resp,
535 int *resp_idx,
536 int (*callback)(RDMAContext *rdma));
537
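/* Helpers mapping a host address within a block to its chunk and back. */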
538static inline uint64_t ram_chunk_index(const uint8_t *start,
539 const uint8_t *host)
540{
541 return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
542}
543
544static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
545 uint64_t i)
546{
547 return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
548 (i << RDMA_REG_CHUNK_SHIFT));
549}
550
551static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
552 uint64_t i)
553{
554 uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
555 (1UL << RDMA_REG_CHUNK_SHIFT);
556
557 if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
558 result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
559 }
560
561 return result;
562}
563
564static int rdma_add_block(RDMAContext *rdma, const char *block_name,
565 void *host_addr,
566 ram_addr_t block_offset, uint64_t length)
567{
568 RDMALocalBlocks *local = &rdma->local_ram_blocks;
569 RDMALocalBlock *block;
570 RDMALocalBlock *old = local->block;
571
572 local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
573
574 if (local->nb_blocks) {
575 int x;
576
577 if (rdma->blockmap) {
578 for (x = 0; x < local->nb_blocks; x++) {
579 g_hash_table_remove(rdma->blockmap,
580 (void *)(uintptr_t)old[x].offset);
581 g_hash_table_insert(rdma->blockmap,
582 (void *)(uintptr_t)old[x].offset,
583 &local->block[x]);
584 }
585 }
586 memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
587 g_free(old);
588 }
589
590 block = &local->block[local->nb_blocks];
591
592 block->block_name = g_strdup(block_name);
593 block->local_host_addr = host_addr;
594 block->offset = block_offset;
595 block->length = length;
596 block->index = local->nb_blocks;
597 block->src_index = ~0U;
598 block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
599 block->transit_bitmap = bitmap_new(block->nb_chunks);
600 bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
601 block->unregister_bitmap = bitmap_new(block->nb_chunks);
602 bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
603 block->remote_keys = g_new0(uint32_t, block->nb_chunks);
604
605 block->is_ram_block = local->init ? false : true;
606
607 if (rdma->blockmap) {
608 g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
609 }
610
611 trace_rdma_add_block(block_name, local->nb_blocks,
612 (uintptr_t) block->local_host_addr,
613 block->offset, block->length,
614 (uintptr_t) (block->local_host_addr + block->length),
615 BITS_TO_LONGS(block->nb_chunks) *
616 sizeof(unsigned long) * 8,
617 block->nb_chunks);
618
619 local->nb_blocks++;
620
621 return 0;
622}
623
624
625
626
627
628
629static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
630{
631 const char *block_name = qemu_ram_get_idstr(rb);
632 void *host_addr = qemu_ram_get_host_addr(rb);
633 ram_addr_t block_offset = qemu_ram_get_offset(rb);
634 ram_addr_t length = qemu_ram_get_used_length(rb);
635 return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
636}
637
638
639
640
641
642
643static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
644{
645 RDMALocalBlocks *local = &rdma->local_ram_blocks;
646 int ret;
647
648 assert(rdma->blockmap == NULL);
649 memset(local, 0, sizeof *local);
650 ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
651 if (ret) {
652 return ret;
653 }
654 trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
655 rdma->dest_blocks = g_new0(RDMADestBlock,
656 rdma->local_ram_blocks.nb_blocks);
657 local->init = true;
658 return 0;
659}
660
661
662
663
664
665static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
666{
667 RDMALocalBlocks *local = &rdma->local_ram_blocks;
668 RDMALocalBlock *old = local->block;
669 int x;
670
671 if (rdma->blockmap) {
672 g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
673 }
674 if (block->pmr) {
675 int j;
676
677 for (j = 0; j < block->nb_chunks; j++) {
678 if (!block->pmr[j]) {
679 continue;
680 }
681 ibv_dereg_mr(block->pmr[j]);
682 rdma->total_registrations--;
683 }
684 g_free(block->pmr);
685 block->pmr = NULL;
686 }
687
688 if (block->mr) {
689 ibv_dereg_mr(block->mr);
690 rdma->total_registrations--;
691 block->mr = NULL;
692 }
693
694 g_free(block->transit_bitmap);
695 block->transit_bitmap = NULL;
696
697 g_free(block->unregister_bitmap);
698 block->unregister_bitmap = NULL;
699
700 g_free(block->remote_keys);
701 block->remote_keys = NULL;
702
703 g_free(block->block_name);
704 block->block_name = NULL;
705
706 if (rdma->blockmap) {
707 for (x = 0; x < local->nb_blocks; x++) {
708 g_hash_table_remove(rdma->blockmap,
709 (void *)(uintptr_t)old[x].offset);
710 }
711 }
712
713 if (local->nb_blocks > 1) {
714
715 local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
716
717 if (block->index) {
718 memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
719 }
720
721 if (block->index < (local->nb_blocks - 1)) {
722 memcpy(local->block + block->index, old + (block->index + 1),
723 sizeof(RDMALocalBlock) *
724 (local->nb_blocks - (block->index + 1)));
725 for (x = block->index; x < local->nb_blocks - 1; x++) {
726 local->block[x].index--;
727 }
728 }
729 } else {
730 assert(block == local->block);
731 local->block = NULL;
732 }
733
734 trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
735 block->offset, block->length,
736 (uintptr_t)(block->local_host_addr + block->length),
737 BITS_TO_LONGS(block->nb_chunks) *
738 sizeof(unsigned long) * 8, block->nb_chunks);
739
740 g_free(old);
741
742 local->nb_blocks--;
743
744 if (local->nb_blocks && rdma->blockmap) {
745 for (x = 0; x < local->nb_blocks; x++) {
746 g_hash_table_insert(rdma->blockmap,
747 (void *)(uintptr_t)local->block[x].offset,
748 &local->block[x]);
749 }
750 }
751
752 return 0;
753}
754
755
756
757
758
759static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
760{
761 struct ibv_port_attr port;
762
763 if (ibv_query_port(verbs, 1, &port)) {
764 error_report("Failed to query port information");
765 return;
766 }
767
768 printf("%s RDMA Device opened: kernel name %s "
769 "uverbs device name %s, "
770 "infiniband_verbs class device path %s, "
771 "infiniband class device path %s, "
772 "transport: (%d) %s\n",
773 who,
774 verbs->device->name,
775 verbs->device->dev_name,
776 verbs->device->dev_path,
777 verbs->device->ibdev_path,
778 port.link_layer,
779 (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
780 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
781 ? "Ethernet" : "Unknown"));
782}
783
784
785
786
787
788
789static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
790{
791 char sgid[33];
792 char dgid[33];
793 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
794 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
795 trace_qemu_rdma_dump_gid(who, sgid, dgid);
796}
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
842{
843
844#ifdef CONFIG_LINUX
845 struct ibv_port_attr port_attr;
846
847
848
849
850
851
852
853
854
855
856 if (!verbs) {
857 int num_devices, x;
858 struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
859 bool roce_found = false;
860 bool ib_found = false;
861
862 for (x = 0; x < num_devices; x++) {
863 verbs = ibv_open_device(dev_list[x]);
864 if (!verbs) {
865 if (errno == EPERM) {
866 continue;
867 } else {
868 return -EINVAL;
869 }
870 }
871
872 if (ibv_query_port(verbs, 1, &port_attr)) {
873 ibv_close_device(verbs);
874 ERROR(errp, "Could not query initial IB port");
875 return -EINVAL;
876 }
877
878 if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
879 ib_found = true;
880 } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
881 roce_found = true;
882 }
883
884 ibv_close_device(verbs);
885
886 }
887
888 if (roce_found) {
889 if (ib_found) {
            fprintf(stderr, "WARN: migrations may fail:"
                            " IPv6 over RoCE / iWARP in linux"
                            " is broken. But since you appear to have a"
                            " mixed RoCE / IB environment, be sure to only"
                            " migrate over the IB fabric until the kernel"
                            " fixes the bug.\n");
896 } else {
897 ERROR(errp, "You only have RoCE / iWARP devices in your systems"
898 " and your management software has specified '[::]'"
899 ", but IPv6 over RoCE / iWARP is not supported in Linux.");
900 return -ENONET;
901 }
902 }
903
904 return 0;
905 }
906
907
908
909
910
911
912
913
914 if (ibv_query_port(verbs, 1, &port_attr)) {
915 ERROR(errp, "Could not query initial IB port");
916 return -EINVAL;
917 }
918
919 if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
920 ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
921 "(but patches on linux-rdma in progress)");
922 return -ENONET;
923 }
924
925#endif
926
927 return 0;
928}
929
930
931
932
933
934
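/*
 * Source side: create the connection-manager id, resolve the destination
 * address (trying each rdma_getaddrinfo() result in turn) and then the
 * route to it, leaving rdma->verbs ready for PD/CQ/QP creation.
 */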
935static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
936{
937 int ret;
938 struct rdma_addrinfo *res;
939 char port_str[16];
940 struct rdma_cm_event *cm_event;
941 char ip[40] = "unknown";
942 struct rdma_addrinfo *e;
943
944 if (rdma->host == NULL || !strcmp(rdma->host, "")) {
945 ERROR(errp, "RDMA hostname has not been set");
946 return -EINVAL;
947 }
948
949
950 rdma->channel = rdma_create_event_channel();
951 if (!rdma->channel) {
952 ERROR(errp, "could not create CM channel");
953 return -EINVAL;
954 }
955
956
957 ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
958 if (ret) {
959 ERROR(errp, "could not create channel id");
960 goto err_resolve_create_id;
961 }
962
963 snprintf(port_str, 16, "%d", rdma->port);
964 port_str[15] = '\0';
965
966 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
967 if (ret < 0) {
968 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
969 goto err_resolve_get_addr;
970 }
971
972 for (e = res; e != NULL; e = e->ai_next) {
973 inet_ntop(e->ai_family,
974 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
975 trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
976
977 ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
978 RDMA_RESOLVE_TIMEOUT_MS);
979 if (!ret) {
980 if (e->ai_family == AF_INET6) {
981 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
982 if (ret) {
983 continue;
984 }
985 }
986 goto route;
987 }
988 }
989
990 ERROR(errp, "could not resolve address %s", rdma->host);
991 goto err_resolve_get_addr;
992
993route:
994 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
995
996 ret = rdma_get_cm_event(rdma->channel, &cm_event);
997 if (ret) {
998 ERROR(errp, "could not perform event_addr_resolved");
999 goto err_resolve_get_addr;
1000 }
1001
1002 if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
1003 ERROR(errp, "result not equal to event_addr_resolved %s",
1004 rdma_event_str(cm_event->event));
1005 perror("rdma_resolve_addr");
1006 rdma_ack_cm_event(cm_event);
1007 ret = -EINVAL;
1008 goto err_resolve_get_addr;
1009 }
1010 rdma_ack_cm_event(cm_event);
1011
1012
1013 ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
1014 if (ret) {
1015 ERROR(errp, "could not resolve rdma route");
1016 goto err_resolve_get_addr;
1017 }
1018
1019 ret = rdma_get_cm_event(rdma->channel, &cm_event);
1020 if (ret) {
1021 ERROR(errp, "could not perform event_route_resolved");
1022 goto err_resolve_get_addr;
1023 }
1024 if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
1025 ERROR(errp, "result not equal to event_route_resolved: %s",
1026 rdma_event_str(cm_event->event));
1027 rdma_ack_cm_event(cm_event);
1028 ret = -EINVAL;
1029 goto err_resolve_get_addr;
1030 }
1031 rdma_ack_cm_event(cm_event);
1032 rdma->verbs = rdma->cm_id->verbs;
1033 qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1034 qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1035 return 0;
1036
1037err_resolve_get_addr:
1038 rdma_destroy_id(rdma->cm_id);
1039 rdma->cm_id = NULL;
1040err_resolve_create_id:
1041 rdma_destroy_event_channel(rdma->channel);
1042 rdma->channel = NULL;
1043 return ret;
1044}
1045
1046
1047
1048
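/* Allocate the protection domain, completion channel and completion queue. */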
1049static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
1050{
1051
1052 rdma->pd = ibv_alloc_pd(rdma->verbs);
1053 if (!rdma->pd) {
1054 error_report("failed to allocate protection domain");
1055 return -1;
1056 }
1057
1058
1059 rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
1060 if (!rdma->comp_channel) {
1061 error_report("failed to allocate completion channel");
1062 goto err_alloc_pd_cq;
1063 }
1064
1065
1066
1067
1068
1069 rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1070 NULL, rdma->comp_channel, 0);
1071 if (!rdma->cq) {
1072 error_report("failed to allocate completion queue");
1073 goto err_alloc_pd_cq;
1074 }
1075
1076 return 0;
1077
1078err_alloc_pd_cq:
1079 if (rdma->pd) {
1080 ibv_dealloc_pd(rdma->pd);
1081 }
1082 if (rdma->comp_channel) {
1083 ibv_destroy_comp_channel(rdma->comp_channel);
1084 }
1085 rdma->pd = NULL;
1086 rdma->comp_channel = NULL;
1087 return -1;
1088
1089}
1090
1091
1092
1093
1094static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1095{
1096 struct ibv_qp_init_attr attr = { 0 };
1097 int ret;
1098
1099 attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1100 attr.cap.max_recv_wr = 3;
1101 attr.cap.max_send_sge = 1;
1102 attr.cap.max_recv_sge = 1;
1103 attr.send_cq = rdma->cq;
1104 attr.recv_cq = rdma->cq;
1105 attr.qp_type = IBV_QPT_RC;
1106
1107 ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1108 if (ret) {
1109 return -1;
1110 }
1111
1112 rdma->qp = rdma->cm_id->qp;
1113 return 0;
1114}
1115
1116static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
1117{
1118 int i;
1119 RDMALocalBlocks *local = &rdma->local_ram_blocks;
1120
1121 for (i = 0; i < local->nb_blocks; i++) {
1122 local->block[i].mr =
1123 ibv_reg_mr(rdma->pd,
1124 local->block[i].local_host_addr,
1125 local->block[i].length,
1126 IBV_ACCESS_LOCAL_WRITE |
1127 IBV_ACCESS_REMOTE_WRITE
1128 );
1129 if (!local->block[i].mr) {
1130 perror("Failed to register local dest ram block!\n");
1131 break;
1132 }
1133 rdma->total_registrations++;
1134 }
1135
1136 if (i >= local->nb_blocks) {
1137 return 0;
1138 }
1139
1140 for (i--; i >= 0; i--) {
1141 ibv_dereg_mr(local->block[i].mr);
1142 rdma->total_registrations--;
1143 }
1144
1145 return -1;
1146
1147}
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
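/*
 * Look up the RAM block containing (block_offset + offset) in the blockmap
 * hash table and return its index together with the chunk index of the
 * address inside that block.
 */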
1158static int qemu_rdma_search_ram_block(RDMAContext *rdma,
1159 uintptr_t block_offset,
1160 uint64_t offset,
1161 uint64_t length,
1162 uint64_t *block_index,
1163 uint64_t *chunk_index)
1164{
1165 uint64_t current_addr = block_offset + offset;
1166 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1167 (void *) block_offset);
1168 assert(block);
1169 assert(current_addr >= block->offset);
1170 assert((current_addr + length) <= (block->offset + block->length));
1171
1172 *block_index = block->index;
1173 *chunk_index = ram_chunk_index(block->local_host_addr,
1174 block->local_host_addr + (current_addr - block->offset));
1175
1176 return 0;
1177}
1178
1179
1180
1181
1182
1183
1184
1185
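/*
 * Return the lkey (and optionally rkey) needed to access host_addr.  When
 * the whole block is already pinned its MR is used directly; otherwise the
 * containing chunk is registered on demand and cached in block->pmr[].
 */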
1186static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
1187 RDMALocalBlock *block, uintptr_t host_addr,
1188 uint32_t *lkey, uint32_t *rkey, int chunk,
1189 uint8_t *chunk_start, uint8_t *chunk_end)
1190{
1191 if (block->mr) {
1192 if (lkey) {
1193 *lkey = block->mr->lkey;
1194 }
1195 if (rkey) {
1196 *rkey = block->mr->rkey;
1197 }
1198 return 0;
1199 }
1200
1201
1202 if (!block->pmr) {
1203 block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1204 }
1205
1206
1207
1208
1209
1210
1211 if (!block->pmr[chunk]) {
1212 uint64_t len = chunk_end - chunk_start;
1213
1214 trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1215
1216 block->pmr[chunk] = ibv_reg_mr(rdma->pd,
1217 chunk_start, len,
1218 (rkey ? (IBV_ACCESS_LOCAL_WRITE |
1219 IBV_ACCESS_REMOTE_WRITE) : 0));
1220
1221 if (!block->pmr[chunk]) {
1222 perror("Failed to register chunk!");
1223 fprintf(stderr, "Chunk details: block: %d chunk index %d"
1224 " start %" PRIuPTR " end %" PRIuPTR
1225 " host %" PRIuPTR
1226 " local %" PRIuPTR " registrations: %d\n",
1227 block->index, chunk, (uintptr_t)chunk_start,
1228 (uintptr_t)chunk_end, host_addr,
1229 (uintptr_t)block->local_host_addr,
1230 rdma->total_registrations);
1231 return -1;
1232 }
1233 rdma->total_registrations++;
1234 }
1235
1236 if (lkey) {
1237 *lkey = block->pmr[chunk]->lkey;
1238 }
1239 if (rkey) {
1240 *rkey = block->pmr[chunk]->rkey;
1241 }
1242 return 0;
1243}
1244
1245
1246
1247
1248
1249static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1250{
1251 rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1252 rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1253 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1254 if (rdma->wr_data[idx].control_mr) {
1255 rdma->total_registrations++;
1256 return 0;
1257 }
1258 error_report("qemu_rdma_reg_control failed");
1259 return -1;
1260}
1261
1262const char *print_wrid(int wrid)
1263{
1264 if (wrid >= RDMA_WRID_RECV_CONTROL) {
1265 return wrid_desc[RDMA_WRID_RECV_CONTROL];
1266 }
1267 return wrid_desc[wrid];
1268}
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
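/*
 * Drain the queue of chunks flagged for unregistration: deregister each
 * chunk that is no longer in transit and tell the destination to drop its
 * mapping with an UNREGISTER_REQUEST message.
 */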
1305static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1306{
1307 while (rdma->unregistrations[rdma->unregister_current]) {
1308 int ret;
1309 uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1310 uint64_t chunk =
1311 (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1312 uint64_t index =
1313 (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1314 RDMALocalBlock *block =
1315 &(rdma->local_ram_blocks.block[index]);
1316 RDMARegister reg = { .current_index = index };
1317 RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1318 };
1319 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1320 .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1321 .repeat = 1,
1322 };
1323
1324 trace_qemu_rdma_unregister_waiting_proc(chunk,
1325 rdma->unregister_current);
1326
1327 rdma->unregistrations[rdma->unregister_current] = 0;
1328 rdma->unregister_current++;
1329
1330 if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1331 rdma->unregister_current = 0;
1332 }
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342 clear_bit(chunk, block->unregister_bitmap);
1343
1344 if (test_bit(chunk, block->transit_bitmap)) {
1345 trace_qemu_rdma_unregister_waiting_inflight(chunk);
1346 continue;
1347 }
1348
1349 trace_qemu_rdma_unregister_waiting_send(chunk);
1350
1351 ret = ibv_dereg_mr(block->pmr[chunk]);
1352 block->pmr[chunk] = NULL;
1353 block->remote_keys[chunk] = 0;
1354
1355 if (ret != 0) {
1356 perror("unregistration chunk failed");
1357 return -ret;
1358 }
1359 rdma->total_registrations--;
1360
1361 reg.key.chunk = chunk;
        register_to_network(rdma, &reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                &resp, NULL, NULL);
1365 if (ret < 0) {
1366 return ret;
1367 }
1368
1369 trace_qemu_rdma_unregister_waiting_complete(chunk);
1370 }
1371
1372 return 0;
1373}
1374
1375static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1376 uint64_t chunk)
1377{
1378 uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1379
1380 result |= (index << RDMA_WRID_BLOCK_SHIFT);
1381 result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1382
1383 return result;
1384}
1385
1386
1387
1388
1389
1390static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1391 uint64_t chunk, uint64_t wr_id)
1392{
1393 if (rdma->unregistrations[rdma->unregister_next] != 0) {
1394 error_report("rdma migration: queue is full");
1395 } else {
1396 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1397
1398 if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1399 trace_qemu_rdma_signal_unregister_append(chunk,
1400 rdma->unregister_next);
1401
1402 rdma->unregistrations[rdma->unregister_next++] =
1403 qemu_rdma_make_wrid(wr_id, index, chunk);
1404
1405 if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1406 rdma->unregister_next = 0;
1407 }
1408 } else {
1409 trace_qemu_rdma_signal_unregister_already(chunk);
1410 }
1411 }
1412}
1413
1414
1415
1416
1417
1418
static int qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
                          uint32_t *byte_len)
{
1422 int ret;
1423 struct ibv_wc wc;
1424 uint64_t wr_id;
1425
1426 ret = ibv_poll_cq(rdma->cq, 1, &wc);
1427
1428 if (!ret) {
1429 *wr_id_out = RDMA_WRID_NONE;
1430 return 0;
1431 }
1432
1433 if (ret < 0) {
1434 error_report("ibv_poll_cq return %d", ret);
1435 return ret;
1436 }
1437
1438 wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1439
1440 if (wc.status != IBV_WC_SUCCESS) {
1441 fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1442 wc.status, ibv_wc_status_str(wc.status));
1443 fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1444
1445 return -1;
1446 }
1447
1448 if (rdma->control_ready_expected &&
1449 (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1450 trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
1451 wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
1452 rdma->control_ready_expected = 0;
1453 }
1454
1455 if (wr_id == RDMA_WRID_RDMA_WRITE) {
1456 uint64_t chunk =
1457 (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1458 uint64_t index =
1459 (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1460 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1461
1462 trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
1463 index, chunk, block->local_host_addr,
1464 (void *)(uintptr_t)block->remote_host_addr);
1465
1466 clear_bit(chunk, block->transit_bitmap);
1467
1468 if (rdma->nb_sent > 0) {
1469 rdma->nb_sent--;
1470 }
1471
1472 if (!rdma->pin_all) {
1473
1474
1475
1476
1477
1478
1479#ifdef RDMA_UNREGISTRATION_EXAMPLE
1480 qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
1481#endif
1482 }
1483 } else {
1484 trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
1485 }
1486
1487 *wr_id_out = wc.wr_id;
1488 if (byte_len) {
1489 *byte_len = wc.byte_len;
1490 }
1491
1492 return 0;
1493}
1494
1495
1496
1497
1498static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
1499{
1500 struct rdma_cm_event *cm_event;
1501 int ret = -1;
1502
1503
1504
1505
1506
1507 if (rdma->migration_started_on_destination &&
1508 migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
1509 yield_until_fd_readable(rdma->comp_channel->fd);
1510 } else {
1511
1512
1513
1514
1515
1516
1517
1518 while (!rdma->error_state && !rdma->received_error) {
1519 GPollFD pfds[2];
1520 pfds[0].fd = rdma->comp_channel->fd;
1521 pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1522 pfds[0].revents = 0;
1523
1524 pfds[1].fd = rdma->channel->fd;
1525 pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1526 pfds[1].revents = 0;
1527
1528
1529 switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
1530 case 2:
1531 case 1:
1532 if (pfds[0].revents) {
1533 return 0;
1534 }
1535
                if (pfds[1].revents) {
                    ret = rdma_get_cm_event(rdma->channel, &cm_event);
                    if (ret) {
                        error_report("failed to get cm event while waiting "
                                     "on the completion channel");
                        return -EPIPE;
                    }

                    error_report("received cm event %d while waiting on the "
                                 "completion channel", cm_event->event);
                    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
                        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
                        rdma_ack_cm_event(cm_event);
                        return -EPIPE;
                    }
                    rdma_ack_cm_event(cm_event);
                }
1549 break;
1550
1551 case 0:
1552 break;
1553
1554 default:
1555
1556
1557 error_report("%s: poll failed", __func__);
1558 return -EPIPE;
1559 }
1560
1561 if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
1562
1563 return -EPIPE;
1564 }
1565 }
1566 }
1567
1568 if (rdma->received_error) {
1569 return -EPIPE;
1570 }
1571 return rdma->error_state;
1572}
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
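/*
 * Block until a work request of the requested type completes: poll the CQ
 * first, then sleep on the completion channel and poll again after each
 * event, acknowledging the accumulated CQ events on the way out.
 */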
1587static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
1588 uint32_t *byte_len)
1589{
1590 int num_cq_events = 0, ret = 0;
1591 struct ibv_cq *cq;
1592 void *cq_ctx;
1593 uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1594
1595 if (ibv_req_notify_cq(rdma->cq, 0)) {
1596 return -1;
1597 }
1598
1599 while (wr_id != wrid_requested) {
1600 ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1601 if (ret < 0) {
1602 return ret;
1603 }
1604
1605 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1606
1607 if (wr_id == RDMA_WRID_NONE) {
1608 break;
1609 }
1610 if (wr_id != wrid_requested) {
1611 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1612 wrid_requested, print_wrid(wr_id), wr_id);
1613 }
1614 }
1615
1616 if (wr_id == wrid_requested) {
1617 return 0;
1618 }
1619
1620 while (1) {
1621 ret = qemu_rdma_wait_comp_channel(rdma);
1622 if (ret) {
1623 goto err_block_for_wrid;
1624 }
1625
1626 ret = ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx);
1627 if (ret) {
1628 perror("ibv_get_cq_event");
1629 goto err_block_for_wrid;
1630 }
1631
1632 num_cq_events++;
1633
1634 ret = -ibv_req_notify_cq(cq, 0);
1635 if (ret) {
1636 goto err_block_for_wrid;
1637 }
1638
1639 while (wr_id != wrid_requested) {
1640 ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1641 if (ret < 0) {
1642 goto err_block_for_wrid;
1643 }
1644
1645 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1646
1647 if (wr_id == RDMA_WRID_NONE) {
1648 break;
1649 }
1650 if (wr_id != wrid_requested) {
1651 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1652 wrid_requested, print_wrid(wr_id), wr_id);
1653 }
1654 }
1655
1656 if (wr_id == wrid_requested) {
1657 goto success_block_for_wrid;
1658 }
1659 }
1660
1661success_block_for_wrid:
1662 if (num_cq_events) {
1663 ibv_ack_cq_events(cq, num_cq_events);
1664 }
1665 return 0;
1666
1667err_block_for_wrid:
1668 if (num_cq_events) {
1669 ibv_ack_cq_events(cq, num_cq_events);
1670 }
1671
1672 rdma->error_state = ret;
1673 return ret;
1674}
1675
1676
1677
1678
1679
1680static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1681 RDMAControlHeader *head)
1682{
1683 int ret = 0;
1684 RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1685 struct ibv_send_wr *bad_wr;
1686 struct ibv_sge sge = {
1687 .addr = (uintptr_t)(wr->control),
1688 .length = head->len + sizeof(RDMAControlHeader),
1689 .lkey = wr->control_mr->lkey,
1690 };
1691 struct ibv_send_wr send_wr = {
1692 .wr_id = RDMA_WRID_SEND_CONTROL,
1693 .opcode = IBV_WR_SEND,
1694 .send_flags = IBV_SEND_SIGNALED,
1695 .sg_list = &sge,
1696 .num_sge = 1,
1697 };
1698
1699 trace_qemu_rdma_post_send_control(control_desc(head->type));
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709 assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1710 memcpy(wr->control, head, sizeof(RDMAControlHeader));
1711 control_to_network((void *) wr->control);
1712
1713 if (buf) {
1714 memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1715 }
1716
1717
1718 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1719
    if (ret > 0) {
        error_report("Failed to post IB SEND for control buffer");
        return -ret;
    }
1724
1725 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1726 if (ret < 0) {
1727 error_report("rdma migration: send polling control error");
1728 }
1729
1730 return ret;
1731}
1732
1733
1734
1735
1736
1737static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1738{
1739 struct ibv_recv_wr *bad_wr;
1740 struct ibv_sge sge = {
1741 .addr = (uintptr_t)(rdma->wr_data[idx].control),
1742 .length = RDMA_CONTROL_MAX_BUFFER,
1743 .lkey = rdma->wr_data[idx].control_mr->lkey,
1744 };
1745
1746 struct ibv_recv_wr recv_wr = {
1747 .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1748 .sg_list = &sge,
1749 .num_sge = 1,
1750 };
1751
1752
1753 if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1754 return -1;
1755 }
1756
1757 return 0;
1758}
1759
1760
1761
1762
1763static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1764 RDMAControlHeader *head, int expecting, int idx)
1765{
1766 uint32_t byte_len;
1767 int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1768 &byte_len);
1769
1770 if (ret < 0) {
1771 error_report("rdma migration: recv polling control error!");
1772 return ret;
1773 }
1774
1775 network_to_control((void *) rdma->wr_data[idx].control);
1776 memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1777
1778 trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1779
1780 if (expecting == RDMA_CONTROL_NONE) {
1781 trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1782 head->type);
1783 } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1784 error_report("Was expecting a %s (%d) control message"
1785 ", but got: %s (%d), length: %d",
1786 control_desc(expecting), expecting,
1787 control_desc(head->type), head->type, head->len);
1788 if (head->type == RDMA_CONTROL_ERROR) {
1789 rdma->received_error = true;
1790 }
1791 return -EIO;
1792 }
1793 if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
1794 error_report("too long length: %d", head->len);
1795 return -EINVAL;
1796 }
1797 if (sizeof(*head) + head->len != byte_len) {
1798 error_report("Malformed length: %d byte_len %d", head->len, byte_len);
1799 return -EINVAL;
1800 }
1801
1802 return 0;
1803}
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1814 RDMAControlHeader *head)
1815{
1816 rdma->wr_data[idx].control_len = head->len;
1817 rdma->wr_data[idx].control_curr =
1818 rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1819}
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
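/*
 * Send a control message and optionally wait for a response:
 * 1. wait for the peer's READY if one is outstanding,
 * 2. post receives for the anticipated response and the next READY,
 * 3. send the message, then run the optional callback and block until a
 *    response of the expected type arrives.
 */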
1834static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1835 uint8_t *data, RDMAControlHeader *resp,
1836 int *resp_idx,
1837 int (*callback)(RDMAContext *rdma))
1838{
1839 int ret = 0;
1840
1841
1842
1843
1844
1845 if (rdma->control_ready_expected) {
1846 RDMAControlHeader resp;
1847 ret = qemu_rdma_exchange_get_response(rdma,
1848 &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1849 if (ret < 0) {
1850 return ret;
1851 }
1852 }
1853
1854
1855
1856
1857 if (resp) {
1858 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1859 if (ret) {
1860 error_report("rdma migration: error posting"
1861 " extra control recv for anticipated result!");
1862 return ret;
1863 }
1864 }
1865
1866
1867
1868
1869 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1870 if (ret) {
1871 error_report("rdma migration: error posting first control recv!");
1872 return ret;
1873 }
1874
1875
1876
1877
1878 ret = qemu_rdma_post_send_control(rdma, data, head);
1879
1880 if (ret < 0) {
1881 error_report("Failed to send control buffer!");
1882 return ret;
1883 }
1884
1885
1886
1887
1888 if (resp) {
1889 if (callback) {
1890 trace_qemu_rdma_exchange_send_issue_callback();
1891 ret = callback(rdma);
1892 if (ret < 0) {
1893 return ret;
1894 }
1895 }
1896
1897 trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
1898 ret = qemu_rdma_exchange_get_response(rdma, resp,
1899 resp->type, RDMA_WRID_DATA);
1900
1901 if (ret < 0) {
1902 return ret;
1903 }
1904
1905 qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1906 if (resp_idx) {
1907 *resp_idx = RDMA_WRID_DATA;
1908 }
1909 trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
1910 }
1911
1912 rdma->control_ready_expected = 1;
1913
1914 return 0;
1915}
1916
1917
1918
1919
1920
1921static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1922 int expecting)
1923{
1924 RDMAControlHeader ready = {
1925 .len = 0,
1926 .type = RDMA_CONTROL_READY,
1927 .repeat = 1,
1928 };
1929 int ret;
1930
1931
1932
1933
1934 ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
1935
1936 if (ret < 0) {
1937 error_report("Failed to send control buffer!");
1938 return ret;
1939 }
1940
1941
1942
1943
1944 ret = qemu_rdma_exchange_get_response(rdma, head,
1945 expecting, RDMA_WRID_READY);
1946
1947 if (ret < 0) {
1948 return ret;
1949 }
1950
1951 qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1952
1953
1954
1955
1956 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1957 if (ret) {
1958 error_report("rdma migration: error posting second control recv!");
1959 return ret;
1960 }
1961
1962 return 0;
1963}
1964
1965
1966
1967
1968
1969
1970
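/*
 * Issue one RDMA write for the currently coalesced range.  If the chunk is
 * still in transit, wait for it; if it is not yet registered remotely,
 * either announce it as all-zero (COMPRESS) or ask the destination to
 * register it before posting the write.
 */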
1971static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
1972 int current_index, uint64_t current_addr,
1973 uint64_t length)
1974{
1975 struct ibv_sge sge;
1976 struct ibv_send_wr send_wr = { 0 };
1977 struct ibv_send_wr *bad_wr;
1978 int reg_result_idx, ret, count = 0;
1979 uint64_t chunk, chunks;
1980 uint8_t *chunk_start, *chunk_end;
1981 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
1982 RDMARegister reg;
1983 RDMARegisterResult *reg_result;
1984 RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
1985 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1986 .type = RDMA_CONTROL_REGISTER_REQUEST,
1987 .repeat = 1,
1988 };
1989
1990retry:
1991 sge.addr = (uintptr_t)(block->local_host_addr +
1992 (current_addr - block->offset));
1993 sge.length = length;
1994
1995 chunk = ram_chunk_index(block->local_host_addr,
1996 (uint8_t *)(uintptr_t)sge.addr);
1997 chunk_start = ram_chunk_start(block, chunk);
1998
1999 if (block->is_ram_block) {
2000 chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
2001
2002 if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2003 chunks--;
2004 }
2005 } else {
2006 chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
2007
2008 if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2009 chunks--;
2010 }
2011 }
2012
2013 trace_qemu_rdma_write_one_top(chunks + 1,
2014 (chunks + 1) *
2015 (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
2016
2017 chunk_end = ram_chunk_end(block, chunk + chunks);
2018
2019 if (!rdma->pin_all) {
2020#ifdef RDMA_UNREGISTRATION_EXAMPLE
2021 qemu_rdma_unregister_waiting(rdma);
2022#endif
2023 }
2024
2025 while (test_bit(chunk, block->transit_bitmap)) {
2026 (void)count;
2027 trace_qemu_rdma_write_one_block(count++, current_index, chunk,
2028 sge.addr, length, rdma->nb_sent, block->nb_chunks);
2029
2030 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2031
2032 if (ret < 0) {
2033 error_report("Failed to Wait for previous write to complete "
2034 "block %d chunk %" PRIu64
2035 " current %" PRIu64 " len %" PRIu64 " %d",
2036 current_index, chunk, sge.addr, length, rdma->nb_sent);
2037 return ret;
2038 }
2039 }
2040
2041 if (!rdma->pin_all || !block->is_ram_block) {
2042 if (!block->remote_keys[chunk]) {
2043
2044
2045
2046
2047
2048
2049 if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2050 RDMACompress comp = {
2051 .offset = current_addr,
2052 .value = 0,
2053 .block_idx = current_index,
2054 .length = length,
2055 };
2056
2057 head.len = sizeof(comp);
2058 head.type = RDMA_CONTROL_COMPRESS;
2059
2060 trace_qemu_rdma_write_one_zero(chunk, sge.length,
2061 current_index, current_addr);
2062
2063 compress_to_network(rdma, &comp);
2064 ret = qemu_rdma_exchange_send(rdma, &head,
2065 (uint8_t *) &comp, NULL, NULL, NULL);
2066
2067 if (ret < 0) {
2068 return -EIO;
2069 }
2070
2071 acct_update_position(f, sge.length, true);
2072
2073 return 1;
2074 }
2075
2076
2077
2078
2079 reg.current_index = current_index;
2080 if (block->is_ram_block) {
2081 reg.key.current_addr = current_addr;
2082 } else {
2083 reg.key.chunk = chunk;
2084 }
2085 reg.chunks = chunks;
2086
2087 trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2088 current_addr);
2089
            register_to_network(rdma, &reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                    &resp, &reg_result_idx, NULL);
2093 if (ret < 0) {
2094 return ret;
2095 }
2096
2097
2098 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2099 &sge.lkey, NULL, chunk,
2100 chunk_start, chunk_end)) {
2101 error_report("cannot get lkey");
2102 return -EINVAL;
2103 }
2104
2105 reg_result = (RDMARegisterResult *)
2106 rdma->wr_data[reg_result_idx].control_curr;
2107
2108 network_to_result(reg_result);
2109
2110 trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2111 reg_result->rkey, chunk);
2112
2113 block->remote_keys[chunk] = reg_result->rkey;
2114 block->remote_host_addr = reg_result->host_addr;
2115 } else {
2116
2117 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2118 &sge.lkey, NULL, chunk,
2119 chunk_start, chunk_end)) {
2120 error_report("cannot get lkey!");
2121 return -EINVAL;
2122 }
2123 }
2124
2125 send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2126 } else {
2127 send_wr.wr.rdma.rkey = block->remote_rkey;
2128
2129 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2130 &sge.lkey, NULL, chunk,
2131 chunk_start, chunk_end)) {
2132 error_report("cannot get lkey!");
2133 return -EINVAL;
2134 }
2135 }
2136
2137
2138
2139
2140
2141
2142
2143 send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2144 current_index, chunk);
2145
2146 send_wr.opcode = IBV_WR_RDMA_WRITE;
2147 send_wr.send_flags = IBV_SEND_SIGNALED;
2148 send_wr.sg_list = &sge;
2149 send_wr.num_sge = 1;
2150 send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2151 (current_addr - block->offset);
2152
2153 trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2154 sge.length);
2155
2156
2157
2158
2159
2160 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2161
2162 if (ret == ENOMEM) {
2163 trace_qemu_rdma_write_one_queue_full();
2164 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2165 if (ret < 0) {
2166 error_report("rdma migration: failed to make "
2167 "room in full send queue! %d", ret);
2168 return ret;
2169 }
2170
2171 goto retry;
2172
2173 } else if (ret > 0) {
2174 perror("rdma migration: post rdma write failed");
2175 return -ret;
2176 }
2177
2178 set_bit(chunk, block->transit_bitmap);
2179 acct_update_position(f, sge.length, false);
2180 rdma->total_writes++;
2181
2182 return 0;
2183}
2184
2185
2186
2187
2188
2189
2190
2191static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2192{
2193 int ret;
2194
2195 if (!rdma->current_length) {
2196 return 0;
2197 }
2198
2199 ret = qemu_rdma_write_one(f, rdma,
2200 rdma->current_index, rdma->current_addr, rdma->current_length);
2201
2202 if (ret < 0) {
2203 return ret;
2204 }
2205
2206 if (ret == 0) {
2207 rdma->nb_sent++;
2208 trace_qemu_rdma_write_flush(rdma->nb_sent);
2209 }
2210
2211 rdma->current_length = 0;
2212 rdma->current_addr = 0;
2213
2214 return 0;
2215}
2216
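/*
 * A new page can be merged into the pending write only if it directly
 * follows the current range and stays within the same block and chunk.
 */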
2217static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2218 uint64_t offset, uint64_t len)
2219{
2220 RDMALocalBlock *block;
2221 uint8_t *host_addr;
2222 uint8_t *chunk_end;
2223
2224 if (rdma->current_index < 0) {
2225 return 0;
2226 }
2227
2228 if (rdma->current_chunk < 0) {
2229 return 0;
2230 }
2231
2232 block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2233 host_addr = block->local_host_addr + (offset - block->offset);
2234 chunk_end = ram_chunk_end(block, rdma->current_chunk);
2235
2236 if (rdma->current_length == 0) {
2237 return 0;
2238 }
2239
2240
2241
2242
2243 if (offset != (rdma->current_addr + rdma->current_length)) {
2244 return 0;
2245 }
2246
2247 if (offset < block->offset) {
2248 return 0;
2249 }
2250
2251 if ((offset + len) > (block->offset + block->length)) {
2252 return 0;
2253 }
2254
2255 if ((host_addr + len) > chunk_end) {
2256 return 0;
2257 }
2258
2259 return 1;
2260}
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
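/*
 * Buffer page writes until they can no longer be merged (or RDMA_MERGE_MAX
 * is reached) and then flush them as a single RDMA write.
 */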
2272static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
2273 uint64_t block_offset, uint64_t offset,
2274 uint64_t len)
2275{
2276 uint64_t current_addr = block_offset + offset;
2277 uint64_t index = rdma->current_index;
2278 uint64_t chunk = rdma->current_chunk;
2279 int ret;
2280
2281
2282 if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
2283 ret = qemu_rdma_write_flush(f, rdma);
2284 if (ret) {
2285 return ret;
2286 }
2287 rdma->current_length = 0;
2288 rdma->current_addr = current_addr;
2289
2290 ret = qemu_rdma_search_ram_block(rdma, block_offset,
2291 offset, len, &index, &chunk);
2292 if (ret) {
2293 error_report("ram block search failed");
2294 return ret;
2295 }
2296 rdma->current_index = index;
2297 rdma->current_chunk = chunk;
2298 }
2299
2300
2301 rdma->current_length += len;
2302
2303
2304 if (rdma->current_length >= RDMA_MERGE_MAX) {
2305 return qemu_rdma_write_flush(f, rdma);
2306 }
2307
2308 return 0;
2309}
2310
2311static void qemu_rdma_cleanup(RDMAContext *rdma)
2312{
2313 int idx;
2314
2315 if (rdma->cm_id && rdma->connected) {
2316 if ((rdma->error_state ||
2317 migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
2318 !rdma->received_error) {
2319 RDMAControlHeader head = { .len = 0,
2320 .type = RDMA_CONTROL_ERROR,
2321 .repeat = 1,
2322 };
2323 error_report("Early error. Sending error.");
2324 qemu_rdma_post_send_control(rdma, NULL, &head);
2325 }
2326
2327 rdma_disconnect(rdma->cm_id);
2328 trace_qemu_rdma_cleanup_disconnect();
2329 rdma->connected = false;
2330 }
2331
2332 if (rdma->channel) {
2333 qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2334 }
2335 g_free(rdma->dest_blocks);
2336 rdma->dest_blocks = NULL;
2337
2338 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2339 if (rdma->wr_data[idx].control_mr) {
2340 rdma->total_registrations--;
2341 ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2342 }
2343 rdma->wr_data[idx].control_mr = NULL;
2344 }
2345
2346 if (rdma->local_ram_blocks.block) {
2347 while (rdma->local_ram_blocks.nb_blocks) {
2348 rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2349 }
2350 }
2351
2352 if (rdma->qp) {
2353 rdma_destroy_qp(rdma->cm_id);
2354 rdma->qp = NULL;
2355 }
2356 if (rdma->cq) {
2357 ibv_destroy_cq(rdma->cq);
2358 rdma->cq = NULL;
2359 }
2360 if (rdma->comp_channel) {
2361 ibv_destroy_comp_channel(rdma->comp_channel);
2362 rdma->comp_channel = NULL;
2363 }
2364 if (rdma->pd) {
2365 ibv_dealloc_pd(rdma->pd);
2366 rdma->pd = NULL;
2367 }
2368 if (rdma->cm_id) {
2369 rdma_destroy_id(rdma->cm_id);
2370 rdma->cm_id = NULL;
2371 }
2372
2373
2374 if (rdma->listen_id) {
2375 if (!rdma->is_return_path) {
2376 rdma_destroy_id(rdma->listen_id);
2377 }
2378 rdma->listen_id = NULL;
2379
2380 if (rdma->channel) {
2381 if (!rdma->is_return_path) {
2382 rdma_destroy_event_channel(rdma->channel);
2383 }
2384 rdma->channel = NULL;
2385 }
2386 }
2387
2388 if (rdma->channel) {
2389 rdma_destroy_event_channel(rdma->channel);
2390 rdma->channel = NULL;
2391 }
2392 g_free(rdma->host);
2393 rdma->host = NULL;
2394}
2395
2396
2397static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2398{
2399 int ret, idx;
2400 Error *local_err = NULL, **temp = &local_err;
2401
2402
2403
2404
2405
2406 rdma->pin_all = pin_all;
2407
2408 ret = qemu_rdma_resolve_host(rdma, temp);
2409 if (ret) {
2410 goto err_rdma_source_init;
2411 }
2412
2413 ret = qemu_rdma_alloc_pd_cq(rdma);
2414 if (ret) {
2415 ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
2416 " limits may be too low. Please check $ ulimit -a # and "
2417 "search for 'ulimit -l' in the output");
2418 goto err_rdma_source_init;
2419 }
2420
2421 ret = qemu_rdma_alloc_qp(rdma);
2422 if (ret) {
2423 ERROR(temp, "rdma migration: error allocating qp!");
2424 goto err_rdma_source_init;
2425 }
2426
2427 ret = qemu_rdma_init_ram_blocks(rdma);
2428 if (ret) {
2429 ERROR(temp, "rdma migration: error initializing ram blocks!");
2430 goto err_rdma_source_init;
2431 }
2432
2433
2434 rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2435 for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
2436 g_hash_table_insert(rdma->blockmap,
2437 (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
2438 &rdma->local_ram_blocks.block[idx]);
2439 }
2440
2441 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2442 ret = qemu_rdma_reg_control(rdma, idx);
2443 if (ret) {
2444 ERROR(temp, "rdma migration: error registering %d control!",
2445 idx);
2446 goto err_rdma_source_init;
2447 }
2448 }
2449
2450 return 0;
2451
2452err_rdma_source_init:
2453 error_propagate(errp, local_err);
2454 qemu_rdma_cleanup(rdma);
2455 return -1;
2456}
2457
2458static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
2459{
2460 RDMACapabilities cap = {
2461 .version = RDMA_CONTROL_VERSION_CURRENT,
2462 .flags = 0,
2463 };
2464 struct rdma_conn_param conn_param = { .initiator_depth = 2,
2465 .retry_count = 5,
2466 .private_data = &cap,
2467 .private_data_len = sizeof(cap),
2468 };
2469 struct rdma_cm_event *cm_event;
2470 int ret;
2471
2472
2473
2474
2475
2476 if (rdma->pin_all) {
2477 trace_qemu_rdma_connect_pin_all_requested();
2478 cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2479 }
2480
2481 caps_to_network(&cap);
2482
2483 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2484 if (ret) {
2485 ERROR(errp, "posting second control recv");
2486 goto err_rdma_source_connect;
2487 }
2488
2489 ret = rdma_connect(rdma->cm_id, &conn_param);
2490 if (ret) {
2491 perror("rdma_connect");
2492 ERROR(errp, "connecting to destination!");
2493 goto err_rdma_source_connect;
2494 }
2495
2496 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2497 if (ret) {
2498 perror("rdma_get_cm_event after rdma_connect");
2499 ERROR(errp, "connecting to destination!");
2500 rdma_ack_cm_event(cm_event);
2501 goto err_rdma_source_connect;
2502 }
2503
2504 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2505 perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2506 ERROR(errp, "connecting to destination!");
2507 rdma_ack_cm_event(cm_event);
2508 goto err_rdma_source_connect;
2509 }
2510 rdma->connected = true;
2511
2512 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2513 network_to_caps(&cap);
2514
2515
2516
2517
2518
2519 if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2520 ERROR(errp, "Server cannot support pinning all memory. "
2521 "Will register memory dynamically.");
2522 rdma->pin_all = false;
2523 }
2524
2525 trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2526
2527 rdma_ack_cm_event(cm_event);
2528
2529 rdma->control_ready_expected = 1;
2530 rdma->nb_sent = 0;
2531 return 0;
2532
2533err_rdma_source_connect:
2534 qemu_rdma_cleanup(rdma);
2535 return -1;
2536}
2537
2538static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2539{
2540 int ret, idx;
2541 struct rdma_cm_id *listen_id;
2542 char ip[40] = "unknown";
2543 struct rdma_addrinfo *res, *e;
2544 char port_str[16];
2545
2546 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2547 rdma->wr_data[idx].control_len = 0;
2548 rdma->wr_data[idx].control_curr = NULL;
2549 }
2550
2551 if (!rdma->host || !rdma->host[0]) {
2552 ERROR(errp, "RDMA host is not set!");
2553 rdma->error_state = -EINVAL;
2554 return -1;
2555 }
2556
2557 rdma->channel = rdma_create_event_channel();
2558 if (!rdma->channel) {
2559 ERROR(errp, "could not create rdma event channel");
2560 rdma->error_state = -EINVAL;
2561 return -1;
2562 }
2563
2564 /* Create the CM id that we will listen on */
2565 ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2566 if (ret) {
2567 ERROR(errp, "could not create cm_id!");
2568 goto err_dest_init_create_listen_id;
2569 }
2570
2571 snprintf(port_str, 16, "%d", rdma->port);
2572 port_str[15] = '\0';
2573
2574 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2575 if (ret < 0) {
2576 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
2577 goto err_dest_init_bind_addr;
2578 }
2579
2580 for (e = res; e != NULL; e = e->ai_next) {
2581 inet_ntop(e->ai_family,
2582 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2583 trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2584 ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2585 if (ret) {
2586 continue;
2587 }
2588 if (e->ai_family == AF_INET6) {
2589 ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
2590 if (ret) {
2591 continue;
2592 }
2593 }
2594 break;
2595 }
2596
2597 if (!e) {
2598 ERROR(errp, "could not rdma_bind_addr!");
2599 goto err_dest_init_bind_addr;
2600 }
2601
2602 rdma->listen_id = listen_id;
2603 qemu_rdma_dump_gid("dest_init", listen_id);
2604 return 0;
2605
2606err_dest_init_bind_addr:
2607 rdma_destroy_id(listen_id);
2608err_dest_init_create_listen_id:
2609 rdma_destroy_event_channel(rdma->channel);
2610 rdma->channel = NULL;
2611 rdma->error_state = ret;
2612 return ret;
2613
2614}
2615
2616static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
2617 RDMAContext *rdma)
2618{
2619 int idx;
2620
2621 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2622 rdma_return_path->wr_data[idx].control_len = 0;
2623 rdma_return_path->wr_data[idx].control_curr = NULL;
2624 }
2625
2626 /* The CM event channel and listen id are shared with the main path */
2627 rdma_return_path->channel = rdma->channel;
2628 rdma_return_path->listen_id = rdma->listen_id;
2629
2630 rdma->return_path = rdma_return_path;
2631 rdma_return_path->return_path = rdma;
2632 rdma_return_path->is_return_path = true;
2633}
2634
2635static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2636{
2637 RDMAContext *rdma = NULL;
2638 InetSocketAddress *addr;
2639
2640 if (host_port) {
2641 rdma = g_new0(RDMAContext, 1);
2642 rdma->current_index = -1;
2643 rdma->current_chunk = -1;
2644
2645 addr = g_new(InetSocketAddress, 1);
2646 if (!inet_parse(addr, host_port, NULL)) {
2647 rdma->port = atoi(addr->port);
2648 rdma->host = g_strdup(addr->host);
2649 } else {
2650 ERROR(errp, "bad RDMA migration address '%s'", host_port);
2651 g_free(rdma);
2652 rdma = NULL;
2653 }
2654
2655 qapi_free_InetSocketAddress(addr);
2656 }
2657
2658 return rdma;
2659}
2660
2661/*
2662 * QEMUFile interface to the control channel.
2663 * SEND messages carry control traffic only; the VM's RAM pages are
2664 * transferred separately as RDMA writes.
2665 */
2666static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
2667 const struct iovec *iov,
2668 size_t niov,
2669 int *fds,
2670 size_t nfds,
2671 Error **errp)
2672{
2673 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2674 QEMUFile *f = rioc->file;
2675 RDMAContext *rdma;
2676 int ret;
2677 ssize_t done = 0;
2678 size_t i;
2679 size_t len = 0;
2680
2681 RCU_READ_LOCK_GUARD();
2682 rdma = atomic_rcu_read(&rioc->rdmaout);
2683
2684 if (!rdma) {
2685 return -EIO;
2686 }
2687
2688 CHECK_ERROR_STATE();
2689
2690 /*
2691 * Push out any RAM writes that are still queued up before
2692 * sending control data over the wire.
2693 */
2694 ret = qemu_rdma_write_flush(f, rdma);
2695 if (ret < 0) {
2696 rdma->error_state = ret;
2697 return ret;
2698 }
2699
2700 for (i = 0; i < niov; i++) {
2701 size_t remaining = iov[i].iov_len;
2702 uint8_t * data = (void *)iov[i].iov_base;
2703 while (remaining) {
2704 RDMAControlHeader head;
2705
2706 len = MIN(remaining, RDMA_SEND_INCREMENT);
2707 remaining -= len;
2708
2709 head.len = len;
2710 head.type = RDMA_CONTROL_QEMU_FILE;
2711
2712 ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2713
2714 if (ret < 0) {
2715 rdma->error_state = ret;
2716 return ret;
2717 }
2718
2719 data += len;
2720 done += len;
2721 }
2722 }
2723
2724 return done;
2725}
2726
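/*
 * Copy up to 'size' bytes of buffered control-channel data (slot 'idx')
 * into 'buf', advancing the buffer cursor; returns the number of bytes
 * copied, which may be zero if nothing is buffered.
 */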
2727static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2728 size_t size, int idx)
2729{
2730 size_t len = 0;
2731
2732 if (rdma->wr_data[idx].control_len) {
2733 trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2734
2735 len = MIN(size, rdma->wr_data[idx].control_len);
2736 memcpy(buf, rdma->wr_data[idx].control_curr, len);
2737 rdma->wr_data[idx].control_curr += len;
2738 rdma->wr_data[idx].control_len -= len;
2739 }
2740
2741 return len;
2742}
2743
2744/*
2745 * QEMUFile interface to the control channel.
2746 * RDMA links are not bytestreams, so incoming bytes are handed back to
2747 * QEMUFile opportunistically from buffered control messages.
2748 */
2749static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
2750 const struct iovec *iov,
2751 size_t niov,
2752 int **fds,
2753 size_t *nfds,
2754 Error **errp)
2755{
2756 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2757 RDMAContext *rdma;
2758 RDMAControlHeader head;
2759 int ret = 0;
2760 ssize_t i;
2761 size_t done = 0;
2762
2763 RCU_READ_LOCK_GUARD();
2764 rdma = atomic_rcu_read(&rioc->rdmain);
2765
2766 if (!rdma) {
2767 return -EIO;
2768 }
2769
2770 CHECK_ERROR_STATE();
2771
2772 for (i = 0; i < niov; i++) {
2773 size_t want = iov[i].iov_len;
2774 uint8_t *data = (void *)iov[i].iov_base;
2775
2776 /*
2777 * First hand out whatever is left over from the previous control
2778 * message before waiting for a new one to arrive.
2779 */
2780
2781 ret = qemu_rdma_fill(rdma, data, want, 0);
2782 done += ret;
2783 want -= ret;
2784
2785 if (want == 0) {
2786 continue;
2787 }
2788
2789
2790 /* We already have some bytes to hand back; don't block waiting for more */
2791 if (done > 0) {
2792 break;
2793 }
2794
2795 /*
2796 * Nothing is buffered at all: block until the next QEMU_FILE
2797 * control message arrives from the peer.
2798 */
2799 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2800
2801 if (ret < 0) {
2802 rdma->error_state = ret;
2803 return ret;
2804 }
2805
2806 /*
2807 * A new control message arrived; copy out as much as fits.
2808 */
2809 ret = qemu_rdma_fill(rdma, data, want, 0);
2810 done += ret;
2811 want -= ret;
2812
2813
2814 if (want) {
2815 if (done == 0) {
2816 return QIO_CHANNEL_ERR_BLOCK;
2817 } else {
2818 break;
2819 }
2820 }
2821 }
2822 return done;
2823}
2824
2825/*
2826 * Block until every outstanding RDMA write has been completed by the hardware.
2827 */
2828static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2829{
2830 int ret;
2831
2832 if (qemu_rdma_write_flush(f, rdma) < 0) {
2833 return -EIO;
2834 }
2835
2836 while (rdma->nb_sent) {
2837 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2838 if (ret < 0) {
2839 error_report("rdma migration: complete polling error!");
2840 return -EIO;
2841 }
2842 }
2843
2844 qemu_rdma_unregister_waiting(rdma);
2845
2846 return 0;
2847}
2848
2849
2850static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
2851 bool blocking,
2852 Error **errp)
2853{
2854 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2855
2856 rioc->blocking = blocking;
2857 return 0;
2858}
2859
2860
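/*
 * GSource wrapper so the RDMA channel can be watched from a main loop.
 * The channel is reported readable whenever buffered control data is
 * pending, and is always reported writable.
 */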
2861typedef struct QIOChannelRDMASource QIOChannelRDMASource;
2862struct QIOChannelRDMASource {
2863 GSource parent;
2864 QIOChannelRDMA *rioc;
2865 GIOCondition condition;
2866};
2867
2868static gboolean
2869qio_channel_rdma_source_prepare(GSource *source,
2870 gint *timeout)
2871{
2872 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2873 RDMAContext *rdma;
2874 GIOCondition cond = 0;
2875 *timeout = -1;
2876
2877 RCU_READ_LOCK_GUARD();
2878 if (rsource->condition == G_IO_IN) {
2879 rdma = atomic_rcu_read(&rsource->rioc->rdmain);
2880 } else {
2881 rdma = atomic_rcu_read(&rsource->rioc->rdmaout);
2882 }
2883
2884 if (!rdma) {
2885 error_report("RDMAContext is NULL when preparing GSource");
2886 return FALSE;
2887 }
2888
2889 if (rdma->wr_data[0].control_len) {
2890 cond |= G_IO_IN;
2891 }
2892 cond |= G_IO_OUT;
2893
2894 return cond & rsource->condition;
2895}
2896
2897static gboolean
2898qio_channel_rdma_source_check(GSource *source)
2899{
2900 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2901 RDMAContext *rdma;
2902 GIOCondition cond = 0;
2903
2904 RCU_READ_LOCK_GUARD();
2905 if (rsource->condition == G_IO_IN) {
2906 rdma = atomic_rcu_read(&rsource->rioc->rdmain);
2907 } else {
2908 rdma = atomic_rcu_read(&rsource->rioc->rdmaout);
2909 }
2910
2911 if (!rdma) {
2912 error_report("RDMAContext is NULL when checking GSource");
2913 return FALSE;
2914 }
2915
2916 if (rdma->wr_data[0].control_len) {
2917 cond |= G_IO_IN;
2918 }
2919 cond |= G_IO_OUT;
2920
2921 return cond & rsource->condition;
2922}
2923
2924static gboolean
2925qio_channel_rdma_source_dispatch(GSource *source,
2926 GSourceFunc callback,
2927 gpointer user_data)
2928{
2929 QIOChannelFunc func = (QIOChannelFunc)callback;
2930 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2931 RDMAContext *rdma;
2932 GIOCondition cond = 0;
2933
2934 RCU_READ_LOCK_GUARD();
2935 if (rsource->condition == G_IO_IN) {
2936 rdma = atomic_rcu_read(&rsource->rioc->rdmain);
2937 } else {
2938 rdma = atomic_rcu_read(&rsource->rioc->rdmaout);
2939 }
2940
2941 if (!rdma) {
2942 error_report("RDMAContext is NULL when dispatching GSource");
2943 return FALSE;
2944 }
2945
2946 if (rdma->wr_data[0].control_len) {
2947 cond |= G_IO_IN;
2948 }
2949 cond |= G_IO_OUT;
2950
2951 return (*func)(QIO_CHANNEL(rsource->rioc),
2952 (cond & rsource->condition),
2953 user_data);
2954}
2955
2956static void
2957qio_channel_rdma_source_finalize(GSource *source)
2958{
2959 QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
2960
2961 object_unref(OBJECT(ssource->rioc));
2962}
2963
2964GSourceFuncs qio_channel_rdma_source_funcs = {
2965 qio_channel_rdma_source_prepare,
2966 qio_channel_rdma_source_check,
2967 qio_channel_rdma_source_dispatch,
2968 qio_channel_rdma_source_finalize
2969};
2970
2971static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
2972 GIOCondition condition)
2973{
2974 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2975 QIOChannelRDMASource *ssource;
2976 GSource *source;
2977
2978 source = g_source_new(&qio_channel_rdma_source_funcs,
2979 sizeof(QIOChannelRDMASource));
2980 ssource = (QIOChannelRDMASource *)source;
2981
2982 ssource->rioc = rioc;
2983 object_ref(OBJECT(rioc));
2984
2985 ssource->condition = condition;
2986
2987 return source;
2988}
2989
2990static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
2991 AioContext *ctx,
2992 IOHandler *io_read,
2993 IOHandler *io_write,
2994 void *opaque)
2995{
2996 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2997 if (io_read) {
2998 aio_set_fd_handler(ctx, rioc->rdmain->comp_channel->fd,
2999 false, io_read, io_write, NULL, opaque);
3000 } else {
3001 aio_set_fd_handler(ctx, rioc->rdmaout->comp_channel->fd,
3002 false, io_read, io_write, NULL, opaque);
3003 }
3004}
3005
3006struct rdma_close_rcu {
3007 struct rcu_head rcu;
3008 RDMAContext *rdmain;
3009 RDMAContext *rdmaout;
3010};
3011
3012
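/*
 * Deferred teardown: runs as an RCU callback once no reader can still
 * hold a reference to the detached RDMAContexts.
 */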
3013static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3014{
3015 if (rcu->rdmain) {
3016 qemu_rdma_cleanup(rcu->rdmain);
3017 }
3018
3019 if (rcu->rdmaout) {
3020 qemu_rdma_cleanup(rcu->rdmaout);
3021 }
3022
3023 g_free(rcu->rdmain);
3024 g_free(rcu->rdmaout);
3025 g_free(rcu);
3026}
3027
3028static int qio_channel_rdma_close(QIOChannel *ioc,
3029 Error **errp)
3030{
3031 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3032 RDMAContext *rdmain, *rdmaout;
3033 struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3034
3035 trace_qemu_rdma_close();
3036
3037 rdmain = rioc->rdmain;
3038 if (rdmain) {
3039 atomic_rcu_set(&rioc->rdmain, NULL);
3040 }
3041
3042 rdmaout = rioc->rdmaout;
3043 if (rdmaout) {
3044 atomic_rcu_set(&rioc->rdmaout, NULL);
3045 }
3046
3047 rcu->rdmain = rdmain;
3048 rcu->rdmaout = rdmaout;
3049 call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
3050
3051 return 0;
3052}
3053
3054static int
3055qio_channel_rdma_shutdown(QIOChannel *ioc,
3056 QIOChannelShutdown how,
3057 Error **errp)
3058{
3059 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3060 RDMAContext *rdmain, *rdmaout;
3061
3062 RCU_READ_LOCK_GUARD();
3063
3064 rdmain = atomic_rcu_read(&rioc->rdmain);
3065 rdmaout = atomic_rcu_read(&rioc->rdmaout);
3066
3067 switch (how) {
3068 case QIO_CHANNEL_SHUTDOWN_READ:
3069 if (rdmain) {
3070 rdmain->error_state = -1;
3071 }
3072 break;
3073 case QIO_CHANNEL_SHUTDOWN_WRITE:
3074 if (rdmaout) {
3075 rdmaout->error_state = -1;
3076 }
3077 break;
3078 case QIO_CHANNEL_SHUTDOWN_BOTH:
3079 default:
3080 if (rdmain) {
3081 rdmain->error_state = -1;
3082 }
3083 if (rdmaout) {
3084 rdmaout->error_state = -1;
3085 }
3086 break;
3087 }
3088
3089 return 0;
3090}
3091
3092/*
3093 * Queue one page of guest RAM for transmission over RDMA.
3094 *
3095 * size > 0: the page at (block_offset, offset) is merged into the current
3096 * chunk; the actual RDMA write may be deferred until the chunk
3097 * is full, so completion is asynchronous.
3098 *
3099 * size == 0: hint that the chunk containing this page is a candidate for
3100 * unregistration; no data is transferred.
3101 *
3102 * bytes_sent is set to 1 for a non-empty request because the real write
3103 * happens later; returns RAM_SAVE_CONTROL_DELAYED on success, or a
3104 * negative error value.
3105 */
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
3127 ram_addr_t block_offset, ram_addr_t offset,
3128 size_t size, uint64_t *bytes_sent)
3129{
3130 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3131 RDMAContext *rdma;
3132 int ret;
3133
3134 RCU_READ_LOCK_GUARD();
3135 rdma = atomic_rcu_read(&rioc->rdmaout);
3136
3137 if (!rdma) {
3138 return -EIO;
3139 }
3140
3141 CHECK_ERROR_STATE();
3142
3143 if (migration_in_postcopy()) {
3144 return RAM_SAVE_CONTROL_NOT_SUPP;
3145 }
3146
3147 qemu_fflush(f);
3148
3149 if (size > 0) {
3150 /*
3151 * Add this page to the current chunk; the write itself may be
3152 * deferred until the chunk can no longer grow.
3153 */
3154
3155 ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
3156 if (ret < 0) {
3157 error_report("rdma migration: write error! %d", ret);
3158 goto err;
3159 }
3160
3161 /*
3162 * The transfer is asynchronous: at this point we do not know how
3163 * much of this page (if any) has actually hit the wire, so report
3164 * a token value of 1 byte and let the accounting catch up when
3165 * the write really completes.
3166 */
3167
3168
3169 if (bytes_sent) {
3170 *bytes_sent = 1;
3171 }
3172 } else {
3173 uint64_t index, chunk;
3174
3175 /*
3176 * size == 0 is a hint that the chunk holding this page can be
3177 * unregistered. Look up the chunk and queue an unregister
3178 * request; whether it actually happens depends on whether the
3179 * chunk is still in flight.
3180 */
3181
3182
3183
3184
3185
3186 ret = qemu_rdma_search_ram_block(rdma, block_offset,
3187 offset, size, &index, &chunk);
3188
3189 if (ret) {
3190 error_report("ram block search failed");
3191 goto err;
3192 }
3193
3194 qemu_rdma_signal_unregister(rdma, index, chunk, 0);
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204 }
3205
3206
3207
3208
3209 /*
3210 * Reap any completions that are already available, without
3211 * blocking, before handing control back to the caller.
3212 */
3213 while (1) {
3214 uint64_t wr_id, wr_id_in;
3215 int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
3216 if (ret < 0) {
3217 error_report("rdma migration: polling error! %d", ret);
3218 goto err;
3219 }
3220
3221 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3222
3223 if (wr_id == RDMA_WRID_NONE) {
3224 break;
3225 }
3226 }
3227
3228 return RAM_SAVE_CONTROL_DELAYED;
3229err:
3230 rdma->error_state = ret;
3231 return ret;
3232}
3233
3234static void rdma_accept_incoming_migration(void *opaque);
3235
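/*
 * Destination-side fd handler for the CM event channel: on a disconnect
 * or device removal it marks the still-running migration as failed and
 * kicks the incoming coroutine so it can notice the error.
 */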
3236static void rdma_cm_poll_handler(void *opaque)
3237{
3238 RDMAContext *rdma = opaque;
3239 int ret;
3240 struct rdma_cm_event *cm_event;
3241 MigrationIncomingState *mis = migration_incoming_get_current();
3242
3243 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3244 if (ret) {
3245 error_report("get_cm_event failed %d", errno);
3246 return;
3247 }
3248 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
3249 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3250 if (!rdma->error_state &&
3251 migration_incoming_get_current()->state !=
3252 MIGRATION_STATUS_COMPLETED) {
3253 error_report("rdma migration: unexpected CM event %d", cm_event->event);
3254 rdma->error_state = -EPIPE;
3255 if (rdma->return_path) {
3256 rdma->return_path->error_state = -EPIPE;
3257 }
3258 }
3259 /* Only ack the event once we are done reading its fields */
3260 rdma_ack_cm_event(cm_event);
3261 if (mis->migration_incoming_co) {
3262 qemu_coroutine_enter(mis->migration_incoming_co);
3263 }
3264 return;
3265 }
3266 rdma_ack_cm_event(cm_event);
3267}
3268
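/*
 * Destination side of connection setup: wait for CONNECT_REQUEST,
 * validate the protocol version and capabilities, create the verbs
 * resources, register the control buffers, accept the connection and
 * post the first control receive.
 */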
3269static int qemu_rdma_accept(RDMAContext *rdma)
3270{
3271 RDMACapabilities cap;
3272 struct rdma_conn_param conn_param = {
3273 .responder_resources = 2,
3274 .private_data = &cap,
3275 .private_data_len = sizeof(cap),
3276 };
3277 struct rdma_cm_event *cm_event;
3278 struct ibv_context *verbs;
3279 int ret = -EINVAL;
3280 int idx;
3281
3282 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3283 if (ret) {
3284 goto err_rdma_dest_wait;
3285 }
3286
3287 if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3288 rdma_ack_cm_event(cm_event);
3289 goto err_rdma_dest_wait;
3290 }
3291
3292 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3293
3294 network_to_caps(&cap);
3295
3296 if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3297 error_report("Unknown source RDMA version: %d, bailing...",
3298 cap.version);
3299 rdma_ack_cm_event(cm_event);
3300 goto err_rdma_dest_wait;
3301 }
3302
3303 /*
3304 * Keep only the capability bits that this version of QEMU knows about.
3305 */
3306 cap.flags &= known_capabilities;
3307
3308 /*
3309 * Enable the capabilities we recognise; checks for future
3310 * capabilities belong here as the protocol grows.
3311 */
3312 if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3313 rdma->pin_all = true;
3314 }
3315
3316 rdma->cm_id = cm_event->id;
3317 verbs = cm_event->id->verbs;
3318
3319 rdma_ack_cm_event(cm_event);
3320
3321 trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3322
3323 caps_to_network(&cap);
3324
3325 trace_qemu_rdma_accept_pin_verbsc(verbs);
3326
3327 if (!rdma->verbs) {
3328 rdma->verbs = verbs;
3329 } else if (rdma->verbs != verbs) {
3330 error_report("ibv context not matching %p, %p!", rdma->verbs,
3331 verbs);
3332 goto err_rdma_dest_wait;
3333 }
3334
3335 qemu_rdma_dump_id("dest_init", verbs);
3336
3337 ret = qemu_rdma_alloc_pd_cq(rdma);
3338 if (ret) {
3339 error_report("rdma migration: error allocating pd and cq!");
3340 goto err_rdma_dest_wait;
3341 }
3342
3343 ret = qemu_rdma_alloc_qp(rdma);
3344 if (ret) {
3345 error_report("rdma migration: error allocating qp!");
3346 goto err_rdma_dest_wait;
3347 }
3348
3349 ret = qemu_rdma_init_ram_blocks(rdma);
3350 if (ret) {
3351 error_report("rdma migration: error initializing ram blocks!");
3352 goto err_rdma_dest_wait;
3353 }
3354
3355 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
3356 ret = qemu_rdma_reg_control(rdma, idx);
3357 if (ret) {
3358 error_report("rdma: error registering %d control", idx);
3359 goto err_rdma_dest_wait;
3360 }
3361 }
3362
3363 /* For postcopy, keep accepting so the return path can connect; otherwise just watch for disconnects */
3364 if (migrate_postcopy() && !rdma->is_return_path) {
3365 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
3366 NULL,
3367 (void *)(intptr_t)rdma->return_path);
3368 } else {
3369 qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
3370 NULL, rdma);
3371 }
3372
3373 ret = rdma_accept(rdma->cm_id, &conn_param);
3374 if (ret) {
3375 error_report("rdma_accept returns %d", ret);
3376 goto err_rdma_dest_wait;
3377 }
3378
3379 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3380 if (ret) {
3381 error_report("rdma_accept get_cm_event failed %d", ret);
3382 goto err_rdma_dest_wait;
3383 }
3384
3385 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3386 error_report("rdma_accept did not return RDMA_CM_EVENT_ESTABLISHED");
3387 rdma_ack_cm_event(cm_event);
3388 goto err_rdma_dest_wait;
3389 }
3390
3391 rdma_ack_cm_event(cm_event);
3392 rdma->connected = true;
3393
3394 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
3395 if (ret) {
3396 error_report("rdma migration: error posting second control recv");
3397 goto err_rdma_dest_wait;
3398 }
3399
3400 qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3401
3402 return 0;
3403
3404err_rdma_dest_wait:
3405 rdma->error_state = ret;
3406 qemu_rdma_cleanup(rdma);
3407 return ret;
3408}
3409
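/*
 * qsort comparator: order local RAM blocks by the index the source
 * assigned to them, so both sides number the blocks identically.
 */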
3410static int dest_ram_sort_func(const void *a, const void *b)
3411{
3412 unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3413 unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3414
3415 return (a_index < b_index) ? -1 : (a_index != b_index);
3416}
3417
3418/*
3419 * Destination side: service dynamic page registration requests from the
3420 * source during each iteration of the migration.
3421 *
3422 * Loops answering RAM_BLOCKS_REQUEST, REGISTER_REQUEST, UNREGISTER_REQUEST
3423 * and COMPRESS messages (replying with block descriptions and rkeys) until
3424 * the source sends REGISTER_FINISHED.
3425 */
3426
3427static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
3428{
3429 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3430 .type = RDMA_CONTROL_REGISTER_RESULT,
3431 .repeat = 0,
3432 };
3433 RDMAControlHeader unreg_resp = { .len = 0,
3434 .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3435 .repeat = 0,
3436 };
3437 RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3438 .repeat = 1 };
3439 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3440 RDMAContext *rdma;
3441 RDMALocalBlocks *local;
3442 RDMAControlHeader head;
3443 RDMARegister *reg, *registers;
3444 RDMACompress *comp;
3445 RDMARegisterResult *reg_result;
3446 static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3447 RDMALocalBlock *block;
3448 void *host_addr;
3449 int ret = 0;
3450 int idx = 0;
3451 int count = 0;
3452 int i = 0;
3453
3454 RCU_READ_LOCK_GUARD();
3455 rdma = atomic_rcu_read(&rioc->rdmain);
3456
3457 if (!rdma) {
3458 return -EIO;
3459 }
3460
3461 CHECK_ERROR_STATE();
3462
3463 local = &rdma->local_ram_blocks;
3464 do {
3465 trace_qemu_rdma_registration_handle_wait();
3466
3467 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
3468
3469 if (ret < 0) {
3470 break;
3471 }
3472
3473 if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3474 error_report("rdma: Too many requests in this message (%d). "
3475 "Bailing.", head.repeat);
3476 ret = -EIO;
3477 break;
3478 }
3479
3480 switch (head.type) {
3481 case RDMA_CONTROL_COMPRESS:
3482 comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3483 network_to_compress(comp);
3484
3485 trace_qemu_rdma_registration_handle_compress(comp->length,
3486 comp->block_idx,
3487 comp->offset);
3488 if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3489 error_report("rdma: 'compress' bad block index %u (vs %d)",
3490 (unsigned int)comp->block_idx,
3491 rdma->local_ram_blocks.nb_blocks);
3492 ret = -EIO;
3493 goto out;
3494 }
3495 block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3496
3497 host_addr = block->local_host_addr +
3498 (comp->offset - block->offset);
3499
3500 ram_handle_compressed(host_addr, comp->value, comp->length);
3501 break;
3502
3503 case RDMA_CONTROL_REGISTER_FINISHED:
3504 trace_qemu_rdma_registration_handle_finished();
3505 goto out;
3506
3507 case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3508 trace_qemu_rdma_registration_handle_ram_blocks();
3509 /*
3510 * Sort our local blocks into the source's order (by src_index) so
3511 * that block indices mean the same thing on both sides.
3512 */
3513
3514 qsort(rdma->local_ram_blocks.block,
3515 rdma->local_ram_blocks.nb_blocks,
3516 sizeof(RDMALocalBlock), dest_ram_sort_func);
3517 for (i = 0; i < local->nb_blocks; i++) {
3518 local->block[i].index = i;
3519 }
3520
3521 if (rdma->pin_all) {
3522 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
3523 if (ret) {
3524 error_report("rdma migration: error dest "
3525 "registering ram blocks");
3526 goto out;
3527 }
3528 }
3529
3530 /*
3531 * Build the RDMADestBlock array to send back: host addresses,
3532 * offsets, lengths and, when pinning everything, the rkeys.
3533 */
3534
3535
3536 for (i = 0; i < local->nb_blocks; i++) {
3537 rdma->dest_blocks[i].remote_host_addr =
3538 (uintptr_t)(local->block[i].local_host_addr);
3539
3540 if (rdma->pin_all) {
3541 rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3542 }
3543
3544 rdma->dest_blocks[i].offset = local->block[i].offset;
3545 rdma->dest_blocks[i].length = local->block[i].length;
3546
3547 dest_block_to_network(&rdma->dest_blocks[i]);
3548 trace_qemu_rdma_registration_handle_ram_blocks_loop(
3549 local->block[i].block_name,
3550 local->block[i].offset,
3551 local->block[i].length,
3552 local->block[i].local_host_addr,
3553 local->block[i].src_index);
3554 }
3555
3556 blocks.len = rdma->local_ram_blocks.nb_blocks
3557 * sizeof(RDMADestBlock);
3558
3559
3560 ret = qemu_rdma_post_send_control(rdma,
3561 (uint8_t *) rdma->dest_blocks, &blocks);
3562
3563 if (ret < 0) {
3564 error_report("rdma migration: error sending remote info");
3565 goto out;
3566 }
3567
3568 break;
3569 case RDMA_CONTROL_REGISTER_REQUEST:
3570 trace_qemu_rdma_registration_handle_register(head.repeat);
3571
3572 reg_resp.repeat = head.repeat;
3573 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3574
3575 for (count = 0; count < head.repeat; count++) {
3576 uint64_t chunk;
3577 uint8_t *chunk_start, *chunk_end;
3578
3579 reg = &registers[count];
3580 network_to_register(reg);
3581
3582 reg_result = &results[count];
3583
3584 trace_qemu_rdma_registration_handle_register_loop(count,
3585 reg->current_index, reg->key.current_addr, reg->chunks);
3586
3587 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3588 error_report("rdma: 'register' bad block index %u (vs %d)",
3589 (unsigned int)reg->current_index,
3590 rdma->local_ram_blocks.nb_blocks);
3591 ret = -ENOENT;
3592 goto out;
3593 }
3594 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3595 if (block->is_ram_block) {
3596 if (block->offset > reg->key.current_addr) {
3597 error_report("rdma: bad register address for block %s"
3598 " offset: %" PRIx64 " current_addr: %" PRIx64,
3599 block->block_name, block->offset,
3600 reg->key.current_addr);
3601 ret = -ERANGE;
3602 goto out;
3603 }
3604 host_addr = (block->local_host_addr +
3605 (reg->key.current_addr - block->offset));
3606 chunk = ram_chunk_index(block->local_host_addr,
3607 (uint8_t *) host_addr);
3608 } else {
3609 chunk = reg->key.chunk;
3610 host_addr = block->local_host_addr +
3611 (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3612
3613 if (host_addr < (void *)block->local_host_addr) {
3614 error_report("rdma: bad chunk for block %s"
3615 " chunk: %" PRIx64,
3616 block->block_name, reg->key.chunk);
3617 ret = -ERANGE;
3618 goto out;
3619 }
3620 }
3621 chunk_start = ram_chunk_start(block, chunk);
3622 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3623
3624 uint32_t tmp_rkey = 0;
3625 if (qemu_rdma_register_and_get_keys(rdma, block,
3626 (uintptr_t)host_addr, NULL, &tmp_rkey,
3627 chunk, chunk_start, chunk_end)) {
3628 error_report("cannot get rkey");
3629 ret = -EINVAL;
3630 goto out;
3631 }
3632 reg_result->rkey = tmp_rkey;
3633
3634 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3635
3636 trace_qemu_rdma_registration_handle_register_rkey(
3637 reg_result->rkey);
3638
3639 result_to_network(reg_result);
3640 }
3641
3642 ret = qemu_rdma_post_send_control(rdma,
3643 (uint8_t *) results, &reg_resp);
3644
3645 if (ret < 0) {
3646 error_report("Failed to send control buffer");
3647 goto out;
3648 }
3649 break;
3650 case RDMA_CONTROL_UNREGISTER_REQUEST:
3651 trace_qemu_rdma_registration_handle_unregister(head.repeat);
3652 unreg_resp.repeat = head.repeat;
3653 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3654
3655 for (count = 0; count < head.repeat; count++) {
3656 reg = &registers[count];
3657 network_to_register(reg);
3658
3659 trace_qemu_rdma_registration_handle_unregister_loop(count,
3660 reg->current_index, reg->key.chunk);
3661
3662 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3663
3664 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3665 block->pmr[reg->key.chunk] = NULL;
3666
3667 if (ret != 0) {
3668 perror("rdma unregistration chunk failed");
3669 ret = -ret;
3670 goto out;
3671 }
3672
3673 rdma->total_registrations--;
3674
3675 trace_qemu_rdma_registration_handle_unregister_success(
3676 reg->key.chunk);
3677 }
3678
3679 ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
3680
3681 if (ret < 0) {
3682 error_report("Failed to send control buffer");
3683 goto out;
3684 }
3685 break;
3686 case RDMA_CONTROL_REGISTER_RESULT:
3687 error_report("Invalid RESULT message at dest.");
3688 ret = -EIO;
3689 goto out;
3690 default:
3691 error_report("Unknown control message %s", control_desc(head.type));
3692 ret = -EIO;
3693 goto out;
3694 }
3695 } while (1);
3696out:
3697 if (ret < 0) {
3698 rdma->error_state = ret;
3699 }
3700 return ret;
3701}
3702
3703/*
3704 * Destination: called from the load hook for each RAMBlock name announced
3705 * by the source, so the destination learns the source's block ordering
3706 * and records it in src_index.
3707 */
3708
3709
3710static int
3711rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
3712{
3713 RDMAContext *rdma;
3714 int curr;
3715 int found = -1;
3716
3717 RCU_READ_LOCK_GUARD();
3718 rdma = atomic_rcu_read(&rioc->rdmain);
3719
3720 if (!rdma) {
3721 return -EIO;
3722 }
3723
3724
3725 for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3726 if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3727 found = curr;
3728 break;
3729 }
3730 }
3731
3732 if (found == -1) {
3733 error_report("RAMBlock '%s' not found on destination", name);
3734 return -ENOENT;
3735 }
3736
3737 rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3738 trace_rdma_block_notification_handle(name, rdma->next_src_index);
3739 rdma->next_src_index++;
3740
3741 return 0;
3742}
3743
3744static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
3745{
3746 switch (flags) {
3747 case RAM_CONTROL_BLOCK_REG:
3748 return rdma_block_notification_handle(opaque, data);
3749
3750 case RAM_CONTROL_HOOK:
3751 return qemu_rdma_registration_handle(f, opaque);
3752
3753 default:
3754
3755 abort();
3756 }
3757}
3758
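/*
 * Called before each RAM iteration on the source: emit RAM_SAVE_FLAG_HOOK
 * so that the destination's load hook ends up in
 * qemu_rdma_registration_handle().
 */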
3759static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
3760 uint64_t flags, void *data)
3761{
3762 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3763 RDMAContext *rdma;
3764
3765 RCU_READ_LOCK_GUARD();
3766 rdma = atomic_rcu_read(&rioc->rdmaout);
3767 if (!rdma) {
3768 return -EIO;
3769 }
3770
3771 CHECK_ERROR_STATE();
3772
3773 if (migration_in_postcopy()) {
3774 return 0;
3775 }
3776
3777 trace_qemu_rdma_registration_start(flags);
3778 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3779 qemu_fflush(f);
3780
3781 return 0;
3782}
3783
3784/*
3785 * Tell the destination that dynamic registrations are done for this round:
3786 * flush and drain outstanding writes, then send REGISTER_FINISHED.
3787 */
3788static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
3789 uint64_t flags, void *data)
3790{
3791 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3792 RDMAContext *rdma;
3793 RDMAControlHeader head = { .len = 0, .repeat = 1 };
3794 int ret = 0;
3795
3796 RCU_READ_LOCK_GUARD();
3797 rdma = atomic_rcu_read(&rioc->rdmaout);
3798 if (!rdma) {
3799 return -EIO;
3800 }
3801
3802 CHECK_ERROR_STATE();
3803
3804 if (migration_in_postcopy()) {
3805 return 0;
3806 }
3807
3808 qemu_fflush(f);
3809 ret = qemu_rdma_drain_cq(f, rdma);
3810
3811 if (ret < 0) {
3812 goto err;
3813 }
3814
3815 if (flags == RAM_CONTROL_SETUP) {
3816 RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3817 RDMALocalBlocks *local = &rdma->local_ram_blocks;
3818 int reg_result_idx, i, nb_dest_blocks;
3819
3820 head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3821 trace_qemu_rdma_registration_stop_ram();
3822
3823 /*
3824 * Ask the destination for its list of RAM blocks. When pin-all is
3825 * enabled, qemu_rdma_reg_whole_ram_blocks is passed as a callback so
3826 * that our own blocks can be registered while the request is in
3827 * flight.
3828 */
3829
3830
3831 ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3832 &reg_result_idx, rdma->pin_all ?
3833 qemu_rdma_reg_whole_ram_blocks : NULL);
3834 if (ret < 0) {
3835 fprintf(stderr, "rdma migration: error receiving remote RAM block list!\n");
3836 return ret;
3837 }
3838
3839 nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3840
3841 /*
3842 * The reply carries one RDMADestBlock per RAM block on the
3843 * destination. Both sides must agree on the number and length of
3844 * the blocks, otherwise the memory layouts are incompatible.
3845 */
3846
3847
3848
3849
3850
3851
3852
3853 if (local->nb_blocks != nb_dest_blocks) {
3854 fprintf(stderr, "ram blocks mismatch (number of blocks %d vs %d). "
3855 "Your QEMU command line parameters are probably "
3856 "not identical on both the source and destination.\n",
3857 local->nb_blocks, nb_dest_blocks);
3858 rdma->error_state = -EINVAL;
3859 return -EINVAL;
3860 }
3861
3862 qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3863 memcpy(rdma->dest_blocks,
3864 rdma->wr_data[reg_result_idx].control_curr, resp.len);
3865 for (i = 0; i < nb_dest_blocks; i++) {
3866 network_to_dest_block(&rdma->dest_blocks[i]);
3867
3868
3869 if (rdma->dest_blocks[i].length != local->block[i].length) {
3870 fprintf(stderr, "Block %s/%d has a different length %" PRIu64
3871 " vs %" PRIu64 "\n", local->block[i].block_name, i,
3872 local->block[i].length,
3873 rdma->dest_blocks[i].length);
3874 rdma->error_state = -EINVAL;
3875 return -EINVAL;
3876 }
3877 local->block[i].remote_host_addr =
3878 rdma->dest_blocks[i].remote_host_addr;
3879 local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3880 }
3881 }
3882
3883 trace_qemu_rdma_registration_stop(flags);
3884
3885 head.type = RDMA_CONTROL_REGISTER_FINISHED;
3886 ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3887
3888 if (ret < 0) {
3889 goto err;
3890 }
3891
3892 return 0;
3893err:
3894 rdma->error_state = ret;
3895 return ret;
3896}
3897
3898static const QEMUFileHooks rdma_read_hooks = {
3899 .hook_ram_load = rdma_load_hook,
3900};
3901
3902static const QEMUFileHooks rdma_write_hooks = {
3903 .before_ram_iterate = qemu_rdma_registration_start,
3904 .after_ram_iterate = qemu_rdma_registration_stop,
3905 .save_page = qemu_rdma_save_page,
3906};
3907
3908
3909static void qio_channel_rdma_finalize(Object *obj)
3910{
3911 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
3912 if (rioc->rdmain) {
3913 qemu_rdma_cleanup(rioc->rdmain);
3914 g_free(rioc->rdmain);
3915 rioc->rdmain = NULL;
3916 }
3917 if (rioc->rdmaout) {
3918 qemu_rdma_cleanup(rioc->rdmaout);
3919 g_free(rioc->rdmaout);
3920 rioc->rdmaout = NULL;
3921 }
3922}
3923
3924static void qio_channel_rdma_class_init(ObjectClass *klass,
3925 void *class_data G_GNUC_UNUSED)
3926{
3927 QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
3928
3929 ioc_klass->io_writev = qio_channel_rdma_writev;
3930 ioc_klass->io_readv = qio_channel_rdma_readv;
3931 ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
3932 ioc_klass->io_close = qio_channel_rdma_close;
3933 ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
3934 ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
3935 ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
3936}
3937
3938static const TypeInfo qio_channel_rdma_info = {
3939 .parent = TYPE_QIO_CHANNEL,
3940 .name = TYPE_QIO_CHANNEL_RDMA,
3941 .instance_size = sizeof(QIOChannelRDMA),
3942 .instance_finalize = qio_channel_rdma_finalize,
3943 .class_init = qio_channel_rdma_class_init,
3944};
3945
3946static void qio_channel_rdma_register_types(void)
3947{
3948 type_register_static(&qio_channel_rdma_info);
3949}
3950
3951type_init(qio_channel_rdma_register_types);
3952
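/*
 * Wrap an RDMAContext in a QIOChannel/QEMUFile pair. Write mode attaches
 * the context as the outgoing channel with the save-side hooks; read mode
 * attaches it as the incoming channel with the load-side hooks. The return
 * path, if any, becomes the opposite direction.
 */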
3953static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
3954{
3955 QIOChannelRDMA *rioc;
3956
3957 if (qemu_file_mode_is_not_valid(mode)) {
3958 return NULL;
3959 }
3960
3961 rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
3962
3963 if (mode[0] == 'w') {
3964 rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc));
3965 rioc->rdmaout = rdma;
3966 rioc->rdmain = rdma->return_path;
3967 qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
3968 } else {
3969 rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc));
3970 rioc->rdmain = rdma;
3971 rioc->rdmaout = rdma->return_path;
3972 qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
3973 }
3974
3975 return rioc->file;
3976}
3977
3978static void rdma_accept_incoming_migration(void *opaque)
3979{
3980 RDMAContext *rdma = opaque;
3981 int ret;
3982 QEMUFile *f;
3983 Error *local_err = NULL;
3984
3985 trace_qemu_rdma_accept_incoming_migration();
3986 ret = qemu_rdma_accept(rdma);
3987
3988 if (ret) {
3989 fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
3990 return;
3991 }
3992
3993 trace_qemu_rdma_accept_incoming_migration_accepted();
3994
3995 if (rdma->is_return_path) {
3996 return;
3997 }
3998
3999 f = qemu_fopen_rdma(rdma, "rb");
4000 if (f == NULL) {
4001 fprintf(stderr, "RDMA ERROR: could not qemu_fopen_rdma\n");
4002 qemu_rdma_cleanup(rdma);
4003 return;
4004 }
4005
4006 rdma->migration_started_on_destination = 1;
4007 migration_fd_process_incoming(f, &local_err);
4008 if (local_err) {
4009 error_reportf_err(local_err, "RDMA ERROR: ");
4010 }
4011}
4012
4013void rdma_start_incoming_migration(const char *host_port, Error **errp)
4014{
4015 int ret;
4016 RDMAContext *rdma, *rdma_return_path = NULL;
4017 Error *local_err = NULL;
4018
4019 trace_rdma_start_incoming_migration();
4020
4021
4022 if (ram_block_discard_is_required()) {
4023 error_setg(errp, "RDMA: cannot disable RAM discard");
4024 return;
4025 }
4026
4027 rdma = qemu_rdma_data_init(host_port, &local_err);
4028 if (rdma == NULL) {
4029 goto err;
4030 }
4031
4032 ret = qemu_rdma_dest_init(rdma, &local_err);
4033
4034 if (ret) {
4035 goto err;
4036 }
4037
4038 trace_rdma_start_incoming_migration_after_dest_init();
4039
4040 ret = rdma_listen(rdma->listen_id, 5);
4041
4042 if (ret) {
4043 ERROR(errp, "listening on socket!");
4044 goto err;
4045 }
4046
4047 trace_rdma_start_incoming_migration_after_rdma_listen();
4048
4049 /* With postcopy enabled, prepare a second RDMAContext for the return path */
4050 if (migrate_postcopy()) {
4051 rdma_return_path = qemu_rdma_data_init(host_port, &local_err);
4052
4053 if (rdma_return_path == NULL) {
4054 goto err;
4055 }
4056
4057 qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
4058 }
4059
4060 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
4061 NULL, (void *)(intptr_t)rdma);
4062 return;
4063err:
4064 error_propagate(errp, local_err);
4065 if (rdma) {
4066 g_free(rdma->host);
4067 }
4068 g_free(rdma);
4069 g_free(rdma_return_path);
4070}
4071
4072void rdma_start_outgoing_migration(void *opaque,
4073 const char *host_port, Error **errp)
4074{
4075 MigrationState *s = opaque;
4076 RDMAContext *rdma_return_path = NULL;
4077 RDMAContext *rdma;
4078 int ret = 0;
4079
4080
4081 if (ram_block_discard_is_required()) {
4082 error_setg(errp, "RDMA: cannot disable RAM discard");
4083 return;
4084 }
4085
4086 rdma = qemu_rdma_data_init(host_port, errp);
4087 if (rdma == NULL) {
4088 goto err;
4089 }
4090
4091 ret = qemu_rdma_source_init(rdma,
4092 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4093
4094 if (ret) {
4095 goto err;
4096 }
4097
4098 trace_rdma_start_outgoing_migration_after_rdma_source_init();
4099 ret = qemu_rdma_connect(rdma, errp);
4100
4101 if (ret) {
4102 goto err;
4103 }
4104
4105 /* RDMA postcopy uses a separate connection for the return path */
4106 if (migrate_postcopy()) {
4107 rdma_return_path = qemu_rdma_data_init(host_port, errp);
4108
4109 if (rdma_return_path == NULL) {
4110 goto return_path_err;
4111 }
4112
4113 ret = qemu_rdma_source_init(rdma_return_path,
4114 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4115
4116 if (ret) {
4117 goto return_path_err;
4118 }
4119
4120 ret = qemu_rdma_connect(rdma_return_path, errp);
4121
4122 if (ret) {
4123 goto return_path_err;
4124 }
4125
4126 rdma->return_path = rdma_return_path;
4127 rdma_return_path->return_path = rdma;
4128 rdma_return_path->is_return_path = true;
4129 }
4130
4131 trace_rdma_start_outgoing_migration_after_rdma_connect();
4132
4133 s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
4134 migrate_fd_connect(s, NULL);
4135 return;
4136return_path_err:
4137 qemu_rdma_cleanup(rdma);
4138err:
4139 g_free(rdma);
4140 g_free(rdma_return_path);
4141}
4142