#include "qemu-common.h"
#include "migration/migration.h"
#include "migration/qemu-file.h"
#include "exec/cpu-common.h"
#include "qemu/main-loop.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "block/coroutine.h"
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <string.h>
#include <rdma/rdma_cma.h>

/*
 * Three levels of debug output, selected at compile time:
 * DPRINTF for basic tracing, DDPRINTF for verbose tracing and
 * DDDPRINTF for per-message tracing.
 */
#ifdef DEBUG_RDMA
#define DPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#ifdef DEBUG_RDMA_VERBOSE
#define DDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDPRINTF(fmt, ...) \
    do { } while (0)
#endif

#ifdef DEBUG_RDMA_REALLY_VERBOSE
#define DDDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDDPRINTF(fmt, ...) \
    do { } while (0)
#endif

/*
 * Report an error on stderr and, when one is not already set,
 * through the Error object handed to us by the migration layer.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)

#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB chunks */

/*
 * Increment used to split up device state (QEMUFile traffic)
 * into separate IB SEND messages.
 */
#define RDMA_SEND_INCREMENT 32768

/* Maximum size of an IB SEND control message. */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1

/* Capabilities for negotiation between the two sides. */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/* OR of all the capability flags this build understands. */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;

#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                fprintf(stderr, "RDMA is in an error state waiting for" \
                                " migration to abort!\n"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)

/*
 * A work request ID is a 64-bit value with three packed fields:
 * bits 0-15 hold the request type, bits 16-29 the RAM block index
 * and bits 30 and up the chunk number within that block.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
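
/*
 * Illustrative example (not used by the code): a WRITE work request
 * for chunk 5 of RAM block 2 would be encoded as
 *
 *     (5UL << RDMA_WRID_CHUNK_SHIFT) |
 *     (2UL << RDMA_WRID_BLOCK_SHIFT) |
 *     RDMA_WRID_RDMA_WRITE
 *
 * and qemu_rdma_poll() below recovers the fields by masking and
 * shifting with the RDMA_WRID_*_MASK/_SHIFT macros.
 */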

/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data transfers from the source)
 * 2. IB Send/Recv (control channel messages)
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};

/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands).
 *
 * We could use more WRs, but we have enough for now.
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};

/*
 * SEND/RECV IB Control Messages.
 */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,
    RDMA_CONTROL_QEMU_FILE,
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,
    RDMA_CONTROL_RAM_BLOCKS_RESULT,
    RDMA_CONTROL_COMPRESS,
    RDMA_CONTROL_REGISTER_REQUEST,
    RDMA_CONTROL_REGISTER_RESULT,
    RDMA_CONTROL_REGISTER_FINISHED,
    RDMA_CONTROL_UNREGISTER_REQUEST,
    RDMA_CONTROL_UNREGISTER_FINISHED,
};

const char *control_desc[] = {
    [RDMA_CONTROL_NONE] = "NONE",
    [RDMA_CONTROL_ERROR] = "ERROR",
    [RDMA_CONTROL_READY] = "READY",
    [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
    [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
    [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
    [RDMA_CONTROL_COMPRESS] = "COMPRESS",
    [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
    [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
    [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
    [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
    [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
};

/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;

/*
 * Negotiate RDMA capabilities during connection-setup time.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;

static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}

/*
 * Representation of a RAMBlock from migration's perspective.
 * Each block is either pinned up front in its entirety ('mr', pin-all
 * mode) or broken up into 1 MB chunks registered on demand ('pmr').
 */
typedef struct RDMALocalBlock {
    uint8_t  *local_host_addr;   /* local virtual address */
    uint64_t remote_host_addr;   /* remote virtual address */
    uint64_t offset;
    uint64_t length;
    struct   ibv_mr **pmr;       /* MRs for chunk-level registration */
    struct   ibv_mr *mr;         /* MR for non-chunk-level registration */
    uint32_t *remote_keys;       /* rkeys for chunk-level registration */
    uint32_t remote_rkey;        /* rkey for non-chunk-level registration */
    int      index;              /* which block are we */
    bool     is_ram_block;
    int      nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;

/*
 * A single transmittable copy of a RAMBlock description, sent to the
 * remote side at connection time, in network byte order.
 */
typedef struct QEMU_PACKED RDMARemoteBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMARemoteBlock;

static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}

static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}
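
/*
 * Sketch of what the helpers above guarantee (illustrative values only):
 * htonll(0x0102030405060708ULL) always puts the bytes 01 02 .. 08 on the
 * wire in that order, and ntohll() reverses the transformation, no matter
 * what the host endianness is.
 */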

static void remote_block_to_network(RDMARemoteBlock *rb)
{
    rb->remote_host_addr = htonll(rb->remote_host_addr);
    rb->offset = htonll(rb->offset);
    rb->length = htonll(rb->length);
    rb->remote_rkey = htonl(rb->remote_rkey);
}

static void network_to_remote_block(RDMARemoteBlock *rb)
{
    rb->remote_host_addr = ntohll(rb->remote_host_addr);
    rb->offset = ntohll(rb->offset);
    rb->length = ntohll(rb->length);
    rb->remote_rkey = ntohl(rb->remote_rkey);
}

/*
 * The set of local RAMBlock descriptions used for transmitting
 * the block information at connection time.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool     init;             /* main memory init complete */
    RDMALocalBlock *block;
} RDMALocalBlocks;

/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one.
 */
typedef struct RDMAContext {
    char *host;
    int port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;

    struct ibv_context          *verbs;
    struct rdma_event_channel   *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *comp_channel;  /* completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *cq;                      /* completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     */
    int error_state;
    int error_reported;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMARemoteBlock *block;

    /*
     * Migration on the *destination* started; in that case we run
     * inside a coroutine and must yield instead of blocking.
     * The source runs in a thread, so we don't care there.
     */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;
} RDMAContext;

/*
 * Interface to the rest of the migration call stack.
 */
typedef struct QEMUFileRDMA {
    RDMAContext *rdma;
    size_t len;
    void *file;
} QEMUFileRDMA;

/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;
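
/*
 * Illustrative wire layout (values invented for the example): a QEMU_FILE
 * message carrying 100 bytes of device state is the 16-byte header
 * { .len = 100, .type = RDMA_CONTROL_QEMU_FILE, .repeat = 1 }, each field
 * converted with htonl(), followed immediately by the 100 payload bytes
 * in the same SEND message.
 */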

static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}

/*
 * Register a single chunk.
 * Used by the destination to manage the multiple
 * registrations necessary per chunk.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ramblock of the chunk */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index;     /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;            /* how many sequential chunks to register */
} RDMARegister;

static void register_to_network(RDMARegister *reg)
{
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}
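
/*
 * How the union key above is used in this file (both cases appear in
 * qemu_rdma_write_one() and qemu_rdma_unregister_waiting() below):
 * a REGISTER_REQUEST for a RAM block sets key.current_addr to the
 * address being written, while an UNREGISTER_REQUEST sets key.chunk
 * to the chunk number being torn down.
 */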

typedef struct QEMU_PACKED {
    uint32_t value;     /* 0 = the chunk is all zeroes; memset it remotely */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* offset into the remote ramblock */
    uint64_t length;    /* length of the chunk */
} RDMACompress;

static void compress_to_network(RDMACompress *comp)
{
    comp->value = htonl(comp->value);
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}

/*
 * The result of the destination's memory registration produces an
 * "rkey" which the source VM must reference in order to perform
 * the RDMA operation.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;

static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}

const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));

static inline uint64_t ram_chunk_index(uint8_t *start, uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static inline uint8_t *ram_chunk_start(RDMALocalBlock *rdma_ram_block,
                                       uint64_t i)
{
    return (uint8_t *) (((uintptr_t) rdma_ram_block->local_host_addr)
                                    + (i << RDMA_REG_CHUNK_SHIFT));
}

static inline uint8_t *ram_chunk_end(RDMALocalBlock *rdma_ram_block, uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
                         (1UL << RDMA_REG_CHUNK_SHIFT);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}
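
/*
 * Worked example of the chunk helpers above, assuming the default
 * RDMA_REG_CHUNK_SHIFT of 20 (1 MB chunks): for a block starting at
 * host address H, the address H + 5*1024*1024 + 4096 falls in chunk
 * index 5, ram_chunk_start() of chunk 5 is H + 5 MB, and
 * ram_chunk_end() is H + 6 MB unless the block ends sooner, in which
 * case it is clamped to the end of the block.
 */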

static int __qemu_rdma_add_block(RDMAContext *rdma, void *host_addr,
                         ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
        (void *) block_offset);
    RDMALocalBlock *old = local->block;

    assert(block == NULL);

    local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));

    if (local->nb_blocks) {
        int x;

        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
            g_hash_table_insert(rdma->blockmap, (void *)old[x].offset,
                                                &local->block[x]);
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t));

    block->is_ram_block = !local->init;

    g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);

    DDPRINTF("Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
           " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
            local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
            block->length, (uint64_t) (block->local_host_addr + block->length),
            BITS_TO_LONGS(block->nb_chunks) *
                sizeof(unsigned long) * 8, block->nb_chunks);

    local->nb_blocks++;

    return 0;
}

/*
 * Memory regions need to be registered with the device and queue pairs
 * set up in advance before the migration starts. This tells us where the
 * RAM blocks are so that we can register them individually.
 */
static void qemu_rdma_init_one_block(void *host_addr,
    ram_addr_t block_offset, ram_addr_t length, void *opaque)
{
    __qemu_rdma_add_block(opaque, host_addr, block_offset, length);
}

/*
 * Identify the RAMBlocks and their quantity. They will be referenced to
 * identify chunk boundaries inside each RAMBlock and also during dynamic
 * page registration.
 */
static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    assert(rdma->blockmap == NULL);
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    memset(local, 0, sizeof *local);
    qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
    DPRINTF("Allocated %d local ram block structures\n", local->nb_blocks);
    rdma->block = (RDMARemoteBlock *) g_malloc0(sizeof(RDMARemoteBlock) *
                    rdma->local_ram_blocks.nb_blocks);
    local->init = true;
    return 0;
}

static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
        (void *) block_offset);
    RDMALocalBlock *old = local->block;
    int x;

    assert(block);

    if (block->pmr) {
        int j;

        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    for (x = 0; x < local->nb_blocks; x++) {
        g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
    }

    if (local->nb_blocks > 1) {
        local->block = g_malloc0(sizeof(RDMALocalBlock) *
                                    (local->nb_blocks - 1));

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                sizeof(RDMALocalBlock) *
                    (local->nb_blocks - (block->index + 1)));
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    DDPRINTF("Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
           " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
            local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
            block->length, (uint64_t) (block->local_host_addr + block->length),
            BITS_TO_LONGS(block->nb_chunks) *
                sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap, (void *)local->block[x].offset,
                                                &local->block[x]);
        }
    }

    return 0;
}

/*
 * Put in the log file which RDMA device was opened and the details
 * associated with that device.
 */
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
        return;
    }

    printf("%s RDMA Device opened: kernel name %s "
           "uverbs device name %s, "
           "infiniband_verbs class device path %s, "
           "infiniband class device path %s, "
           "transport: (%d) %s\n",
                who,
                verbs->device->name,
                verbs->device->dev_name,
                verbs->device->dev_path,
                verbs->device->ibdev_path,
                port.link_layer,
                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
                    ? "Ethernet" : "Unknown"));
}

/*
 * Put in the log file the RDMA gid addressing information,
 * useful for folks who have trouble understanding the
 * RDMA device hierarchy in the kernel.
 */
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    DPRINTF("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid);
}

/*
 * As of this writing, IPv6 over RoCE / iWARP is broken in the linux kernel.
 *
 * If the management software has asked QEMU to listen on '[::]' we cannot
 * know in advance which device an incoming connection will arrive on, so
 * scan all of them: if only RoCE / iWARP devices are present, refuse to
 * start; if there is a mix of RoCE and infiniband, warn and continue (the
 * migration will work as long as it actually travels over the infiniband
 * fabric).
 *
 * If a specific device context is already known, just verify that its
 * link layer is not Ethernet.
 */
static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
{
    struct ibv_port_attr port_attr;

    /* This bug only exists in linux, to our knowledge. */
#ifdef CONFIG_LINUX

    /*
     * Verbs are only NULL if management has bound to '[::]'.
     * Inspect all the devices to make the decision described above.
     */
    if (!verbs) {
        int num_devices, x;
        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
        bool roce_found = false;
        bool ib_found = false;

        if (!dev_list) {
            ERROR(errp, "Could not list RDMA devices");
            return -EINVAL;
        }

        for (x = 0; x < num_devices; x++) {
            verbs = ibv_open_device(dev_list[x]);
            if (!verbs) {
                continue;
            }

            if (ibv_query_port(verbs, 1, &port_attr)) {
                ibv_close_device(verbs);
                ibv_free_device_list(dev_list);
                ERROR(errp, "Could not query initial IB port");
                return -EINVAL;
            }

            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
                ib_found = true;
            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
                roce_found = true;
            }

            ibv_close_device(verbs);
        }

        ibv_free_device_list(dev_list);

        if (roce_found) {
            if (ib_found) {
                fprintf(stderr, "WARN: migrations may fail:"
                                " IPv6 over RoCE / iWARP in linux"
                                " is broken. But since you appear to have a"
                                " mixed RoCE / IB environment, be sure to only"
                                " migrate over the IB fabric until the kernel "
                                " fixes the bug.\n");
            } else {
                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
                            " and your management software has specified '[::]'"
                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
                return -ENONET;
            }
        }

        return 0;
    }

    /*
     * If we have a verbs context, traffic will travel down a specific
     * device: check that its link layer is not RoCE / iWARP Ethernet.
     */
    if (ibv_query_port(verbs, 1, &port_attr)) {
        ERROR(errp, "Could not query initial IB port");
        return -EINVAL;
    }

    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
                    "(but patches on linux-rdma in progress)");
        return -ENONET;
    }

#endif

    return 0;
}

/*
 * Figure out which RDMA device corresponds to the requested IP hostname.
 * Also create the initial connection manager identifiers for opening
 * the connection.
 */
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    struct rdma_addrinfo *e;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        ERROR(errp, "RDMA hostname has not been set");
        return -EINVAL;
    }

    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create CM channel");
        return -EINVAL;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_resolve_get_addr;
    }

    /* Try each returned address until one resolves. */
    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        DPRINTF("Trying %s => %s\n", rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                RDMA_RESOLVE_TIMEOUT_MS);
        if (!ret) {
            if (e->ai_family == AF_INET6) {
                ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
                if (ret) {
                    continue;
                }
            }
            goto route;
        }
    }

    ERROR(errp, "could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        ERROR(errp, "result not equal to event_addr_resolved %s",
                rdma_event_str(cm_event->event));
        perror("rdma_resolve_addr");
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* resolve the route */
    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        ERROR(errp, "could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        ERROR(errp, "result not equal to event_route_resolved: %s",
                rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return ret;
}

/*
 * Create protection domain and completion queues.
 */
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
{
    /* allocate pd */
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        fprintf(stderr, "failed to allocate protection domain\n");
        return -1;
    }

    /* create completion channel */
    rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->comp_channel) {
        fprintf(stderr, "failed to allocate completion channel\n");
        goto err_alloc_pd_cq;
    }

    /*
     * Completion queue can be filled by both read and write work requests,
     * so must reflect the sum of both possible queue sizes.
     */
    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
            NULL, rdma->comp_channel, 0);
    if (!rdma->cq) {
        fprintf(stderr, "failed to allocate completion queue\n");
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
    }
    rdma->pd = NULL;
    rdma->comp_channel = NULL;
    return -1;
}

/*
 * Create queue pairs.
 */
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
{
    struct ibv_qp_init_attr attr = { 0 };
    int ret;

    attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
    attr.cap.max_recv_wr = 3;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.send_cq = rdma->cq;
    attr.recv_cq = rdma->cq;
    attr.qp_type = IBV_QPT_RC;

    ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
    if (ret) {
        return -1;
    }

    rdma->qp = rdma->cm_id->qp;
    return 0;
}

static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
    int i;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    for (i = 0; i < local->nb_blocks; i++) {
        local->block[i].mr =
            ibv_reg_mr(rdma->pd,
                    local->block[i].local_host_addr,
                    local->block[i].length,
                    IBV_ACCESS_LOCAL_WRITE |
                    IBV_ACCESS_REMOTE_WRITE
                    );
        if (!local->block[i].mr) {
            perror("Failed to register local dest ram block!");
            break;
        }
        rdma->total_registrations++;
    }

    if (i >= local->nb_blocks) {
        return 0;
    }

    /* registration failed: roll back the ones that succeeded */
    for (i--; i >= 0; i--) {
        ibv_dereg_mr(local->block[i].mr);
        rdma->total_registrations--;
    }

    return -1;
}

/*
 * Find the ram block that corresponds to the page requested to be
 * transmitted by QEMU.
 *
 * Once the block is found, also identify which 'chunk' within that
 * block that the page belongs to.
 *
 * This search cannot fail or the migration will fail.
 */
static int qemu_rdma_search_ram_block(RDMAContext *rdma,
                                      uint64_t block_offset,
                                      uint64_t offset,
                                      uint64_t length,
                                      uint64_t *block_index,
                                      uint64_t *chunk_index)
{
    uint64_t current_addr = block_offset + offset;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    assert(block);
    assert(current_addr >= block->offset);
    assert((current_addr + length) <= (block->offset + block->length));

    *block_index = block->index;
    *chunk_index = ram_chunk_index(block->local_host_addr,
                block->local_host_addr + (current_addr - block->offset));

    return 0;
}

/*
 * Register a chunk with IB. If the chunk was already registered
 * previously, then skip.
 *
 * Also return the keys associated with the registration needed
 * to perform the actual RDMA operation.
 */
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
        RDMALocalBlock *block, uint8_t *host_addr,
        uint32_t *lkey, uint32_t *rkey, int chunk,
        uint8_t *chunk_start, uint8_t *chunk_end)
{
    /* The whole block was pinned up front: use its keys directly. */
    if (block->mr) {
        if (lkey) {
            *lkey = block->mr->lkey;
        }
        if (rkey) {
            *rkey = block->mr->rkey;
        }
        return 0;
    }

    /* allocate memory to store chunk MRs */
    if (!block->pmr) {
        block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *));
        if (!block->pmr) {
            return -1;
        }
    }

    /*
     * If 'rkey', then we're the destination, so grant access to the source.
     *
     * If 'lkey', then we're the source VM, so grant access only to ourselves.
     */
    if (!block->pmr[chunk]) {
        uint64_t len = chunk_end - chunk_start;

        DDPRINTF("Registering %" PRIu64 " bytes @ %p\n",
                 len, chunk_start);

        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
                chunk_start, len,
                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
                        IBV_ACCESS_REMOTE_WRITE) : 0));

        if (!block->pmr[chunk]) {
            perror("Failed to register chunk!");
            fprintf(stderr, "Chunk details: block: %d chunk index %d"
                            " start %" PRIu64 " end %" PRIu64 " host %" PRIu64
                            " local %" PRIu64 " registrations: %d\n",
                            block->index, chunk, (uint64_t) chunk_start,
                            (uint64_t) chunk_end, (uint64_t) host_addr,
                            (uint64_t) block->local_host_addr,
                            rdma->total_registrations);
            return -1;
        }
        rdma->total_registrations++;
    }

    if (lkey) {
        *lkey = block->pmr[chunk]->lkey;
    }
    if (rkey) {
        *rkey = block->pmr[chunk]->rkey;
    }
    return 0;
}
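
/*
 * Note on the two registration strategies above: with the pin-all
 * capability negotiated, every RAM block gets one whole-block MR via
 * qemu_rdma_reg_whole_ram_blocks() and this function only reads keys
 * out of block->mr. Without it, MRs are created lazily here, one
 * 1 MB chunk at a time, the first time a write touches the chunk.
 */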

/*
 * Register (at connection time) the memory used for control
 * channel messages.
 */
static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
{
    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    if (rdma->wr_data[idx].control_mr) {
        rdma->total_registrations++;
        return 0;
    }
    fprintf(stderr, "qemu_rdma_reg_control failed!\n");
    return -1;
}

const char *print_wrid(int wrid)
{
    if (wrid >= RDMA_WRID_RECV_CONTROL) {
        return wrid_desc[RDMA_WRID_RECV_CONTROL];
    }
    return wrid_desc[wrid];
}

/*
 * RDMA requires memory registration (mlock/pinning), which is not good for
 * overcommitment.
 *
 * In preparation for a future in which LRU information or workload-specific
 * writable working-set access behavior is available to QEMU, it would be
 * nice to have in place the ability to UN-register/UN-pin particular memory
 * regions from the RDMA hardware when it is believed that those regions of
 * memory will likely not be accessed again in the near future.
 *
 * While we do not yet have such information right now, the following
 * compile-time option allows us to perform a non-optimized version of this
 * behavior: by uncommenting it, *all* RDMA transfers are unregistered
 * immediately after the transfer completes, on both sides of the migration.
 *
 * This has a terrible impact on migration performance, so until future
 * workload or LRU information is available, do not use this feature except
 * for basic testing.
 */
//#define RDMA_UNREGISTRATION_EXAMPLE

/*
 * Perform a non-optimized memory unregistration after every transfer
 * for demonstration purposes, only if pin-all is not requested.
 *
 * Potential optimizations:
 * 1. Start a new thread to run this function continuously
 *       - for bit clearing
 *       - for receiving completion messages
 * 2. Use an LRU.
 * 3. Use workload hints.
 */
static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
{
    while (rdma->unregistrations[rdma->unregister_current]) {
        int ret;
        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
        uint64_t chunk =
            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block =
            &(rdma->local_ram_blocks.block[index]);
        RDMARegister reg = { .current_index = index };
        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                 };
        RDMAControlHeader head = { .len = sizeof(RDMARegister),
                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
                                   .repeat = 1,
                                 };

        DDPRINTF("Processing unregister for chunk: %" PRIu64
                 " at position %d\n", chunk, rdma->unregister_current);

        rdma->unregistrations[rdma->unregister_current] = 0;
        rdma->unregister_current++;

        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
            rdma->unregister_current = 0;
        }

        /*
         * Unregistration is speculative (because migration is single-threaded
         * and we cannot break the protocol's infiniband message ordering).
         * Thus, if the memory is currently being used for transmission,
         * then abort the attempt to unregister and try again later the next
         * time a completion is received for this memory.
         */
        clear_bit(chunk, block->unregister_bitmap);

        if (test_bit(chunk, block->transit_bitmap)) {
            DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
            continue;
        }

        DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);

        ret = ibv_dereg_mr(block->pmr[chunk]);
        block->pmr[chunk] = NULL;
        block->remote_keys[chunk] = 0;

        if (ret != 0) {
            perror("unregistration chunk failed");
            return -ret;
        }
        rdma->total_registrations--;

        reg.key.chunk = chunk;
        register_to_network(&reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                &resp, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
    }

    return 0;
}

static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
                                    uint64_t chunk)
{
    uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;

    result |= (index << RDMA_WRID_BLOCK_SHIFT);
    result |= (chunk << RDMA_WRID_CHUNK_SHIFT);

    return result;
}
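
/*
 * Example round-trip (illustrative only): given
 *     wrid = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE, 2, 5);
 * the consumers in qemu_rdma_poll() and qemu_rdma_unregister_waiting()
 * recover the fields with
 *     type  = wrid & RDMA_WRID_TYPE_MASK;                             (== 1)
 *     index = (wrid & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT; (== 2)
 *     chunk = (wrid & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT; (== 5)
 */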

/*
 * Set bit for unregistration in the next iteration.
 * We cannot transmit right here, but will unpin later.
 */
static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
                                        uint64_t chunk, uint64_t wr_id)
{
    if (rdma->unregistrations[rdma->unregister_next] != 0) {
        fprintf(stderr, "rdma migration: queue is full!\n");
    } else {
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
            DDPRINTF("Appending unregister chunk %" PRIu64
                    " at position %d\n", chunk, rdma->unregister_next);

            rdma->unregistrations[rdma->unregister_next++] =
                    qemu_rdma_make_wrid(wr_id, index, chunk);

            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
                rdma->unregister_next = 0;
            }
        } else {
            DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
                    chunk);
        }
    }
}
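
/*
 * The unregistrations[] array used above acts as a fixed-size ring of
 * RDMA_SIGNALED_SEND_MAX pending work: qemu_rdma_signal_unregister()
 * produces entries at unregister_next, qemu_rdma_unregister_waiting()
 * consumes them from unregister_current, and both indices wrap back to
 * zero. A zero entry means an empty slot, which is why a full queue is
 * only reported and the request dropped rather than queued.
 */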

/*
 * Consult the connection manager to see if a work request
 * (of any kind) has completed.
 * Returns the work request ID that completed through *wr_id_out.
 */
static int qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
                          uint32_t *byte_len)
{
    int ret;
    struct ibv_wc wc;
    uint64_t wr_id;

    ret = ibv_poll_cq(rdma->cq, 1, &wc);

    if (!ret) {
        *wr_id_out = RDMA_WRID_NONE;
        return 0;
    }

    if (ret < 0) {
        fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
        return ret;
    }

    wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;

    if (wc.status != IBV_WC_SUCCESS) {
        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
                        wc.status, ibv_wc_status_str(wc.status));
        fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);

        return -1;
    }

    if (rdma->control_ready_expected &&
        (wr_id >= RDMA_WRID_RECV_CONTROL)) {
        DDDPRINTF("completion %s #%" PRId64 " received (%" PRId64 ")"
                  " left %d\n", wrid_desc[RDMA_WRID_RECV_CONTROL],
                  wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
        rdma->control_ready_expected = 0;
    }

    if (wr_id == RDMA_WRID_RDMA_WRITE) {
        uint64_t chunk =
            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        DDDPRINTF("completions %s (%" PRId64 ") left %d, "
                  "block %" PRIu64 ", chunk: %" PRIu64 " %p %p\n",
                  print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk,
                  block->local_host_addr, (void *)block->remote_host_addr);

        clear_bit(chunk, block->transit_bitmap);

        if (rdma->nb_sent > 0) {
            rdma->nb_sent--;
        }

        if (!rdma->pin_all) {
            /*
             * FYI: If one wanted to signal a specific chunk to be
             * unregistered using LRU or workload-specific information,
             * this is the function you would call to do so. That chunk
             * would then get asynchronously unregistered later.
             */
#ifdef RDMA_UNREGISTRATION_EXAMPLE
            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
#endif
        }
    } else {
        DDDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
                  print_wrid(wr_id), wr_id, rdma->nb_sent);
    }

    *wr_id_out = wc.wr_id;
    if (byte_len) {
        *byte_len = wc.byte_len;
    }

    return 0;
}

/*
 * Block until the next work request has completed.
 *
 * First poll to see if a work request has already completed,
 * otherwise block.
 *
 * If we encounter completed work requests for IDs other than
 * the one we're interested in, then that's generally an error.
 *
 * The only exception is actual RDMA Write completions. These
 * completions only need to be recorded, but do not actually
 * need further processing.
 */
static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
                                    uint32_t *byte_len)
{
    int num_cq_events = 0, ret = 0;
    struct ibv_cq *cq;
    void *cq_ctx;
    uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

    if (ibv_req_notify_cq(rdma->cq, 0)) {
        return -1;
    }
    /* poll cq first */
    while (wr_id != wrid_requested) {
        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
        if (ret < 0) {
            return ret;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
        if (wr_id != wrid_requested) {
            DDDPRINTF("A Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
                print_wrid(wrid_requested),
                wrid_requested, print_wrid(wr_id), wr_id);
        }
    }

    if (wr_id == wrid_requested) {
        return 0;
    }

    while (1) {
        /*
         * Coroutine doesn't start until process_incoming_migration(),
         * so don't yield unless we know we're running inside of a coroutine.
         */
        if (rdma->migration_started_on_destination) {
            yield_until_fd_readable(rdma->comp_channel->fd);
        }

        if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) {
            perror("ibv_get_cq_event");
            goto err_block_for_wrid;
        }

        num_cq_events++;

        if (ibv_req_notify_cq(cq, 0)) {
            goto err_block_for_wrid;
        }

        while (wr_id != wrid_requested) {
            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
            if (ret < 0) {
                goto err_block_for_wrid;
            }

            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

            if (wr_id == RDMA_WRID_NONE) {
                break;
            }
            if (wr_id != wrid_requested) {
                DDDPRINTF("B Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
                    print_wrid(wrid_requested), wrid_requested,
                    print_wrid(wr_id), wr_id);
            }
        }

        if (wr_id == wrid_requested) {
            goto success_block_for_wrid;
        }
    }

success_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return 0;

err_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return ret;
}

/*
 * Post a SEND message work request for the control channel
 * containing the data passed in.
 */
static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
                                       RDMAControlHeader *head)
{
    int ret;
    RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
    struct ibv_send_wr *bad_wr;
    struct ibv_sge sge = {
                           .addr = (uint64_t)(wr->control),
                           .length = head->len + sizeof(RDMAControlHeader),
                           .lkey = wr->control_mr->lkey,
                         };
    struct ibv_send_wr send_wr = {
                                   .wr_id = RDMA_WRID_SEND_CONTROL,
                                   .opcode = IBV_WR_SEND,
                                   .send_flags = IBV_SEND_SIGNALED,
                                   .sg_list = &sge,
                                   .num_sge = 1,
                                };

    DDDPRINTF("CONTROL: sending %s..\n", control_desc[head->type]);

    /*
     * We don't actually need to do a memcpy() in here if we used
     * the "sge" properly, but since we're only sending control messages
     * (not RAM in a performance-critical path), then it's OK for now.
     *
     * The copy makes the RDMAControlHeader simpler to manipulate
     * for the time being.
     */
    assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
    memcpy(wr->control, head, sizeof(RDMAControlHeader));
    control_to_network((void *) wr->control);

    if (buf) {
        memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
    }

    if (ibv_post_send(rdma->qp, &send_wr, &bad_wr)) {
        fprintf(stderr, "Failed to use post IB SEND for control!\n");
        return -1;
    }

    ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
    if (ret < 0) {
        fprintf(stderr, "rdma migration: send polling control error!\n");
    }

    return ret;
}

/*
 * Post a RECV work request in anticipation of some future receipt
 * of data on the control channel.
 */
static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
{
    struct ibv_recv_wr *bad_wr;
    struct ibv_sge sge = {
                            .addr = (uint64_t)(rdma->wr_data[idx].control),
                            .length = RDMA_CONTROL_MAX_BUFFER,
                            .lkey = rdma->wr_data[idx].control_mr->lkey,
                         };

    struct ibv_recv_wr recv_wr = {
                                    .wr_id = RDMA_WRID_RECV_CONTROL + idx,
                                    .sg_list = &sge,
                                    .num_sge = 1,
                                 };

    if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
        return -1;
    }

    return 0;
}

/*
 * Block and wait for a RECV control channel message to arrive.
 */
static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
                RDMAControlHeader *head, int expecting, int idx)
{
    uint32_t byte_len;
    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
                                       &byte_len);

    if (ret < 0) {
        fprintf(stderr, "rdma migration: recv polling control error!\n");
        return ret;
    }

    network_to_control((void *) rdma->wr_data[idx].control);
    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));

    DDDPRINTF("CONTROL: %s receiving...\n", control_desc[expecting]);

    if (expecting == RDMA_CONTROL_NONE) {
        DDDPRINTF("Surprise: got %s (%d)\n",
                  control_desc[head->type], head->type);
    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
        fprintf(stderr, "Was expecting a %s (%d) control message"
                ", but got: %s (%d), length: %d\n",
                control_desc[expecting], expecting,
                control_desc[head->type], head->type, head->len);
        return -EIO;
    }
    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
        fprintf(stderr, "too long length: %d\n", head->len);
        return -EINVAL;
    }
    if (sizeof(*head) + head->len != byte_len) {
        fprintf(stderr, "Malformed length: %d byte_len %d\n",
                head->len, byte_len);
        return -EINVAL;
    }

    return 0;
}

/*
 * When a RECV work request has completed, the work request's
 * buffer is pointed at the header.
 *
 * Record where the payload starts and how long it is, so that
 * qemu_rdma_fill() can later hand the bytes to QEMUFile.
 */
static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
                                  RDMAControlHeader *head)
{
    rdma->wr_data[idx].control_len = head->len;
    rdma->wr_data[idx].control_curr =
        rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
}

/*
 * This is an 'atomic' high-level operation to deliver a single, unified
 * control-channel message.
 *
 * Additionally, if the user is expecting some kind of reply to this message,
 * they can request a 'resp' response message be filled in by posting an
 * additional work request on behalf of the user and waiting for an
 * additional completion.
 *
 * The extra (optional) response is used during registration to spare the
 * caller an *additional* exchange of messages: the response piggy-backs
 * on the acknowledgement instead.
 */
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma))
{
    int ret = 0;

    /*
     * Wait until the destination is ready before attempting to deliver
     * the message by waiting for a READY message.
     */
    if (rdma->control_ready_expected) {
        RDMAControlHeader resp;
        ret = qemu_rdma_exchange_get_response(rdma,
                                    &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
        if (ret < 0) {
            return ret;
        }
    }

    /*
     * If the user is expecting a response, post a WR in anticipation of it.
     */
    if (resp) {
        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
        if (ret) {
            fprintf(stderr, "rdma migration: error posting"
                    " extra control recv for anticipated result!");
            return ret;
        }
    }

    /*
     * Post a WR to replace the one we just consumed for the READY message.
     */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting first control recv!");
        return ret;
    }

    /*
     * Deliver the control message that was requested.
     */
    ret = qemu_rdma_post_send_control(rdma, data, head);

    if (ret < 0) {
        fprintf(stderr, "Failed to send control buffer!\n");
        return ret;
    }

    /*
     * If we're expecting a response, block and wait for it.
     */
    if (resp) {
        if (callback) {
            DDPRINTF("Issuing callback before receiving response...\n");
            ret = callback(rdma);
            if (ret < 0) {
                return ret;
            }
        }

        DDPRINTF("Waiting for response %s\n", control_desc[resp->type]);
        ret = qemu_rdma_exchange_get_response(rdma, resp,
                                              resp->type, RDMA_WRID_DATA);

        if (ret < 0) {
            return ret;
        }

        qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
        if (resp_idx) {
            *resp_idx = RDMA_WRID_DATA;
        }
        DDPRINTF("Response %s received.\n", control_desc[resp->type]);
    }

    rdma->control_ready_expected = 1;

    return 0;
}
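
/*
 * Sketch of one exchange as implemented above (source-side view):
 *
 *   1. wait for a READY message from the destination, if one is pending
 *   2. post RECVs for the expected response and for the next READY
 *   3. SEND the request (header + payload)
 *   4. optionally run a callback, then block for the response
 *
 * qemu_rdma_exchange_recv() below is the mirror image: it SENDs a
 * READY message first, then blocks for the request.
 */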

/*
 * This is an 'atomic' high-level operation to receive a single, unified
 * control-channel message.
 */
static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
                                   int expecting)
{
    RDMAControlHeader ready = {
                                .len = 0,
                                .type = RDMA_CONTROL_READY,
                                .repeat = 1,
                              };
    int ret;

    /*
     * Inform the source that we're ready to receive a message.
     */
    ret = qemu_rdma_post_send_control(rdma, NULL, &ready);

    if (ret < 0) {
        fprintf(stderr, "Failed to send control buffer!\n");
        return ret;
    }

    /*
     * Block and wait for the message.
     */
    ret = qemu_rdma_exchange_get_response(rdma, head,
                                          expecting, RDMA_WRID_READY);

    if (ret < 0) {
        return ret;
    }

    qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);

    /*
     * Post a new RECV work request to replace the one we just consumed.
     */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting second control recv!");
        return ret;
    }

    return 0;
}

/*
 * Write an actual chunk of memory using RDMA.
 *
 * If we're using dynamic registration on the destination side, we have to
 * send a registration command first.
 */
static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                               int current_index, uint64_t current_addr,
                               uint64_t length)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;
    int reg_result_idx, ret, count = 0;
    uint64_t chunk, chunks;
    uint8_t *chunk_start, *chunk_end;
    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
    RDMARegister reg;
    RDMARegisterResult *reg_result;
    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
    RDMAControlHeader head = { .len = sizeof(RDMARegister),
                               .type = RDMA_CONTROL_REGISTER_REQUEST,
                               .repeat = 1,
                             };

retry:
    sge.addr = (uint64_t)(block->local_host_addr +
                            (current_addr - block->offset));
    sge.length = length;

    chunk = ram_chunk_index(block->local_host_addr, (uint8_t *) sge.addr);
    chunk_start = ram_chunk_start(block, chunk);

    if (block->is_ram_block) {
        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    } else {
        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    }

    DDPRINTF("Writing %" PRIu64 " chunks, (%" PRIu64 " MB)\n",
        chunks + 1, (chunks + 1) * (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);

    chunk_end = ram_chunk_end(block, chunk + chunks);

    if (!rdma->pin_all) {
#ifdef RDMA_UNREGISTRATION_EXAMPLE
        qemu_rdma_unregister_waiting(rdma);
#endif
    }

    while (test_bit(chunk, block->transit_bitmap)) {
        (void)count;
        DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
                " current %" PRIu64 " len %" PRIu64 " %d %d\n",
                count++, current_index, chunk,
                sge.addr, length, rdma->nb_sent, block->nb_chunks);

        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);

        if (ret < 0) {
            fprintf(stderr, "Failed to Wait for previous write to complete "
                    "block %d chunk %" PRIu64
                    " current %" PRIu64 " len %" PRIu64 " %d\n",
                    current_index, chunk, sge.addr, length, rdma->nb_sent);
            return ret;
        }
    }

    if (!rdma->pin_all || !block->is_ram_block) {
        if (!block->remote_keys[chunk]) {
            /*
             * This chunk has not yet been registered, so first check to see
             * if the entire chunk is zero. If so, tell the other side to
             * memset() + madvise() the entire chunk without RDMA.
             */
            if (can_use_buffer_find_nonzero_offset((void *)sge.addr, length)
                   && buffer_find_nonzero_offset((void *)sge.addr,
                                                    length) == length) {
                RDMACompress comp = {
                                        .offset = current_addr,
                                        .value = 0,
                                        .block_idx = current_index,
                                        .length = length,
                                    };

                head.len = sizeof(comp);
                head.type = RDMA_CONTROL_COMPRESS;

                DDPRINTF("Entire chunk is zero, sending compress: %"
                    PRIu64 " for %d "
                    "bytes, index: %d, offset: %" PRId64 "...\n",
                    chunk, sge.length, current_index, current_addr);

                compress_to_network(&comp);
                ret = qemu_rdma_exchange_send(rdma, &head,
                                (uint8_t *) &comp, NULL, NULL, NULL);

                if (ret < 0) {
                    return -EIO;
                }

                acct_update_position(f, sge.length, true);

                return 1;
            }

            /*
             * Otherwise, tell the other side to register.
             */
            reg.current_index = current_index;
            if (block->is_ram_block) {
                reg.key.current_addr = current_addr;
            } else {
                reg.key.chunk = chunk;
            }
            reg.chunks = chunks;

            DDPRINTF("Sending registration request chunk %" PRIu64 " for %d "
                    "bytes, index: %d, offset: %" PRId64 "...\n",
                    chunk, sge.length, current_index, current_addr);

            register_to_network(&reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                    &resp, &reg_result_idx, NULL);
            if (ret < 0) {
                return ret;
            }

            /* try to overlap this single registration with the one we sent. */
            if (qemu_rdma_register_and_get_keys(rdma, block,
                                                (uint8_t *) sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                fprintf(stderr, "cannot get lkey!\n");
                return -EINVAL;
            }

            reg_result = (RDMARegisterResult *)
                    rdma->wr_data[reg_result_idx].control_curr;

            network_to_result(reg_result);

            DDPRINTF("Received registration result:"
                    " my key: %x their key %x, chunk %" PRIu64 "\n",
                    block->remote_keys[chunk], reg_result->rkey, chunk);

            block->remote_keys[chunk] = reg_result->rkey;
            block->remote_host_addr = reg_result->host_addr;
        } else {
            /* The destination already knows this chunk; just get an lkey. */
            if (qemu_rdma_register_and_get_keys(rdma, block,
                                                (uint8_t *)sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                fprintf(stderr, "cannot get lkey!\n");
                return -EINVAL;
            }
        }

        send_wr.wr.rdma.rkey = block->remote_keys[chunk];
    } else {
        send_wr.wr.rdma.rkey = block->remote_rkey;

        if (qemu_rdma_register_and_get_keys(rdma, block, (uint8_t *)sge.addr,
                                                     &sge.lkey, NULL, chunk,
                                                     chunk_start, chunk_end)) {
            fprintf(stderr, "cannot get lkey!\n");
            return -EINVAL;
        }
    }

    /*
     * Encode the ram block index and chunk within this wrid.
     * We will use this information at the time of completion
     * to figure out which bitmap to check against and then which
     * chunk in the bitmap to look for.
     */
    send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
                                        current_index, chunk);

    send_wr.opcode = IBV_WR_RDMA_WRITE;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;
    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
                                (current_addr - block->offset);

    DDDPRINTF("Posting chunk: %" PRIu64 ", addr: %lx"
              " remote: %lx, bytes %" PRIu32 "\n",
              chunk, sge.addr, send_wr.wr.rdma.remote_addr,
              sge.length);

    /*
     * ibv_post_send() does not return negative error numbers;
     * per the specification they are positive.
     */
    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret == ENOMEM) {
        DDPRINTF("send queue is full. wait a little....\n");
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: failed to make "
                            "room in full send queue! %d\n", ret);
            return ret;
        }

        goto retry;

    } else if (ret > 0) {
        fprintf(stderr, "rdma migration: post rdma write failed: %s\n",
                strerror(ret));
        return -ret;
    }

    set_bit(chunk, block->transit_bitmap);
    acct_update_position(f, sge.length, false);
    rdma->total_writes++;

    return 0;
}

/*
 * Push out any unwritten RDMA operations.
 *
 * We support sending out multiple chunks at the same time.
 * Not all of them need to get signaled in the completion queue.
 */
static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (!rdma->current_length) {
        return 0;
    }

    ret = qemu_rdma_write_one(f, rdma,
            rdma->current_index, rdma->current_addr, rdma->current_length);

    if (ret < 0) {
        return ret;
    }

    if (ret == 0) {
        rdma->nb_sent++;
        DDDPRINTF("sent total: %d\n", rdma->nb_sent);
    }

    rdma->current_length = 0;
    rdma->current_addr = 0;

    return 0;
}

static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
                    uint64_t offset, uint64_t len)
{
    RDMALocalBlock *block;
    uint8_t *host_addr;
    uint8_t *chunk_end;

    if (rdma->current_index < 0) {
        return 0;
    }

    if (rdma->current_chunk < 0) {
        return 0;
    }

    block = &(rdma->local_ram_blocks.block[rdma->current_index]);
    host_addr = block->local_host_addr + (offset - block->offset);
    chunk_end = ram_chunk_end(block, rdma->current_chunk);

    if (rdma->current_length == 0) {
        return 0;
    }

    /*
     * Only merge into chunk sequentially.
     */
    if (offset != (rdma->current_addr + rdma->current_length)) {
        return 0;
    }

    if (offset < block->offset) {
        return 0;
    }

    if ((offset + len) > (block->offset + block->length)) {
        return 0;
    }

    if ((host_addr + len) > chunk_end) {
        return 0;
    }

    return 1;
}
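
/*
 * Merging example (illustrative): three back-to-back 4 KB pages at
 * offsets 0x0000, 0x1000 and 0x2000 of the same chunk arrive as three
 * qemu_rdma_write() calls; the first starts a new buffer and the next
 * two extend current_length to 12 KB. The single merged RDMA write is
 * only issued when a page is non-contiguous, crosses a chunk or block
 * boundary, or current_length reaches RDMA_MERGE_MAX.
 */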

/*
 * We're not actually writing here, but doing three things:
 *
 * 1. Identify the chunk the buffer belongs to.
 * 2. If the chunk is full or the buffer doesn't belong to the current
 *    chunk, then start a new chunk and flush() the old chunk.
 * 3. To keep the hardware busy, we also group chunks into batches
 *    and only require that a batch gets acknowledged in the completion
 *    queue instead of each individual chunk.
 */
static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
                           uint64_t block_offset, uint64_t offset,
                           uint64_t len)
{
    uint64_t current_addr = block_offset + offset;
    uint64_t index = rdma->current_index;
    uint64_t chunk = rdma->current_chunk;
    int ret;

    /* If we cannot merge it, we flush the current buffer first. */
    if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
        ret = qemu_rdma_write_flush(f, rdma);
        if (ret) {
            return ret;
        }
        rdma->current_length = 0;
        rdma->current_addr = current_addr;

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, len, &index, &chunk);
        if (ret) {
            fprintf(stderr, "ram block search failed\n");
            return ret;
        }
        rdma->current_index = index;
        rdma->current_chunk = chunk;
    }

    /* merge it */
    rdma->current_length += len;

    /* flush it if buffer is too large */
    if (rdma->current_length >= RDMA_MERGE_MAX) {
        return qemu_rdma_write_flush(f, rdma);
    }

    return 0;
}

static void qemu_rdma_cleanup(RDMAContext *rdma)
{
    struct rdma_cm_event *cm_event;
    int ret, idx;

    if (rdma->cm_id) {
        if (rdma->error_state) {
            RDMAControlHeader head = { .len = 0,
                                       .type = RDMA_CONTROL_ERROR,
                                       .repeat = 1,
                                     };
            fprintf(stderr, "Early error. Sending error.\n");
            qemu_rdma_post_send_control(rdma, NULL, &head);
        }

        ret = rdma_disconnect(rdma->cm_id);
        if (!ret) {
            DDPRINTF("waiting for disconnect\n");
            ret = rdma_get_cm_event(rdma->channel, &cm_event);
            if (!ret) {
                rdma_ack_cm_event(cm_event);
            }
        }
        DDPRINTF("Disconnected.\n");
    }

    g_free(rdma->block);
    rdma->block = NULL;

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        if (rdma->wr_data[idx].control_mr) {
            rdma->total_registrations--;
            ibv_dereg_mr(rdma->wr_data[idx].control_mr);
        }
        rdma->wr_data[idx].control_mr = NULL;
    }

    if (rdma->local_ram_blocks.block) {
        while (rdma->local_ram_blocks.nb_blocks) {
            __qemu_rdma_delete_block(rdma,
                    rdma->local_ram_blocks.block->offset);
        }
    }

    if (rdma->qp) {
        ibv_destroy_qp(rdma->qp);
        rdma->qp = NULL;
    }
    if (rdma->cq) {
        ibv_destroy_cq(rdma->cq);
        rdma->cq = NULL;
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
        rdma->comp_channel = NULL;
    }
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
        rdma->pd = NULL;
    }
    if (rdma->listen_id) {
        rdma_destroy_id(rdma->listen_id);
        rdma->listen_id = NULL;
    }
    if (rdma->cm_id) {
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
    }
    if (rdma->channel) {
        rdma_destroy_event_channel(rdma->channel);
        rdma->channel = NULL;
    }
    g_free(rdma->host);
    rdma->host = NULL;
}

static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all)
{
    int ret, idx;
    Error *local_err = NULL, **temp = &local_err;

    /*
     * Will be validated against the destination's actual capabilities
     * after the connect() completes.
     */
    rdma->pin_all = pin_all;

    ret = qemu_rdma_resolve_host(rdma, temp);
    if (ret) {
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
                    " limits may be too low. Please check $ ulimit -a # and "
                    "search for 'ulimit -l' in the output");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating qp!");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error initializing ram blocks!");
        goto err_rdma_source_init;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            ERROR(temp, "rdma migration: error registering %d control!",
                        idx);
            goto err_rdma_source_init;
        }
    }

    return 0;

err_rdma_source_init:
    error_propagate(errp, local_err);
    qemu_rdma_cleanup(rdma);
    return -1;
}

static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
{
    RDMACapabilities cap = {
                                .version = RDMA_CONTROL_VERSION_CURRENT,
                                .flags = 0,
                           };
    struct rdma_conn_param conn_param = { .initiator_depth = 2,
                                          .retry_count = 5,
                                          .private_data = &cap,
                                          .private_data_len = sizeof(cap),
                                        };
    struct rdma_cm_event *cm_event;
    int ret;

    /*
     * Only negotiate the capability with the destination if the user
     * on the source first requested the capability.
     */
    if (rdma->pin_all) {
        DPRINTF("Server pin-all memory requested.\n");
        cap.flags |= RDMA_CAPABILITY_PIN_ALL;
    }

    caps_to_network(&cap);

    ret = rdma_connect(rdma->cm_id, &conn_param);
    if (ret) {
        perror("rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        perror("rdma_get_cm_event after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_ack_cm_event(cm_event);
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
    network_to_caps(&cap);

    /*
     * Verify that the *requested* capabilities are supported by the
     * destination and disable them otherwise.
     */
    if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
        ERROR(errp, "Server cannot support pinning all memory. "
                    "Will register memory dynamically.");
        rdma->pin_all = false;
    }

    DPRINTF("Pin all memory: %s\n", rdma->pin_all ? "enabled" : "disabled");

    rdma_ack_cm_event(cm_event);

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        ERROR(errp, "posting second control recv!");
        goto err_rdma_source_connect;
    }

    rdma->control_ready_expected = 1;
    rdma->nb_sent = 0;
    return 0;

err_rdma_source_connect:
    qemu_rdma_cleanup(rdma);
    return -1;
}

static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
    int ret = -EINVAL, idx;
    struct rdma_cm_id *listen_id;
    char ip[40] = "unknown";
    struct rdma_addrinfo *res;
    char port_str[16];

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        rdma->wr_data[idx].control_len = 0;
        rdma->wr_data[idx].control_curr = NULL;
    }

    if (rdma->host == NULL) {
        ERROR(errp, "RDMA host is not set!");
        rdma->error_state = -EINVAL;
        return -1;
    }
    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create rdma event channel");
        rdma->error_state = -EINVAL;
        return -1;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create cm_id!");
        goto err_dest_init_create_listen_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    if (rdma->host && strcmp("", rdma->host)) {
        struct rdma_addrinfo *e;

        ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
        if (ret < 0) {
            ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
            goto err_dest_init_bind_addr;
        }

        for (e = res; e != NULL; e = e->ai_next) {
            inet_ntop(e->ai_family,
                &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr,
                ip, sizeof ip);
            DPRINTF("Trying %s => %s\n", rdma->host, ip);
            ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
            if (!ret) {
                if (e->ai_family == AF_INET6) {
                    ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
                    if (ret) {
                        continue;
                    }
                }

                goto listen;
            }
        }

        ERROR(errp, "could not rdma_bind_addr!");
        goto err_dest_init_bind_addr;
    } else {
        ERROR(errp, "migration host and port not specified!");
        ret = -EINVAL;
        goto err_dest_init_bind_addr;
    }
listen:

    rdma->listen_id = listen_id;
    qemu_rdma_dump_gid("dest_init", listen_id);
    return 0;

err_dest_init_bind_addr:
    rdma_destroy_id(listen_id);
err_dest_init_create_listen_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    rdma->error_state = ret;
    return ret;
}
2492
2493static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2494{
2495 RDMAContext *rdma = NULL;
2496 InetSocketAddress *addr;
2497
2498 if (host_port) {
2499 rdma = g_malloc0(sizeof(RDMAContext));
2500 memset(rdma, 0, sizeof(RDMAContext));
2501 rdma->current_index = -1;
2502 rdma->current_chunk = -1;
2503
2504 addr = inet_parse(host_port, NULL);
2505 if (addr != NULL) {
2506 rdma->port = atoi(addr->port);
2507 rdma->host = g_strdup(addr->host);
2508 } else {
2509 ERROR(errp, "bad RDMA migration address '%s'", host_port);
2510 g_free(rdma);
2511 return NULL;
2512 }
2513 }
2514
2515 return rdma;
2516}
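
/*
 * Usage sketch for qemu_rdma_data_init() (hypothetical values): given
 * host_port = "192.168.1.10:4444", inet_parse() splits the string so that
 * rdma->host becomes "192.168.1.10" and rdma->port becomes 4444. A string
 * that inet_parse() rejects sets errp and returns NULL instead.
 */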

/*
 * QEMUFile interface to the control channel.
 * SEND messages for control only.
 * RAM itself is transferred with regular RDMA writes.
 */
static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
                                int64_t pos, int size)
{
    QEMUFileRDMA *r = opaque;
    QEMUFile *f = r->file;
    RDMAContext *rdma = r->rdma;
    size_t remaining = size;
    uint8_t *data = (void *) buf;
    int ret;

    CHECK_ERROR_STATE();

    /*
     * Push out any writes that
     * we're queued up for RAM.
     */
    ret = qemu_rdma_write_flush(f, rdma);
    if (ret < 0) {
        rdma->error_state = ret;
        return ret;
    }

    while (remaining) {
        RDMAControlHeader head;

        r->len = MIN(remaining, RDMA_SEND_INCREMENT);
        remaining -= r->len;

        head.len = r->len;
        head.type = RDMA_CONTROL_QEMU_FILE;

        ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);

        if (ret < 0) {
            rdma->error_state = ret;
            return ret;
        }

        data += r->len;
    }

    return size;
}
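
/*
 * Sizing note for qemu_rdma_put_buffer(): control payloads are sliced into
 * RDMA_SEND_INCREMENT (32768-byte) pieces, each wrapped in its own
 * RDMA_CONTROL_QEMU_FILE message. For example, a hypothetical 100000-byte
 * device-state buffer goes out as four SENDs: 32768 + 32768 + 32768 + 1696.
 */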

static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
                             int size, int idx)
{
    size_t len = 0;

    if (rdma->wr_data[idx].control_len) {
        DDDPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",
                  rdma->wr_data[idx].control_len, size);

        len = MIN(size, rdma->wr_data[idx].control_len);
        memcpy(buf, rdma->wr_data[idx].control_curr, len);
        rdma->wr_data[idx].control_curr += len;
        rdma->wr_data[idx].control_len -= len;
    }

    return len;
}

/*
 * QEMUFile interface to the control channel.
 * RDMA links don't use bytestreams, so we have to
 * return bytes to QEMUFile opportunistically.
 */
static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
                                int64_t pos, int size)
{
    QEMUFileRDMA *r = opaque;
    RDMAContext *rdma = r->rdma;
    RDMAControlHeader head;
    int ret = 0;

    CHECK_ERROR_STATE();

    /*
     * First, we hold on to the last SEND message we
     * were given and dish out the bytes until we run
     * out of bytes.
     */
    r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
    if (r->len) {
        return r->len;
    }

    /*
     * Once we run out, we block and wait for another
     * SEND message to arrive.
     */
    ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);

    if (ret < 0) {
        rdma->error_state = ret;
        return ret;
    }

    /*
     * SEND was received with new bytes, now try again.
     */
    return qemu_rdma_fill(r->rdma, buf, size, 0);
}
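
/*
 * Reader-side sketch (hypothetical sizes): if a previous SEND left 1000
 * bytes buffered and the QEMUFile layer asks for 512, qemu_rdma_fill()
 * satisfies the request from control_curr and leaves 488 bytes behind;
 * only when the buffer is empty does qemu_rdma_get_buffer() block in
 * qemu_rdma_exchange_recv() for the next RDMA_CONTROL_QEMU_FILE message.
 */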

/*
 * Block until all the outstanding chunks have been delivered by the hardware.
 */
static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (qemu_rdma_write_flush(f, rdma) < 0) {
        return -EIO;
    }

    while (rdma->nb_sent) {
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: complete polling error!\n");
            return -EIO;
        }
    }

    qemu_rdma_unregister_waiting(rdma);

    return 0;
}
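
/*
 * Why flush first in qemu_rdma_drain_cq(): writes still queued for chunk
 * merging have not been posted to the hardware yet, so waiting on the
 * completion queue without flushing could block forever. Once flushed,
 * each pass of qemu_rdma_block_for_wrid() retires one outstanding
 * RDMA_WRID_RDMA_WRITE completion until nb_sent drains to zero.
 */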

static int qemu_rdma_close(void *opaque)
{
    QEMUFileRDMA *r = opaque;

    DPRINTF("Shutting down connection.\n");
    if (r->rdma) {
        qemu_rdma_cleanup(r->rdma);
        g_free(r->rdma);
    }
    g_free(r);
    return 0;
}

/*
 * Parameters:
 *
 *    @offset == 0 :
 *        'block_offset' is a full virtual address that does not belong to a
 *        RAMBlock of the virtual machine, but rather a private malloc'd area
 *        the caller wishes to transfer.
 *
 *    @offset != 0 :
 *        'offset' is added to 'block_offset' and also used to look up the
 *        corresponding RAMBlock.
 *
 *    @size > 0 :
 *        Initiate a transfer of this size.
 *
 *    @size == 0 :
 *        A hint that we wish to speculatively and asynchronously unregister
 *        this memory. There is no guarantee that the unregister will actually
 *        happen (e.g. if the memory is being actively transmitted), and the
 *        chunk may be re-registered later if it is written to again.
 *
 *    @bytes_sent :
 *        Caller-supplied pointer reporting how many bytes were sent
 *        synchronously; most of the data is sent asynchronously, so this is
 *        usually just a token amount.
 */
static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
                                  ram_addr_t block_offset, ram_addr_t offset,
                                  size_t size, int *bytes_sent)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    int ret;

    CHECK_ERROR_STATE();

    qemu_fflush(f);

    if (size > 0) {
        /*
         * Add this page to the current 'chunk'. If the chunk
         * is full, or the page doesn't belong to the current chunk,
         * an actual RDMA write will occur and a new chunk will be formed.
         */
        ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: write error! %d\n", ret);
            goto err;
        }

        /*
         * We always report 1 byte because the RDMA protocol is completely
         * asynchronous: we do not yet know whether an identified chunk is
         * zero or not, since other pages may still be merged into the
         * current chunk before it is written.
         */
        if (bytes_sent) {
            *bytes_sent = 1;
        }
    } else {
        uint64_t index, chunk;

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, size, &index, &chunk);

        if (ret) {
            fprintf(stderr, "ram block search failed\n");
            goto err;
        }

        /*
         * Asynchronous unregistration: the chunk is only queued here;
         * the actual unregister is processed on the next call to
         * qemu_rdma_drain_cq().
         */
        qemu_rdma_signal_unregister(rdma, index, chunk, 0);
    }

    /*
     * Drain the Completion Queue if possible, but do not block,
     * just poll.
     *
     * If nothing to poll, the end of the iteration will do this
     * again to make sure we don't overflow the request queue.
     */
    while (1) {
        uint64_t wr_id, wr_id_in;
        ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: polling error! %d\n", ret);
            goto err;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    return RAM_SAVE_CONTROL_DELAYED;
err:
    rdma->error_state = ret;
    return ret;
}
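
/*
 * Work-request id decoding sketch: the polling loop above only needs the
 * low bits of the 64-bit wr_id. Assuming the sender packs ids as
 *
 *     wr_id = (chunk << RDMA_WRID_CHUNK_SHIFT) |
 *             (block << RDMA_WRID_BLOCK_SHIFT) |
 *             RDMA_WRID_RDMA_WRITE;
 *
 * then a completion for block 3, chunk 5 carries
 * wr_id_in = (5UL << 30) | (3UL << 16) | 1, and
 * (wr_id_in & RDMA_WRID_TYPE_MASK) recovers RDMA_WRID_RDMA_WRITE == 1.
 */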

static int qemu_rdma_accept(RDMAContext *rdma)
{
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
        .responder_resources = 2,
        .private_data = &cap,
        .private_data_len = sizeof(cap),
    };
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret = -EINVAL;
    int idx;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_rdma_dest_wait;
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        fprintf(stderr, "Unknown source RDMA version: %d, bailing...\n",
                cap.version);
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_rdma_dest_wait;
    }

    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */
    cap.flags &= known_capabilities;

    /*
     * Enable the ones that we do know about.
     * Add other checks here as additional capabilities are introduced.
     */
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    DPRINTF("Memory pin all: %s\n", rdma->pin_all ? "enabled" : "disabled");

    caps_to_network(&cap);

    DPRINTF("verbs context after listen: %p\n", verbs);

    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        fprintf(stderr, "ibv context not matching %p, %p!\n",
                rdma->verbs, verbs);
        ret = -EINVAL;
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error allocating qp!\n");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
        goto err_rdma_dest_wait;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            fprintf(stderr, "rdma: error registering %d control!\n", idx);
            goto err_rdma_dest_wait;
        }
    }

    qemu_set_fd_handler2(rdma->channel->fd, NULL, NULL, NULL, NULL);

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret) {
        fprintf(stderr, "rdma_accept returns %d!\n", ret);
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        fprintf(stderr, "rdma_accept get_cm_event failed %d!\n", ret);
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        fprintf(stderr, "rdma_accept not event established!\n");
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting second control recv!\n");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->error_state = ret;
    qemu_rdma_cleanup(rdma);
    return ret;
}
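
/*
 * CM event sequence on the destination, as handled above (standard
 * librdmacm flow): RDMA_CM_EVENT_CONNECT_REQUEST delivers the source's
 * capabilities in private_data; rdma_accept() echoes the negotiated
 * capabilities back the same way; RDMA_CM_EVENT_ESTABLISHED then confirms
 * the connection before the first control RECV is posted.
 */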

/*
 * During each iteration of the migration, we listen for instructions
 * by the source VM to perform dynamic page registrations before they
 * can perform RDMA operations.
 *
 * We respond with the 'rkey'.
 *
 * Keep doing this until the source tells us to stop.
 */
static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
                                         uint64_t flags)
{
    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
                                   .type = RDMA_CONTROL_REGISTER_RESULT,
                                   .repeat = 0,
                                 };
    RDMAControlHeader unreg_resp = { .len = 0,
                                     .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                     .repeat = 0,
                                   };
    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                 .repeat = 1 };
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMAControlHeader head;
    RDMARegister *reg, *registers;
    RDMACompress *comp;
    RDMARegisterResult *reg_result;
    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
    RDMALocalBlock *block;
    void *host_addr;
    int ret = 0;
    int idx = 0;
    int count = 0;
    int i = 0;

    CHECK_ERROR_STATE();

    do {
        DDDPRINTF("Waiting for next request %" PRIu64 "...\n", flags);

        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);

        if (ret < 0) {
            break;
        }

        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            fprintf(stderr, "rdma: Too many requests in this message (%d)."
                            " Bailing.\n", head.repeat);
            ret = -EIO;
            break;
        }

        switch (head.type) {
        case RDMA_CONTROL_COMPRESS:
            comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
            network_to_compress(comp);

            DDPRINTF("Zapping zero chunk: %" PRId64
                     " bytes, index %d, offset %" PRId64 "\n",
                     comp->length, comp->block_idx, comp->offset);
            block = &(rdma->local_ram_blocks.block[comp->block_idx]);

            host_addr = block->local_host_addr +
                            (comp->offset - block->offset);

            ram_handle_compressed(host_addr, comp->value, comp->length);
            break;

        case RDMA_CONTROL_REGISTER_FINISHED:
            DDDPRINTF("Current registrations complete.\n");
            goto out;

        case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
            DPRINTF("Initial setup info requested.\n");

            if (rdma->pin_all) {
                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                if (ret) {
                    fprintf(stderr, "rdma migration: error dest "
                                    "registering ram blocks!\n");
                    goto out;
                }
            }

            /*
             * Dest uses this to prepare to transmit the RAMBlock descriptions
             * to the source VM after connection setup.
             * Both sides use the "remote" structure to communicate and update
             * their "local" descriptions with what was sent.
             */
            for (i = 0; i < local->nb_blocks; i++) {
                rdma->block[i].remote_host_addr =
                    (uint64_t)(local->block[i].local_host_addr);

                if (rdma->pin_all) {
                    rdma->block[i].remote_rkey = local->block[i].mr->rkey;
                }

                rdma->block[i].offset = local->block[i].offset;
                rdma->block[i].length = local->block[i].length;

                remote_block_to_network(&rdma->block[i]);
            }

            blocks.len = rdma->local_ram_blocks.nb_blocks
                                                * sizeof(RDMARemoteBlock);

            ret = qemu_rdma_post_send_control(rdma,
                                        (uint8_t *) rdma->block, &blocks);

            if (ret < 0) {
                fprintf(stderr, "rdma migration: error sending remote info!\n");
                goto out;
            }

            break;
        case RDMA_CONTROL_REGISTER_REQUEST:
            DDPRINTF("There are %d registration requests\n", head.repeat);

            reg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                uint64_t chunk;
                uint8_t *chunk_start, *chunk_end;

                reg = &registers[count];
                network_to_register(reg);

                reg_result = &results[count];

                DDPRINTF("Registration request (%d): index %d, current_addr %"
                         PRIu64 " chunks: %" PRIu64 "\n", count,
                         reg->current_index, reg->key.current_addr,
                         reg->chunks);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);
                if (block->is_ram_block) {
                    host_addr = (block->local_host_addr +
                                (reg->key.current_addr - block->offset));
                    chunk = ram_chunk_index(block->local_host_addr,
                                            (uint8_t *) host_addr);
                } else {
                    chunk = reg->key.chunk;
                    host_addr = block->local_host_addr +
                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
                }
                chunk_start = ram_chunk_start(block, chunk);
                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
                if (qemu_rdma_register_and_get_keys(rdma, block,
                            (uint8_t *)host_addr, NULL, &reg_result->rkey,
                            chunk, chunk_start, chunk_end)) {
                    fprintf(stderr, "cannot get rkey!\n");
                    ret = -EINVAL;
                    goto out;
                }

                reg_result->host_addr = (uint64_t) block->local_host_addr;

                DDPRINTF("Registered rkey for this request: %x\n",
                         reg_result->rkey);

                result_to_network(reg_result);
            }

            ret = qemu_rdma_post_send_control(rdma,
                            (uint8_t *) results, &reg_resp);

            if (ret < 0) {
                fprintf(stderr, "Failed to send control buffer!\n");
                goto out;
            }
            break;
        case RDMA_CONTROL_UNREGISTER_REQUEST:
            DDPRINTF("There are %d unregistration requests\n", head.repeat);
            unreg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                reg = &registers[count];
                network_to_register(reg);

                DDPRINTF("Unregistration request (%d): "
                         " index %d, chunk %" PRIu64 "\n",
                         count, reg->current_index, reg->key.chunk);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);

                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
                block->pmr[reg->key.chunk] = NULL;

                if (ret != 0) {
                    perror("rdma unregistration chunk failed");
                    ret = -ret;
                    goto out;
                }

                rdma->total_registrations--;

                DDPRINTF("Unregistered chunk %" PRIu64 " successfully.\n",
                         reg->key.chunk);
            }

            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);

            if (ret < 0) {
                fprintf(stderr, "Failed to send control buffer!\n");
                goto out;
            }
            break;
        case RDMA_CONTROL_REGISTER_RESULT:
            fprintf(stderr, "Invalid RESULT message at dest.\n");
            ret = -EIO;
            goto out;
        default:
            fprintf(stderr, "Unknown control message %s\n",
                    control_desc[head.type]);
            ret = -EIO;
            goto out;
        }
    } while (1);
out:
    if (ret < 0) {
        rdma->error_state = ret;
    }
    return ret;
}
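
/*
 * Message batching sketch for the handler above: a REGISTER_REQUEST with
 * head.repeat == N carries N RDMARegister entries in one control message
 * (N <= RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE == 4096), and the single
 * REGISTER_RESULT reply answers all of them with one RDMARegisterResult
 * per request (reg_resp.repeat == N), so both directions amortize many
 * chunk registrations over a single SEND.
 */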

static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
                                        uint64_t flags)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;

    CHECK_ERROR_STATE();

    DDDPRINTF("start section: %" PRIu64 "\n", flags);
    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
    qemu_fflush(f);

    return 0;
}

/*
 * Inform dest that dynamic registrations are done for now.
 * First, flush writes, if any.
 */
static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
                                       uint64_t flags)
{
    Error *local_err = NULL, **errp = &local_err;
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    RDMAControlHeader head = { .len = 0, .repeat = 1 };
    int ret = 0;

    CHECK_ERROR_STATE();

    qemu_fflush(f);
    ret = qemu_rdma_drain_cq(f, rdma);

    if (ret < 0) {
        goto err;
    }

    if (flags == RAM_CONTROL_SETUP) {
        RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
        RDMALocalBlocks *local = &rdma->local_ram_blocks;
        int reg_result_idx, i, j, nb_remote_blocks;

        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
        DPRINTF("Sending registration setup for ram blocks...\n");

        /*
         * Make sure that we parallelize the pinning on both sides.
         * For very large guests, doing this serially takes a really
         * long time, so we have to 'interleave' the pinning locally
         * by performing it on this side before we receive the control
         * response from the other side that its pinning is complete.
         */
        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                    &reg_result_idx, rdma->pin_all ?
                    qemu_rdma_reg_whole_ram_blocks : NULL);
        if (ret < 0) {
            ERROR(errp, "receiving remote info!");
            return ret;
        }

        nb_remote_blocks = resp.len / sizeof(RDMARemoteBlock);

        /*
         * The protocol uses two different sets of rkeys (mutually exclusive):
         * 1. One key representing the virtual address of the entire ram block
         *    (dynamic chunk registration disabled: pin everything with one
         *    rkey).
         * 2. One key per individual chunk within a ram block
         *    (dynamic chunk registration enabled: pin individual chunks).
         *
         * Once the capability is negotiated, the destination transmits the
         * keys to use (or sends them later), including the virtual addresses,
         * and the source updates its local ram block descriptions with the
         * remote ones.
         */
        if (local->nb_blocks != nb_remote_blocks) {
            ERROR(errp, "ram blocks mismatch #1! "
                        "Your QEMU command line parameters are probably "
                        "not identical on both the source and destination.");
            return -EINVAL;
        }

        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
        memcpy(rdma->block,
               rdma->wr_data[reg_result_idx].control_curr, resp.len);
        for (i = 0; i < nb_remote_blocks; i++) {
            network_to_remote_block(&rdma->block[i]);

            /* search local ram blocks */
            for (j = 0; j < local->nb_blocks; j++) {
                if (rdma->block[i].offset != local->block[j].offset) {
                    continue;
                }

                if (rdma->block[i].length != local->block[j].length) {
                    ERROR(errp, "ram blocks mismatch #2! "
                                "Your QEMU command line parameters are "
                                "probably not identical on both the source "
                                "and destination.");
                    return -EINVAL;
                }
                local->block[j].remote_host_addr =
                        rdma->block[i].remote_host_addr;
                local->block[j].remote_rkey = rdma->block[i].remote_rkey;
                break;
            }

            if (j >= local->nb_blocks) {
                ERROR(errp, "ram blocks mismatch #3! "
                            "Your QEMU command line parameters are probably "
                            "not identical on both the source and "
                            "destination.");
                return -EINVAL;
            }
        }
    }

    DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags);

    head.type = RDMA_CONTROL_REGISTER_FINISHED;
    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);

    if (ret < 0) {
        goto err;
    }

    return 0;
err:
    rdma->error_state = ret;
    return ret;
}

static int qemu_rdma_get_fd(void *opaque)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;

    return rdma->comp_channel->fd;
}

const QEMUFileOps rdma_read_ops = {
    .get_buffer    = qemu_rdma_get_buffer,
    .get_fd        = qemu_rdma_get_fd,
    .close         = qemu_rdma_close,
    .hook_ram_load = qemu_rdma_registration_handle,
};

const QEMUFileOps rdma_write_ops = {
    .put_buffer         = qemu_rdma_put_buffer,
    .close              = qemu_rdma_close,
    .before_ram_iterate = qemu_rdma_registration_start,
    .after_ram_iterate  = qemu_rdma_registration_stop,
    .save_page          = qemu_rdma_save_page,
};
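
/*
 * Wiring sketch: qemu_fopen_rdma() below attaches one of these ops tables
 * to a QEMUFile, so e.g. a qemu_put_buffer() on the source side lands in
 * qemu_rdma_put_buffer(), and the RAM-iteration hooks bracket each memory
 * pass with qemu_rdma_registration_start()/_stop().
 */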

static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
{
    QEMUFileRDMA *r;

    if (qemu_file_mode_is_not_valid(mode)) {
        return NULL;
    }

    r = g_malloc0(sizeof(QEMUFileRDMA));
    r->rdma = rdma;

    if (mode[0] == 'w') {
        r->file = qemu_fopen_ops(r, &rdma_write_ops);
    } else {
        r->file = qemu_fopen_ops(r, &rdma_read_ops);
    }

    return r->file;
}

static void rdma_accept_incoming_migration(void *opaque)
{
    RDMAContext *rdma = opaque;
    int ret;
    QEMUFile *f;
    Error *local_err = NULL, **errp = &local_err;

    DPRINTF("Accepting rdma connection...\n");
    ret = qemu_rdma_accept(rdma);

    if (ret) {
        ERROR(errp, "RDMA Migration initialization failed!");
        return;
    }

    DPRINTF("Accepted migration\n");

    f = qemu_fopen_rdma(rdma, "rb");
    if (f == NULL) {
        ERROR(errp, "could not qemu_fopen_rdma!");
        qemu_rdma_cleanup(rdma);
        return;
    }

    rdma->migration_started_on_destination = 1;
    process_incoming_migration(f);
}

void rdma_start_incoming_migration(const char *host_port, Error **errp)
{
    int ret;
    RDMAContext *rdma;
    Error *local_err = NULL;

    DPRINTF("Starting RDMA-based incoming migration\n");
    rdma = qemu_rdma_data_init(host_port, &local_err);

    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_dest_init(rdma, &local_err);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_dest_init success\n");

    ret = rdma_listen(rdma->listen_id, 5);

    if (ret) {
        ERROR(errp, "listening on socket!");
        goto err;
    }

    DPRINTF("rdma_listen success\n");

    qemu_set_fd_handler2(rdma->channel->fd, NULL,
                         rdma_accept_incoming_migration, NULL,
                         (void *)(intptr_t) rdma);
    return;
err:
    error_propagate(errp, local_err);
    g_free(rdma);
}

void rdma_start_outgoing_migration(void *opaque,
                                   const char *host_port, Error **errp)
{
    MigrationState *s = opaque;
    Error *local_err = NULL, **temp = &local_err;
    RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err);
    int ret = 0;

    if (rdma == NULL) {
        ERROR(temp, "Failed to initialize RDMA data structures!");
        goto err;
    }

    ret = qemu_rdma_source_init(rdma, &local_err,
        s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_source_init success\n");
    ret = qemu_rdma_connect(rdma, &local_err);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_connect success\n");

    s->file = qemu_fopen_rdma(rdma, "wb");
    migrate_fd_connect(s);
    return;
err:
    error_propagate(errp, local_err);
    g_free(rdma);
    migrate_fd_error(s);
}
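
/*
 * End-to-end usage sketch (QEMU 1.6-era experimental syntax; adjust to the
 * release actually in use):
 *
 *   destination:  qemu-system-x86_64 ... -incoming x-rdma:192.168.1.10:4444
 *   source (HMP): migrate_set_capability x-rdma-pin-all on   # optional
 *                 migrate -d x-rdma:192.168.1.10:4444
 *
 * The incoming side runs rdma_start_incoming_migration(); the 'migrate'
 * command reaches rdma_start_outgoing_migration() with the same host:port
 * string, minus the "x-rdma:" prefix.
 */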