#include "qemu-common.h"
#include "migration/migration.h"
#include "migration/qemu-file.h"
#include "exec/cpu-common.h"
#include "qemu/main-loop.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "block/coroutine.h"
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <string.h>
#include <rdma/rdma_cma.h>
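/*
 * Three levels of compile-time debug printing: DPRINTF for high-level
 * events, DDPRINTF for per-chunk detail, DDDPRINTF for per-completion
 * noise.
 */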
#ifdef DEBUG_RDMA
#define DPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#ifdef DEBUG_RDMA_VERBOSE
#define DDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDPRINTF(fmt, ...) \
    do { } while (0)
#endif

#ifdef DEBUG_RDMA_REALLY_VERBOSE
#define DDDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDDPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)

#define RDMA_RESOLVE_TIMEOUT_MS 10000

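/*
 * Contiguous pages are merged into a single RDMA write of up to
 * RDMA_MERGE_MAX bytes before being flushed to the wire, and the
 * number of outstanding signaled sends is bounded accordingly.
 * Registration happens in chunks of (1UL << RDMA_REG_CHUNK_SHIFT)
 * bytes, i.e. 1 MB.
 */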
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

#define RDMA_REG_CHUNK_SHIFT 20

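/*
 * The migration stream itself (device state, commands) travels over
 * the control channel in pieces of at most RDMA_SEND_INCREMENT bytes;
 * each pre-registered control buffer is RDMA_CONTROL_MAX_BUFFER bytes.
 */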
#define RDMA_SEND_INCREMENT 32768

#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1

#define RDMA_CAPABILITY_PIN_ALL 0x01

static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;

#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                fprintf(stderr, "RDMA is in an error state waiting for" \
                        " migration to abort!\n"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)
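/*
 * A single 64-bit work request ID multiplexes three fields:
 *
 *   bits  0..15  completion type (RDMA_WRID_*)
 *   bits 16..29  RAM block index
 *   bits 30..63  chunk index within the block
 *
 * For example, a write to chunk 5 of block 2 is encoded as
 * RDMA_WRID_RDMA_WRITE | (2 << RDMA_WRID_BLOCK_SHIFT)
 *                      | (5 << RDMA_WRID_CHUNK_SHIFT)
 * and decoded again in qemu_rdma_poll() when the completion arrives.
 */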
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)

enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};
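/*
 * Each of these "slots" owns one pre-registered control buffer:
 * READY for the ready-to-receive handshake, DATA for anticipated
 * command responses, CONTROL for outgoing sends.
 */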
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};

enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,
    RDMA_CONTROL_QEMU_FILE,
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,
    RDMA_CONTROL_RAM_BLOCKS_RESULT,
    RDMA_CONTROL_COMPRESS,
    RDMA_CONTROL_REGISTER_REQUEST,
    RDMA_CONTROL_REGISTER_RESULT,
    RDMA_CONTROL_REGISTER_FINISHED,
    RDMA_CONTROL_UNREGISTER_REQUEST,
    RDMA_CONTROL_UNREGISTER_FINISHED,
};

const char *control_desc[] = {
    [RDMA_CONTROL_NONE] = "NONE",
    [RDMA_CONTROL_ERROR] = "ERROR",
    [RDMA_CONTROL_READY] = "READY",
    [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
    [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
    [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
    [RDMA_CONTROL_COMPRESS] = "COMPRESS",
    [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
    [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
    [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
    [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
    [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
};
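/*
 * One pre-registered buffer (and its memory region) per control slot.
 * control_curr/control_len track how much of the last received
 * message has been consumed by qemu_rdma_fill().
 */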
typedef struct {
    uint8_t control[RDMA_CONTROL_MAX_BUFFER];
    struct ibv_mr *control_mr;
    size_t control_len;
    uint8_t *control_curr;
} RDMAWorkRequestData;
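/*
 * Capabilities are negotiated inside the private_data of the
 * rdma_connect() exchange, in network byte order.
 */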
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;

static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}
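/*
 * Local description of a RAM block: where it lives locally, how the
 * peer addresses it remotely, and per-chunk registration state.
 * 'pmr' holds one memory region per chunk when registering
 * dynamically; 'mr' covers the whole block when pinning everything
 * up front.
 */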
typedef struct RDMALocalBlock {
    uint8_t *local_host_addr;
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    struct ibv_mr **pmr;
    struct ibv_mr *mr;
    uint32_t *remote_keys;
    uint32_t remote_rkey;
    int index;
    bool is_ram_block;
    int nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;
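/*
 * Wire format used to describe a RAM block to the peer.  Kept
 * QEMU_PACKED and padded to a multiple of 64 bits; all fields are
 * byte-swapped by the helpers below.
 */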
typedef struct QEMU_PACKED RDMARemoteBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMARemoteBlock;

static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}

static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}

static void remote_block_to_network(RDMARemoteBlock *rb)
{
    rb->remote_host_addr = htonll(rb->remote_host_addr);
    rb->offset = htonll(rb->offset);
    rb->length = htonll(rb->length);
    rb->remote_rkey = htonl(rb->remote_rkey);
}

static void network_to_remote_block(RDMARemoteBlock *rb)
{
    rb->remote_host_addr = ntohll(rb->remote_host_addr);
    rb->offset = ntohll(rb->offset);
    rb->length = ntohll(rb->length);
    rb->remote_rkey = ntohl(rb->remote_rkey);
}

typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool init;
    RDMALocalBlock *block;
} RDMALocalBlocks;

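/*
 * The main state bag for an RDMA migration: the connection management
 * objects (cm_id, verbs, qp, cq, ...), the current unflushed write
 * (current_addr/length/index/chunk), the RAM block tables, and the
 * ring of chunks waiting to be unregistered.
 */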
typedef struct RDMAContext {
    char *host;
    int port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    int control_ready_expected;

    int nb_sent;

    uint64_t current_addr;
    uint64_t current_length;

    int current_index;

    int current_chunk;

    bool pin_all;

    struct rdma_cm_id *cm_id;
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;
    struct ibv_comp_channel *comp_channel;
    struct ibv_pd *pd;
    struct ibv_cq *cq;

    int error_state;
    int error_reported;

    RDMALocalBlocks local_ram_blocks;
    RDMARemoteBlock *block;

    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;
} RDMAContext;
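/*
 * Glue between a QEMUFile and the RDMAContext behind it.
 */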
typedef struct QEMUFileRDMA {
    RDMAContext *rdma;
    size_t len;
    void *file;
} QEMUFileRDMA;
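/*
 * Every message on the control channel starts with this header, in
 * network byte order.  'repeat' lets a single message carry several
 * commands of the same type back-to-back, bounded by
 * RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE.
 */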
typedef struct QEMU_PACKED {
    uint32_t len;
    uint32_t type;
    uint32_t repeat;
    uint32_t padding;
} RDMAControlHeader;

static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}
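/*
 * Payload of REGISTER_REQUEST and UNREGISTER_REQUEST commands: which
 * block, plus either the address being written (registering a RAM
 * block) or the chunk number (everything else, including
 * unregistration).
 */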
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;
        uint64_t chunk;
    } key;
    uint32_t current_index;
    uint32_t padding;
    uint64_t chunks;
} RDMARegister;

static void register_to_network(RDMARegister *reg)
{
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}

typedef struct QEMU_PACKED {
    uint32_t value;
    uint32_t block_idx;
    uint64_t offset;
    uint64_t length;
} RDMACompress;

static void compress_to_network(RDMACompress *comp)
{
    comp->value = htonl(comp->value);
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}
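/*
 * Destination's reply to a REGISTER_REQUEST: the rkey to use for the
 * RDMA write and the remote virtual address of the block.
 */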
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;

static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}

const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));

static inline uint64_t ram_chunk_index(const uint8_t *start,
                                       const uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                       uint64_t i)
{
    return (uint8_t *)(((uintptr_t) rdma_ram_block->local_host_addr)
                       + (i << RDMA_REG_CHUNK_SHIFT));
}

static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                     uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
                      (1UL << RDMA_REG_CHUNK_SHIFT);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}
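/*
 * Start tracking a new RAM block: grow the block array, rebuild the
 * offset -> block hash table, and allocate the per-chunk bitmaps
 * ('transit' for writes in flight, 'unregister' for chunks queued
 * for unregistration).
 */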
static int __qemu_rdma_add_block(RDMAContext *rdma, void *host_addr,
                                 ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    RDMALocalBlock *old = local->block;

    assert(block == NULL);

    local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));

    if (local->nb_blocks) {
        int x;

        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
            g_hash_table_insert(rdma->blockmap, (void *)old[x].offset,
                                &local->block[x]);
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t));

    block->is_ram_block = !local->init;

    g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);

    DDPRINTF("Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
             " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
             local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
             block->length, (uint64_t) (block->local_host_addr + block->length),
             BITS_TO_LONGS(block->nb_chunks) *
                 sizeof(unsigned long) * 8, block->nb_chunks);

    local->nb_blocks++;

    return 0;
}
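/*
 * Callback handed to qemu_ram_foreach_block() below.
 */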
static void qemu_rdma_init_one_block(void *host_addr,
    ram_addr_t block_offset, ram_addr_t length, void *opaque)
{
    __qemu_rdma_add_block(opaque, host_addr, block_offset, length);
}
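/*
 * Build the local RAM block table once per context, before any
 * registration traffic is generated.
 */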
static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    assert(rdma->blockmap == NULL);
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    memset(local, 0, sizeof *local);
    qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
    DPRINTF("Allocated %d local ram block structures\n", local->nb_blocks);
    rdma->block = g_malloc0(sizeof(RDMARemoteBlock) *
                            rdma->local_ram_blocks.nb_blocks);
    local->init = true;
    return 0;
}

static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    RDMALocalBlock *old = local->block;
    int x;

    assert(block);

    if (block->pmr) {
        int j;

        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    for (x = 0; x < local->nb_blocks; x++) {
        g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
    }

    if (local->nb_blocks > 1) {

        local->block = g_malloc0(sizeof(RDMALocalBlock) *
                                 (local->nb_blocks - 1));

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                   sizeof(RDMALocalBlock) *
                       (local->nb_blocks - (block->index + 1)));
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    DDPRINTF("Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
             " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
             local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
             block->length, (uint64_t) (block->local_host_addr + block->length),
             BITS_TO_LONGS(block->nb_chunks) *
                 sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap, (void *)local->block[x].offset,
                                &local->block[x]);
        }
    }

    return 0;
}
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
        return;
    }

    printf("%s RDMA Device opened: kernel name %s "
           "uverbs device name %s, "
           "infiniband_verbs class device path %s, "
           "infiniband class device path %s, "
           "transport: (%d) %s\n",
           who,
           verbs->device->name,
           verbs->device->dev_name,
           verbs->device->dev_path,
           verbs->device->ibdev_path,
           port.link_layer,
           (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
            ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
             ? "Ethernet" : "Unknown"));
}

static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    DPRINTF("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid);
}
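/*
 * IPv6 over RoCE / iWARP is not routable by the Linux kernel as of
 * this writing, so connections over '[::]' can only be trusted on
 * real InfiniBand links.  When we have no verbs context yet (the
 * destination listening on '[::]'), scan every device: refuse to run
 * with RoCE/iWARP only, and warn on mixed RoCE/IB hosts.  With a
 * verbs context in hand, just check that one port.
 */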
static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
{
    struct ibv_port_attr port_attr;

#ifdef CONFIG_LINUX
    if (!verbs) {
        int num_devices, x;
        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
        bool roce_found = false;
        bool ib_found = false;

        for (x = 0; x < num_devices; x++) {
            verbs = ibv_open_device(dev_list[x]);

            if (ibv_query_port(verbs, 1, &port_attr)) {
                ibv_close_device(verbs);
                ibv_free_device_list(dev_list);
                ERROR(errp, "Could not query initial IB port");
                return -EINVAL;
            }

            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
                ib_found = true;
            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
                roce_found = true;
            }

            ibv_close_device(verbs);
        }

        /* the device list is not needed past this point */
        ibv_free_device_list(dev_list);

        if (roce_found) {
            if (ib_found) {
                fprintf(stderr, "WARN: migrations may fail:"
                                " IPv6 over RoCE / iWARP in Linux"
                                " is broken. But since you appear to have a"
                                " mixed RoCE / IB environment, be sure to only"
                                " migrate over the IB fabric until the kernel"
                                " fixes the bug.\n");
            } else {
                ERROR(errp, "You only have RoCE / iWARP devices in your system"
                            " and your management software has specified '[::]'"
                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
                return -ENONET;
            }
        }

        return 0;
    }

    if (ibv_query_port(verbs, 1, &port_attr)) {
        ERROR(errp, "Could not query initial IB port");
        return -EINVAL;
    }

    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
                    "(patches on linux-rdma are in progress)");
        return -ENONET;
    }
#endif

    return 0;
}
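/*
 * Source side: resolve the destination address and route, binding
 * this context to whichever RDMA device the kernel picks for it.
 */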
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    struct rdma_addrinfo *e;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        ERROR(errp, "RDMA hostname has not been set");
        return -EINVAL;
    }

    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create CM channel");
        return -EINVAL;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_resolve_get_addr;
    }

    for (e = res; e != NULL; e = e->ai_next) {
        if (e->ai_family == AF_INET6) {
            inet_ntop(e->ai_family,
                &((struct sockaddr_in6 *) e->ai_dst_addr)->sin6_addr,
                ip, sizeof ip);
        } else {
            inet_ntop(e->ai_family,
                &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr,
                ip, sizeof ip);
        }
        DPRINTF("Trying %s => %s\n", rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                                RDMA_RESOLVE_TIMEOUT_MS);
        if (!ret) {
            if (e->ai_family == AF_INET6) {
                ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
                if (ret) {
                    continue;
                }
            }
            rdma_freeaddrinfo(res);
            goto route;
        }
    }

    rdma_freeaddrinfo(res);
    ERROR(errp, "could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        ERROR(errp, "result not equal to event_addr_resolved %s",
              rdma_event_str(cm_event->event));
        perror("rdma_resolve_addr");
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* resolve route */
    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        ERROR(errp, "could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        ERROR(errp, "result not equal to event_route_resolved: %s",
              rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return ret;
}
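/*
 * Create the protection domain, the completion event channel, and a
 * completion queue sized to three entries per possible outstanding
 * signaled send, leaving headroom for control-channel completions.
 */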
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
{
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        fprintf(stderr, "failed to allocate protection domain\n");
        return -1;
    }

    rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->comp_channel) {
        fprintf(stderr, "failed to allocate completion channel\n");
        goto err_alloc_pd_cq;
    }

    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                             NULL, rdma->comp_channel, 0);
    if (!rdma->cq) {
        fprintf(stderr, "failed to allocate completion queue\n");
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
    }
    rdma->pd = NULL;
    rdma->comp_channel = NULL;
    return -1;
}
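/*
 * Create a reliably-connected queue pair on top of the PD and CQ.
 */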
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
{
    struct ibv_qp_init_attr attr = { 0 };
    int ret;

    attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
    attr.cap.max_recv_wr = 3;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.send_cq = rdma->cq;
    attr.recv_cq = rdma->cq;
    attr.qp_type = IBV_QPT_RC;

    ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
    if (ret) {
        return -1;
    }

    rdma->qp = rdma->cm_id->qp;
    return 0;
}

static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
    int i;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    for (i = 0; i < local->nb_blocks; i++) {
        local->block[i].mr =
            ibv_reg_mr(rdma->pd,
                       local->block[i].local_host_addr,
                       local->block[i].length,
                       IBV_ACCESS_LOCAL_WRITE |
                       IBV_ACCESS_REMOTE_WRITE
                       );
        if (!local->block[i].mr) {
            perror("Failed to register local dest ram block");
            break;
        }
        rdma->total_registrations++;
    }

    if (i >= local->nb_blocks) {
        return 0;
    }

    for (i--; i >= 0; i--) {
        ibv_dereg_mr(local->block[i].mr);
        rdma->total_registrations--;
    }

    return -1;
}
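/*
 * Translate a (block offset, offset within block) pair from the
 * migration core into our block index and chunk index.
 */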
static int qemu_rdma_search_ram_block(RDMAContext *rdma,
                                      uint64_t block_offset,
                                      uint64_t offset,
                                      uint64_t length,
                                      uint64_t *block_index,
                                      uint64_t *chunk_index)
{
    uint64_t current_addr = block_offset + offset;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    assert(block);
    assert(current_addr >= block->offset);
    assert((current_addr + length) <= (block->offset + block->length));

    *block_index = block->index;
    *chunk_index = ram_chunk_index(block->local_host_addr,
                block->local_host_addr + (current_addr - block->offset));

    return 0;
}
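/*
 * Find or create the memory region covering this chunk.  A
 * whole-block region (pin-all) takes precedence; otherwise chunks are
 * registered on demand, with remote-write access only when the caller
 * actually needs an rkey.
 */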
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
        RDMALocalBlock *block, uint8_t *host_addr,
        uint32_t *lkey, uint32_t *rkey, int chunk,
        uint8_t *chunk_start, uint8_t *chunk_end)
{
    if (block->mr) {
        if (lkey) {
            *lkey = block->mr->lkey;
        }
        if (rkey) {
            *rkey = block->mr->rkey;
        }
        return 0;
    }

    /* allocate memory to store chunk MRs (g_malloc0 never returns NULL) */
    if (!block->pmr) {
        block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *));
    }

    if (!block->pmr[chunk]) {
        uint64_t len = chunk_end - chunk_start;

        DDPRINTF("Registering %" PRIu64 " bytes @ %p\n",
                 len, chunk_start);

        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
                chunk_start, len,
                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
                         IBV_ACCESS_REMOTE_WRITE) : 0));

        if (!block->pmr[chunk]) {
            perror("Failed to register chunk!");
            fprintf(stderr, "Chunk details: block: %d chunk index %d"
                            " start %" PRIu64 " end %" PRIu64 " host %" PRIu64
                            " local %" PRIu64 " registrations: %d\n",
                            block->index, chunk, (uint64_t) chunk_start,
                            (uint64_t) chunk_end, (uint64_t) host_addr,
                            (uint64_t) block->local_host_addr,
                            rdma->total_registrations);
            return -1;
        }
        rdma->total_registrations++;
    }

    if (lkey) {
        *lkey = block->pmr[chunk]->lkey;
    }
    if (rkey) {
        *rkey = block->pmr[chunk]->rkey;
    }
    return 0;
}
static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
{
    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    if (rdma->wr_data[idx].control_mr) {
        rdma->total_registrations++;
        return 0;
    }
    fprintf(stderr, "qemu_rdma_reg_control failed!\n");
    return -1;
}

const char *print_wrid(int wrid)
{
    if (wrid >= RDMA_WRID_RECV_CONTROL) {
        return wrid_desc[RDMA_WRID_RECV_CONTROL];
    }
    return wrid_desc[wrid];
}
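/*
 * Unregistration is lazy: write completions queue chunk
 * unregistrations into a fixed ring (see qemu_rdma_signal_unregister()
 * below), and this function drains the ring, skipping chunks that
 * have gone back into transit, deregistering the rest and telling the
 * destination to drop its mapping via RDMA_CONTROL_UNREGISTER_REQUEST.
 */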
static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
{
    while (rdma->unregistrations[rdma->unregister_current]) {
        int ret;
        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
        uint64_t chunk =
            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block =
            &(rdma->local_ram_blocks.block[index]);
        RDMARegister reg = { .current_index = index };
        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                 };
        RDMAControlHeader head = { .len = sizeof(RDMARegister),
                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
                                   .repeat = 1,
                                 };

        DDPRINTF("Processing unregister for chunk: %" PRIu64
                 " at position %d\n", chunk, rdma->unregister_current);

        rdma->unregistrations[rdma->unregister_current] = 0;
        rdma->unregister_current++;

        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
            rdma->unregister_current = 0;
        }

        /*
         * Unregistration is speculative: clear the queued bit first,
         * so that a racing write can re-queue the chunk, then skip it
         * if it has meanwhile gone back into transit.
         */
        clear_bit(chunk, block->unregister_bitmap);

        if (test_bit(chunk, block->transit_bitmap)) {
            DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
            continue;
        }

        DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);

        ret = ibv_dereg_mr(block->pmr[chunk]);
        block->pmr[chunk] = NULL;
        block->remote_keys[chunk] = 0;

        if (ret != 0) {
            perror("unregistration chunk failed");
            return -ret;
        }
        rdma->total_registrations--;

        reg.key.chunk = chunk;
        register_to_network(&reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                      &resp, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
    }

    return 0;
}

static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
                                    uint64_t chunk)
{
    uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;

    result |= (index << RDMA_WRID_BLOCK_SHIFT);
    result |= (chunk << RDMA_WRID_CHUNK_SHIFT);

    return result;
}
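/*
 * Queue a chunk for later unregistration, unless it is already
 * queued or the ring is full.
 */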
static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
                                        uint64_t chunk, uint64_t wr_id)
{
    if (rdma->unregistrations[rdma->unregister_next] != 0) {
        fprintf(stderr, "rdma migration: queue is full!\n");
    } else {
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
            DDPRINTF("Appending unregister chunk %" PRIu64
                     " at position %d\n", chunk, rdma->unregister_next);

            rdma->unregistrations[rdma->unregister_next++] =
                    qemu_rdma_make_wrid(wr_id, index, chunk);

            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
                rdma->unregister_next = 0;
            }
        } else {
            DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
                     chunk);
        }
    }
}
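/*
 * Consume at most one completion from the CQ without blocking.  The
 * raw wr_id (and byte count) are returned through the out parameters;
 * *wr_id_out is RDMA_WRID_NONE when the CQ is empty.  Also does the
 * per-completion bookkeeping: clearing the transit bit, decrementing
 * nb_sent, and acknowledging READY messages.
 */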
static int qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
                          uint32_t *byte_len)
{
    int ret;
    struct ibv_wc wc;
    uint64_t wr_id;

    ret = ibv_poll_cq(rdma->cq, 1, &wc);

    if (!ret) {
        *wr_id_out = RDMA_WRID_NONE;
        return 0;
    }

    if (ret < 0) {
        fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
        return ret;
    }

    wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;

    if (wc.status != IBV_WC_SUCCESS) {
        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
                wc.status, ibv_wc_status_str(wc.status));
        fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);

        return -1;
    }

    if (rdma->control_ready_expected &&
        (wr_id >= RDMA_WRID_RECV_CONTROL)) {
        DDDPRINTF("completion %s #%" PRId64 " received (%" PRId64 ")"
                  " left %d\n", wrid_desc[RDMA_WRID_RECV_CONTROL],
                  wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
        rdma->control_ready_expected = 0;
    }

    if (wr_id == RDMA_WRID_RDMA_WRITE) {
        uint64_t chunk =
            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        DDDPRINTF("completions %s (%" PRId64 ") left %d, "
                  "block %" PRIu64 ", chunk: %" PRIu64 " %p %p\n",
                  print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk,
                  block->local_host_addr, (void *)block->remote_host_addr);

        clear_bit(chunk, block->transit_bitmap);

        if (rdma->nb_sent > 0) {
            rdma->nb_sent--;
        }

        if (!rdma->pin_all) {
            /*
             * This is the spot where one would signal a specific chunk
             * for asynchronous unregistration, using LRU or
             * workload-specific information.
             */
#ifdef RDMA_UNREGISTRATION_EXAMPLE
            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
#endif
        }
    } else {
        DDDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
                  print_wrid(wr_id), wr_id, rdma->nb_sent);
    }

    *wr_id_out = wc.wr_id;
    if (byte_len) {
        *byte_len = wc.byte_len;
    }

    return 0;
}
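/*
 * Block until a completion of the requested type arrives: first drain
 * the CQ by polling, then sleep on the completion channel and poll
 * again after every event.  On the destination this runs inside a
 * coroutine, so instead of blocking in the kernel we yield until the
 * channel's fd becomes readable.
 */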
static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
                                    uint32_t *byte_len)
{
    int num_cq_events = 0, ret = 0;
    struct ibv_cq *cq;
    void *cq_ctx;
    uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;

    if (ibv_req_notify_cq(rdma->cq, 0)) {
        return -1;
    }

    /* poll the CQ first, before sleeping on the channel */
    while (wr_id != wrid_requested) {
        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
        if (ret < 0) {
            return ret;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
        if (wr_id != wrid_requested) {
            DDDPRINTF("A Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
                      print_wrid(wrid_requested),
                      wrid_requested, print_wrid(wr_id), wr_id);
        }
    }

    if (wr_id == wrid_requested) {
        return 0;
    }

    while (1) {
        /*
         * The coroutine does not start until process_incoming_migration(),
         * so only yield when we know we are running inside of one.
         */
        if (rdma->migration_started_on_destination) {
            yield_until_fd_readable(rdma->comp_channel->fd);
        }

        if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) {
            perror("ibv_get_cq_event");
            goto err_block_for_wrid;
        }

        num_cq_events++;

        if (ibv_req_notify_cq(cq, 0)) {
            goto err_block_for_wrid;
        }

        while (wr_id != wrid_requested) {
            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
            if (ret < 0) {
                goto err_block_for_wrid;
            }

            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

            if (wr_id == RDMA_WRID_NONE) {
                break;
            }
            if (wr_id != wrid_requested) {
                DDDPRINTF("B Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
                          print_wrid(wrid_requested), wrid_requested,
                          print_wrid(wr_id), wr_id);
            }
        }

        if (wr_id == wrid_requested) {
            goto success_block_for_wrid;
        }
    }

success_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return 0;

err_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return ret;
}
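/*
 * Post a control message (header plus optional payload) as an IB SEND
 * and wait for its completion.  The peer must already have a receive
 * posted for it; that is what the READY handshake in
 * qemu_rdma_exchange_send() guarantees.
 */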
static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
                                       RDMAControlHeader *head)
{
    int ret = 0;
    RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
    struct ibv_send_wr *bad_wr;
    struct ibv_sge sge = {
                           .addr = (uint64_t)(wr->control),
                           .length = head->len + sizeof(RDMAControlHeader),
                           .lkey = wr->control_mr->lkey,
                         };
    struct ibv_send_wr send_wr = {
                                   .wr_id = RDMA_WRID_SEND_CONTROL,
                                   .opcode = IBV_WR_SEND,
                                   .send_flags = IBV_SEND_SIGNALED,
                                   .sg_list = &sge,
                                   .num_sge = 1,
                                };

    DDDPRINTF("CONTROL: sending %s..\n", control_desc[head->type]);

    /*
     * The memcpy() could be avoided by pointing the sge at the
     * caller's buffers, but control messages are not in a
     * performance-critical path and the copy keeps the
     * RDMAControlHeader simpler to manipulate.
     */
    assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
    memcpy(wr->control, head, sizeof(RDMAControlHeader));
    control_to_network((void *) wr->control);

    if (buf) {
        memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
    }

    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret > 0) {
        fprintf(stderr, "Failed to use post IB SEND for control!\n");
        return -ret;
    }

    ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
    if (ret < 0) {
        fprintf(stderr, "rdma migration: send polling control error!\n");
    }

    return ret;
}
static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
{
    struct ibv_recv_wr *bad_wr;
    struct ibv_sge sge = {
                            .addr = (uint64_t)(rdma->wr_data[idx].control),
                            .length = RDMA_CONTROL_MAX_BUFFER,
                            .lkey = rdma->wr_data[idx].control_mr->lkey,
                         };

    struct ibv_recv_wr recv_wr = {
                                    .wr_id = RDMA_WRID_RECV_CONTROL + idx,
                                    .sg_list = &sge,
                                    .num_sge = 1,
                                 };

    if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
        return -1;
    }

    return 0;
}
static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
                RDMAControlHeader *head, int expecting, int idx)
{
    uint32_t byte_len;
    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
                                       &byte_len);

    if (ret < 0) {
        fprintf(stderr, "rdma migration: recv polling control error!\n");
        return ret;
    }

    network_to_control((void *) rdma->wr_data[idx].control);
    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));

    DDDPRINTF("CONTROL: %s receiving...\n", control_desc[expecting]);

    if (expecting == RDMA_CONTROL_NONE) {
        DDDPRINTF("Surprise: got %s (%d)\n",
                  control_desc[head->type], head->type);
    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
        fprintf(stderr, "Was expecting a %s (%d) control message"
                ", but got: %s (%d), length: %d\n",
                control_desc[expecting], expecting,
                control_desc[head->type], head->type, head->len);
        return -EIO;
    }
    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
        fprintf(stderr, "too long length: %d\n", head->len);
        return -EINVAL;
    }
    if (sizeof(*head) + head->len != byte_len) {
        fprintf(stderr, "Malformed length: %d byte_len %d\n",
                head->len, byte_len);
        return -EINVAL;
    }

    return 0;
}
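/*
 * After a control message has been received into a slot's buffer,
 * record where the payload begins and how long it is, for later
 * consumption by qemu_rdma_fill() or the caller.
 */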
static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
                                  RDMAControlHeader *head)
{
    rdma->wr_data[idx].control_len = head->len;
    rdma->wr_data[idx].control_curr =
        rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
}
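/*
 * Source-side half of the control-channel protocol.  The sequence for
 * one command is:
 *
 *   1. Wait for the destination's READY from the previous round.
 *   2. Post receives for the anticipated response (if any) and for
 *      the destination's next READY, so nothing can race the send.
 *   3. Send the command.
 *   4. Optionally run a callback (e.g. registering all of RAM)
 *      before blocking on the response.
 *   5. Receive the response and hand its payload to the caller via
 *      resp_idx.
 */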
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma))
{
    int ret = 0;

    /*
     * Wait until the destination is ready before attempting to deliver
     * the message, by waiting for a READY message.
     */
    if (rdma->control_ready_expected) {
        RDMAControlHeader ready;
        ret = qemu_rdma_exchange_get_response(rdma,
                                &ready, RDMA_CONTROL_READY, RDMA_WRID_READY);
        if (ret < 0) {
            return ret;
        }
    }

    /*
     * If the user is expecting a response, post a WR in anticipation of it.
     */
    if (resp) {
        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
        if (ret) {
            fprintf(stderr, "rdma migration: error posting"
                    " extra control recv for anticipated result!");
            return ret;
        }
    }

    /*
     * Post a WR to replace the one we just consumed for the READY message.
     */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting first control recv!");
        return ret;
    }

    /*
     * Deliver the control message that was requested.
     */
    ret = qemu_rdma_post_send_control(rdma, data, head);

    if (ret < 0) {
        fprintf(stderr, "Failed to send control buffer!\n");
        return ret;
    }

    /*
     * If we're expecting a response, block and wait for it.
     */
    if (resp) {
        if (callback) {
            DDPRINTF("Issuing callback before receiving response...\n");
            ret = callback(rdma);
            if (ret < 0) {
                return ret;
            }
        }

        DDPRINTF("Waiting for response %s\n", control_desc[resp->type]);
        ret = qemu_rdma_exchange_get_response(rdma, resp,
                                              resp->type, RDMA_WRID_DATA);

        if (ret < 0) {
            return ret;
        }

        qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
        if (resp_idx) {
            *resp_idx = RDMA_WRID_DATA;
        }
        DDPRINTF("Response %s received.\n", control_desc[resp->type]);
    }

    rdma->control_ready_expected = 1;

    return 0;
}
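/*
 * Destination-side half: announce READY, block until the expected
 * command arrives, then re-arm the receive for the round after.
 */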
static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
                                   int expecting)
{
    RDMAControlHeader ready = {
                                .len = 0,
                                .type = RDMA_CONTROL_READY,
                                .repeat = 1,
                              };
    int ret;

    /*
     * Inform the source that we're ready to receive a message.
     */
    ret = qemu_rdma_post_send_control(rdma, NULL, &ready);

    if (ret < 0) {
        fprintf(stderr, "Failed to send control buffer!\n");
        return ret;
    }

    /*
     * Block and wait for the message.
     */
    ret = qemu_rdma_exchange_get_response(rdma, head,
                                          expecting, RDMA_WRID_READY);

    if (ret < 0) {
        return ret;
    }

    qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);

    /*
     * Post a new RECV work request to replace the one we just consumed.
     */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting second control recv!");
        return ret;
    }

    return 0;
}
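/*
 * Issue one RDMA write covering the currently merged range.  If the
 * chunk is entirely zero, a COMPRESS command is sent instead of any
 * data.  If the chunk's rkey is unknown, a registration round-trip
 * fetches it first.  A full send queue is handled by waiting for a
 * write completion and retrying from the top.
 */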
static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                               int current_index, uint64_t current_addr,
                               uint64_t length)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;
    int reg_result_idx, ret, count = 0;
    uint64_t chunk, chunks;
    uint8_t *chunk_start, *chunk_end;
    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
    RDMARegister reg;
    RDMARegisterResult *reg_result;
    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
    RDMAControlHeader head = { .len = sizeof(RDMARegister),
                               .type = RDMA_CONTROL_REGISTER_REQUEST,
                               .repeat = 1,
                             };

retry:
    sge.addr = (uint64_t)(block->local_host_addr +
                          (current_addr - block->offset));
    sge.length = length;

    chunk = ram_chunk_index(block->local_host_addr, (uint8_t *) sge.addr);
    chunk_start = ram_chunk_start(block, chunk);

    if (block->is_ram_block) {
        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    } else {
        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    }

    DDPRINTF("Writing %" PRIu64 " chunks, (%" PRIu64 " MB)\n",
        chunks + 1, (chunks + 1) * (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);

    chunk_end = ram_chunk_end(block, chunk + chunks);

    if (!rdma->pin_all) {
#ifdef RDMA_UNREGISTRATION_EXAMPLE
        qemu_rdma_unregister_waiting(rdma);
#endif
    }

    while (test_bit(chunk, block->transit_bitmap)) {
        (void)count;
        DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
                " current %" PRIu64 " len %" PRIu64 " %d %d\n",
                count++, current_index, chunk,
                sge.addr, length, rdma->nb_sent, block->nb_chunks);

        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);

        if (ret < 0) {
            fprintf(stderr, "Failed to Wait for previous write to complete "
                    "block %d chunk %" PRIu64
                    " current %" PRIu64 " len %" PRIu64 " %d\n",
                    current_index, chunk, sge.addr, length, rdma->nb_sent);
            return ret;
        }
    }

    if (!rdma->pin_all || !block->is_ram_block) {
        if (!block->remote_keys[chunk]) {
            /*
             * This chunk has not yet been registered, so first check to
             * see if the entire chunk is zero.  If so, tell the other
             * side to memset() the chunk without any RDMA transfer.
             */
            if (can_use_buffer_find_nonzero_offset((void *)sge.addr, length)
                   && buffer_find_nonzero_offset((void *)sge.addr,
                                                 length) == length) {
                RDMACompress comp = {
                                      .offset = current_addr,
                                      .value = 0,
                                      .block_idx = current_index,
                                      .length = length,
                                    };

                head.len = sizeof(comp);
                head.type = RDMA_CONTROL_COMPRESS;

                DDPRINTF("Entire chunk is zero, sending compress: %"
                    PRIu64 " for %d "
                    "bytes, index: %d, offset: %" PRId64 "...\n",
                    chunk, sge.length, current_index, current_addr);

                compress_to_network(&comp);
                ret = qemu_rdma_exchange_send(rdma, &head,
                                (uint8_t *) &comp, NULL, NULL, NULL);

                if (ret < 0) {
                    return -EIO;
                }

                acct_update_position(f, sge.length, true);

                return 1;
            }

            /*
             * Otherwise, tell the other side to register the chunk.
             */
            reg.current_index = current_index;
            if (block->is_ram_block) {
                reg.key.current_addr = current_addr;
            } else {
                reg.key.chunk = chunk;
            }
            reg.chunks = chunks;

            DDPRINTF("Sending registration request chunk %" PRIu64 " for %d "
                    "bytes, index: %d, offset: %" PRId64 "...\n",
                    chunk, sge.length, current_index, current_addr);

            register_to_network(&reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                    &resp, &reg_result_idx, NULL);
            if (ret < 0) {
                return ret;
            }

            /* overlap our own registration with the round-trip we just sent */
            if (qemu_rdma_register_and_get_keys(rdma, block,
                                                (uint8_t *) sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                fprintf(stderr, "cannot get lkey!\n");
                return -EINVAL;
            }

            reg_result = (RDMARegisterResult *)
                    rdma->wr_data[reg_result_idx].control_curr;

            network_to_result(reg_result);

            DDPRINTF("Received registration result:"
                    " my key: %x their key %x, chunk %" PRIu64 "\n",
                    block->remote_keys[chunk], reg_result->rkey, chunk);

            block->remote_keys[chunk] = reg_result->rkey;
            block->remote_host_addr = reg_result->host_addr;
        } else {
            /* already registered before */
            if (qemu_rdma_register_and_get_keys(rdma, block,
                                                (uint8_t *)sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                fprintf(stderr, "cannot get lkey!\n");
                return -EINVAL;
            }
        }

        send_wr.wr.rdma.rkey = block->remote_keys[chunk];
    } else {
        send_wr.wr.rdma.rkey = block->remote_rkey;

        if (qemu_rdma_register_and_get_keys(rdma, block, (uint8_t *)sge.addr,
                                                     &sge.lkey, NULL, chunk,
                                                     chunk_start, chunk_end)) {
            fprintf(stderr, "cannot get lkey!\n");
            return -EINVAL;
        }
    }

    /*
     * Encode the RAM block index and chunk within this wrid, so the
     * completion handler knows which bitmap and chunk to update.
     */
    send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
                                        current_index, chunk);

    send_wr.opcode = IBV_WR_RDMA_WRITE;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;
    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
                                  (current_addr - block->offset);

    DDDPRINTF("Posting chunk: %" PRIu64 ", addr: %lx"
              " remote: %lx, bytes %" PRIu32 "\n",
              chunk, sge.addr, send_wr.wr.rdma.remote_addr,
              sge.length);

    /* ibv_post_send() returns positive error numbers, not negative ones */
    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret == ENOMEM) {
        DDPRINTF("send queue is full. wait a little....\n");
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: failed to make "
                            "room in full send queue! %d\n", ret);
            return ret;
        }

        goto retry;

    } else if (ret > 0) {
        perror("rdma migration: post rdma write failed");
        return -ret;
    }

    set_bit(chunk, block->transit_bitmap);
    acct_update_position(f, sge.length, false);
    rdma->total_writes++;

    return 0;
}
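/*
 * Push out the current merged write, if any, and account for it.
 */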
static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (!rdma->current_length) {
        return 0;
    }

    ret = qemu_rdma_write_one(f, rdma,
            rdma->current_index, rdma->current_addr, rdma->current_length);

    if (ret < 0) {
        return ret;
    }

    if (ret == 0) {
        rdma->nb_sent++;
        DDDPRINTF("sent total: %d\n", rdma->nb_sent);
    }

    rdma->current_length = 0;
    rdma->current_addr = 0;

    return 0;
}

static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
                    uint64_t offset, uint64_t len)
{
    RDMALocalBlock *block;
    uint8_t *host_addr;
    uint8_t *chunk_end;

    if (rdma->current_index < 0) {
        return 0;
    }

    if (rdma->current_chunk < 0) {
        return 0;
    }

    block = &(rdma->local_ram_blocks.block[rdma->current_index]);
    host_addr = block->local_host_addr + (offset - block->offset);
    chunk_end = ram_chunk_end(block, rdma->current_chunk);

    if (rdma->current_length == 0) {
        return 0;
    }

    /*
     * Only merge into the current chunk sequentially.
     */
    if (offset != (rdma->current_addr + rdma->current_length)) {
        return 0;
    }

    if (offset < block->offset) {
        return 0;
    }

    if ((offset + len) > (block->offset + block->length)) {
        return 0;
    }

    if ((host_addr + len) > chunk_end) {
        return 0;
    }

    return 1;
}
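/*
 * Entry point for RAM writes.  If the incoming page is contiguous
 * with the current unflushed range (same block, same chunk, directly
 * adjacent), just extend that range; otherwise flush it and start a
 * new one.  Ranges are force-flushed once they reach RDMA_MERGE_MAX.
 */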
static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
                           uint64_t block_offset, uint64_t offset,
                           uint64_t len)
{
    uint64_t current_addr = block_offset + offset;
    uint64_t index = rdma->current_index;
    uint64_t chunk = rdma->current_chunk;
    int ret;

    /* if we cannot merge it, we flush the current buffer first */
    if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
        ret = qemu_rdma_write_flush(f, rdma);
        if (ret) {
            return ret;
        }
        rdma->current_length = 0;
        rdma->current_addr = current_addr;

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, len, &index, &chunk);
        if (ret) {
            fprintf(stderr, "ram block search failed\n");
            return ret;
        }
        rdma->current_index = index;
        rdma->current_chunk = chunk;
    }

    /* merge it */
    rdma->current_length += len;

    /* flush it if the buffer is too large */
    if (rdma->current_length >= RDMA_MERGE_MAX) {
        return qemu_rdma_write_flush(f, rdma);
    }

    return 0;
}

static void qemu_rdma_cleanup(RDMAContext *rdma)
{
    struct rdma_cm_event *cm_event;
    int ret, idx;

    if (rdma->cm_id && rdma->connected) {
        if (rdma->error_state) {
            RDMAControlHeader head = { .len = 0,
                                       .type = RDMA_CONTROL_ERROR,
                                       .repeat = 1,
                                     };
            fprintf(stderr, "Early error. Sending error.\n");
            qemu_rdma_post_send_control(rdma, NULL, &head);
        }

        ret = rdma_disconnect(rdma->cm_id);
        if (!ret) {
            DDPRINTF("waiting for disconnect\n");
            ret = rdma_get_cm_event(rdma->channel, &cm_event);
            if (!ret) {
                rdma_ack_cm_event(cm_event);
            }
        }
        DDPRINTF("Disconnected.\n");
        rdma->connected = false;
    }

    g_free(rdma->block);
    rdma->block = NULL;

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        if (rdma->wr_data[idx].control_mr) {
            rdma->total_registrations--;
            ibv_dereg_mr(rdma->wr_data[idx].control_mr);
        }
        rdma->wr_data[idx].control_mr = NULL;
    }

    if (rdma->local_ram_blocks.block) {
        while (rdma->local_ram_blocks.nb_blocks) {
            __qemu_rdma_delete_block(rdma,
                    rdma->local_ram_blocks.block->offset);
        }
    }

    if (rdma->qp) {
        rdma_destroy_qp(rdma->cm_id);
        rdma->qp = NULL;
    }
    if (rdma->cq) {
        ibv_destroy_cq(rdma->cq);
        rdma->cq = NULL;
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
        rdma->comp_channel = NULL;
    }
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
        rdma->pd = NULL;
    }
    if (rdma->listen_id) {
        rdma_destroy_id(rdma->listen_id);
        rdma->listen_id = NULL;
    }
    if (rdma->cm_id) {
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
    }
    if (rdma->channel) {
        rdma_destroy_event_channel(rdma->channel);
        rdma->channel = NULL;
    }
    g_free(rdma->host);
    rdma->host = NULL;
}
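/*
 * Bring up the source side as far as possible without talking to the
 * destination yet: resolve the peer, allocate PD/CQ/QP, build the RAM
 * block table, and register the control buffers.
 */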
static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all)
{
    int ret, idx;
    Error *local_err = NULL, **temp = &local_err;

    /*
     * Will be validated against the destination's actual capabilities
     * after the connect() completes.
     */
    rdma->pin_all = pin_all;

    ret = qemu_rdma_resolve_host(rdma, temp);
    if (ret) {
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
                    " limits may be too low. Please check $ ulimit -a # and "
                    "search for 'ulimit -l' in the output");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating qp!");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error initializing ram blocks!");
        goto err_rdma_source_init;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            ERROR(temp, "rdma migration: error registering %d control!",
                        idx);
            goto err_rdma_source_init;
        }
    }

    return 0;

err_rdma_source_init:
    error_propagate(errp, local_err);
    qemu_rdma_cleanup(rdma);
    return -1;
}
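/*
 * Connect to the destination, advertising our capabilities in the
 * private data and dropping back to dynamic registration if the
 * destination cannot pin everything.
 */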
static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
{
    RDMACapabilities cap = {
                                .version = RDMA_CONTROL_VERSION_CURRENT,
                                .flags = 0,
                           };
    struct rdma_conn_param conn_param = { .initiator_depth = 2,
                                          .retry_count = 5,
                                          .private_data = &cap,
                                          .private_data_len = sizeof(cap),
                                        };
    struct rdma_cm_event *cm_event;
    int ret;

    /*
     * Only negotiate the capability with the destination if the user
     * on the source first requested the capability.
     */
    if (rdma->pin_all) {
        DPRINTF("Server pin-all memory requested.\n");
        cap.flags |= RDMA_CAPABILITY_PIN_ALL;
    }

    caps_to_network(&cap);

    ret = rdma_connect(rdma->cm_id, &conn_param);
    if (ret) {
        perror("rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        perror("rdma_get_cm_event after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_ack_cm_event(cm_event);
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
        goto err_rdma_source_connect;
    }
    rdma->connected = true;

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
    network_to_caps(&cap);

    /*
     * Verify that the *requested* capabilities are supported by the
     * destination and disable them otherwise.
     */
    if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
        ERROR(errp, "Server cannot support pinning all memory. "
                    "Will register memory dynamically.");
        rdma->pin_all = false;
    }

    DPRINTF("Pin all memory: %s\n", rdma->pin_all ? "enabled" : "disabled");

    rdma_ack_cm_event(cm_event);

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        ERROR(errp, "posting second control recv!");
        goto err_rdma_source_connect;
    }

    rdma->control_ready_expected = 1;
    rdma->nb_sent = 0;
    return 0;

err_rdma_source_connect:
    qemu_rdma_cleanup(rdma);
    return -1;
}
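/*
 * Destination side: create the event channel and a listener id bound
 * to the requested address; the actual listen happens later.
 */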
static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
    int ret = -EINVAL, idx;
    struct rdma_cm_id *listen_id;
    char ip[40] = "unknown";
    struct rdma_addrinfo *res;
    char port_str[16];

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        rdma->wr_data[idx].control_len = 0;
        rdma->wr_data[idx].control_curr = NULL;
    }

    if (rdma->host == NULL) {
        ERROR(errp, "RDMA host is not set!");
        rdma->error_state = -EINVAL;
        return -1;
    }

    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create rdma event channel");
        rdma->error_state = -EINVAL;
        return -1;
    }

    ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create cm_id!");
        goto err_dest_init_create_listen_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    if (rdma->host && strcmp("", rdma->host)) {
        struct rdma_addrinfo *e;

        ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
        if (ret < 0) {
            ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
            goto err_dest_init_bind_addr;
        }

        for (e = res; e != NULL; e = e->ai_next) {
            if (e->ai_family == AF_INET6) {
                inet_ntop(e->ai_family,
                    &((struct sockaddr_in6 *) e->ai_dst_addr)->sin6_addr,
                    ip, sizeof ip);
            } else {
                inet_ntop(e->ai_family,
                    &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr,
                    ip, sizeof ip);
            }
            DPRINTF("Trying %s => %s\n", rdma->host, ip);
            ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
            if (!ret) {
                if (e->ai_family == AF_INET6) {
                    ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
                    if (ret) {
                        continue;
                    }
                }

                rdma_freeaddrinfo(res);
                goto listen;
            }
        }

        rdma_freeaddrinfo(res);
        ERROR(errp, "could not rdma_bind_addr!");
        goto err_dest_init_bind_addr;
    } else {
        ERROR(errp, "migration host and port not specified!");
        ret = -EINVAL;
        goto err_dest_init_bind_addr;
    }
listen:

    rdma->listen_id = listen_id;
    qemu_rdma_dump_gid("dest_init", listen_id);
    return 0;

err_dest_init_bind_addr:
    rdma_destroy_id(listen_id);
err_dest_init_create_listen_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    rdma->error_state = ret;
    return ret;
}

static void *qemu_rdma_data_init(const char *host_port, Error **errp)
{
    RDMAContext *rdma = NULL;
    InetSocketAddress *addr;

    if (host_port) {
        rdma = g_malloc0(sizeof(RDMAContext));
        rdma->current_index = -1;
        rdma->current_chunk = -1;

        addr = inet_parse(host_port, NULL);
        if (addr != NULL) {
            rdma->port = atoi(addr->port);
            rdma->host = g_strdup(addr->host);
        } else {
            ERROR(errp, "bad RDMA migration address '%s'", host_port);
            g_free(rdma);
            return NULL;
        }
    }

    return rdma;
}

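/*
 * QEMUFile interface to the control channel.
 * SEND messages for control only.
 * VM's ram is handled with regular RDMA messages.
 */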
static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
                                int64_t pos, int size)
{
    QEMUFileRDMA *r = opaque;
    QEMUFile *f = r->file;
    RDMAContext *rdma = r->rdma;
    size_t remaining = size;
    uint8_t *data = (void *) buf;
    int ret;

    CHECK_ERROR_STATE();

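    /*
     * Push out any writes that
     * we're queued up for VM's ram.
     */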
    ret = qemu_rdma_write_flush(f, rdma);
    if (ret < 0) {
        rdma->error_state = ret;
        return ret;
    }

    while (remaining) {
        RDMAControlHeader head;

        r->len = MIN(remaining, RDMA_SEND_INCREMENT);
        remaining -= r->len;

        head.len = r->len;
        head.type = RDMA_CONTROL_QEMU_FILE;

        ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);

        if (ret < 0) {
            rdma->error_state = ret;
            return ret;
        }

        data += r->len;
    }

    return size;
}

static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
                             int size, int idx)
{
    size_t len = 0;

    if (rdma->wr_data[idx].control_len) {
        DDDPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",
                  rdma->wr_data[idx].control_len, size);

        len = MIN(size, rdma->wr_data[idx].control_len);
        memcpy(buf, rdma->wr_data[idx].control_curr, len);
        rdma->wr_data[idx].control_curr += len;
        rdma->wr_data[idx].control_len -= len;
    }

    return len;
}

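/*
 * QEMUFile interface to the control channel.
 * RDMA links don't use bytestreams, so we have to
 * return bytes to QEMUFile opportunistically.
 */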
static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
                                int64_t pos, int size)
{
    QEMUFileRDMA *r = opaque;
    RDMAContext *rdma = r->rdma;
    RDMAControlHeader head;
    int ret = 0;

    CHECK_ERROR_STATE();

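    /*
     * First, we hold on to the last SEND message we
     * were given and dish out the bytes until we run
     * out of bytes.
     */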
    r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
    if (r->len) {
        return r->len;
    }

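    /*
     * Once we run out, we block and wait for another
     * SEND message to arrive.
     */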
    ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);

    if (ret < 0) {
        rdma->error_state = ret;
        return ret;
    }

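    /*
     * SEND was received with new bytes, now try again.
     */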
    return qemu_rdma_fill(r->rdma, buf, size, 0);
}

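/*
 * Block until all the outstanding chunks have been delivered by the hardware.
 */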
static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (qemu_rdma_write_flush(f, rdma) < 0) {
        return -EIO;
    }

    while (rdma->nb_sent) {
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: complete polling error!\n");
            return -EIO;
        }
    }

    qemu_rdma_unregister_waiting(rdma);

    return 0;
}

static int qemu_rdma_close(void *opaque)
{
    QEMUFileRDMA *r = opaque;

    DPRINTF("Shutting down connection.\n");
    if (r->rdma) {
        qemu_rdma_cleanup(r->rdma);
        g_free(r->rdma);
    }
    g_free(r);
    return 0;
}

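/*
 * Parameters:
 *    @offset == 0 :
 *        This means that 'block_offset' is a full virtual address that does
 *        not belong to a RAMBlock of the virtual machine and instead
 *        represents a private malloc'd memory area that the caller wishes to
 *        transfer.
 *
 *    @offset != 0 :
 *        Offset is an offset to be added to block_offset and used
 *        to also lookup the corresponding RAMBlock.
 *
 *    @size > 0 :
 *        Initiate a transfer immediately.
 *
 *    @size == 0 :
 *        A 'hint' or 'advice' that means that we wish to speculatively
 *        and asynchronously unregister this memory. In this case, there is no
 *        guarantee that the unregister will actually happen, for example,
 *        if the memory is being actively transmitted. Additionally, the memory
 *        may be re-registered at any future time if a write within the same
 *        chunk was requested again, even if you attempted to unregister it
 *        here.
 *
 *    @size < 0 : TODO, not yet supported
 *        Unregister the memory NOW. This means that the caller does not
 *        expect there to be any future RDMA transfers and we just want to
 *        clean things up. This is used in case the upper layer owns the
 *        memory and cannot wait for qemu_fclose() to occur.
 *
 *    @bytes_sent : User-specified pointer to indicate how many bytes were
 *                  sent. Usually, this will not be more than a few bytes of
 *                  the protocol because most transfers are sent asynchronously.
 */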
static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
                                  ram_addr_t block_offset, ram_addr_t offset,
                                  size_t size, int *bytes_sent)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    int ret;

    CHECK_ERROR_STATE();

    qemu_fflush(f);

    if (size > 0) {
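        /*
         * Add this page to the current 'chunk'. If the chunk
         * is full, or the page doesn't belong to the current chunk,
         * an actual RDMA write will occur and a new chunk will be formed.
         */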
        ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: write error! %d\n", ret);
            goto err;
        }

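        /*
         * We always return 1 byte because the RDMA
         * protocol is completely asynchronous. We do not yet know
         * whether an identified chunk is zero or not because we're
         * waiting for other pages to potentially be merged with
         * the current chunk. So, we have to call qemu_update_position()
         * later on when the actual write occurs.
         */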
        if (bytes_sent) {
            *bytes_sent = 1;
        }
    } else {
        uint64_t index, chunk;

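        /* TODO: Change QEMUFileOps prototype to be signed: size_t => long
        if (size < 0) {
            ret = qemu_rdma_drain_cq(f, rdma);
            if (ret < 0) {
                fprintf(stderr, "rdma: failed to synchronously drain"
                                " completion queue before unregistration.\n");
                goto err;
            }
        }
        */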
        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, size, &index, &chunk);

        if (ret) {
            fprintf(stderr, "ram block search failed\n");
            goto err;
        }

        qemu_rdma_signal_unregister(rdma, index, chunk, 0);

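        /*
         * TODO: Synchronous, guaranteed unregistration (should not occur
         * during fast-path). Otherwise, unregisters will be processed on
         * the next call to qemu_rdma_drain_cq().
        if (size < 0) {
            qemu_rdma_unregister_waiting(rdma);
        }
        */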
    }

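    /*
     * Drain the Completion Queue if possible, but do not block,
     * just poll.
     *
     * If nothing to poll, the end of the iteration will do this
     * again to make sure we don't overflow the request queue.
     */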
    while (1) {
        uint64_t wr_id, wr_id_in;

        ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: polling error! %d\n", ret);
            goto err;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    return RAM_SAVE_CONTROL_DELAYED;
err:
    rdma->error_state = ret;
    return ret;
}

static int qemu_rdma_accept(RDMAContext *rdma)
{
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
        .responder_resources = 2,
        .private_data = &cap,
        .private_data_len = sizeof(cap),
    };
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret = -EINVAL;
    int idx;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        fprintf(stderr, "Unknown source RDMA version: %d, bailing...\n",
                cap.version);
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

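    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */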
    cap.flags &= known_capabilities;

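    /*
     * Enable the ones that we do know about.
     * Add other checks here as new ones are introduced.
     */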
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    DPRINTF("Memory pin all: %s\n", rdma->pin_all ? "enabled" : "disabled");

    caps_to_network(&cap);

    DPRINTF("verbs context after listen: %p\n", verbs);

    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        fprintf(stderr, "ibv context not matching %p, %p!\n",
                rdma->verbs, verbs);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error allocating qp!\n");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
        goto err_rdma_dest_wait;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            fprintf(stderr, "rdma: error registering %d control!\n", idx);
            goto err_rdma_dest_wait;
        }
    }

    qemu_set_fd_handler2(rdma->channel->fd, NULL, NULL, NULL, NULL);

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret) {
        fprintf(stderr, "rdma_accept returns %d!\n", ret);
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        fprintf(stderr, "rdma_accept get_cm_event failed %d!\n", ret);
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        fprintf(stderr, "rdma_accept not event established!\n");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);
    rdma->connected = true;

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting second control recv!\n");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->error_state = ret;
    qemu_rdma_cleanup(rdma);
    return ret;
}

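/*
 * During each iteration of the migration, we listen for instructions
 * from the source VM to perform dynamic page registrations before they
 * can perform RDMA operations.
 *
 * We respond with the 'rkey'.
 *
 * Keep doing this until the source tells us to stop.
 */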
static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
                                         uint64_t flags)
{
    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
                                   .type = RDMA_CONTROL_REGISTER_RESULT,
                                   .repeat = 0,
                                 };
    RDMAControlHeader unreg_resp = { .len = 0,
                                     .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                     .repeat = 0,
                                   };
    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                 .repeat = 1 };
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMAControlHeader head;
    RDMARegister *reg, *registers;
    RDMACompress *comp;
    RDMARegisterResult *reg_result;
    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
    RDMALocalBlock *block;
    void *host_addr;
    int ret = 0;
    int idx = 0;
    int count = 0;
    int i = 0;

    CHECK_ERROR_STATE();

    do {
        DDDPRINTF("Waiting for next request %" PRIu64 "...\n", flags);

        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);

        if (ret < 0) {
            break;
        }

        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            fprintf(stderr, "rdma: Too many requests in this message (%d). "
                            "Bailing.\n", head.repeat);
            ret = -EIO;
            break;
        }

        switch (head.type) {
        case RDMA_CONTROL_COMPRESS:
            comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
            network_to_compress(comp);

            DDPRINTF("Zapping zero chunk: %" PRId64
                     " bytes, index %d, offset %" PRId64 "\n",
                     comp->length, comp->block_idx, comp->offset);
            block = &(rdma->local_ram_blocks.block[comp->block_idx]);

            host_addr = block->local_host_addr +
                            (comp->offset - block->offset);

            ram_handle_compressed(host_addr, comp->value, comp->length);
            break;

        case RDMA_CONTROL_REGISTER_FINISHED:
            DDDPRINTF("Current registrations complete.\n");
            goto out;

        case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
            DPRINTF("Initial setup info requested.\n");

            if (rdma->pin_all) {
                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                if (ret) {
                    fprintf(stderr, "rdma migration: error dest "
                                    "registering ram blocks!\n");
                    goto out;
                }
            }

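            /*
             * Dest uses this to prepare to transmit the RAMBlock descriptions
             * to the source VM after connection setup.
             * Both sides use the "remote" structure to communicate and update
             * their "local" descriptions with what was sent.
             */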
            for (i = 0; i < local->nb_blocks; i++) {
                rdma->block[i].remote_host_addr =
                    (uint64_t)(local->block[i].local_host_addr);

                if (rdma->pin_all) {
                    rdma->block[i].remote_rkey = local->block[i].mr->rkey;
                }

                rdma->block[i].offset = local->block[i].offset;
                rdma->block[i].length = local->block[i].length;

                remote_block_to_network(&rdma->block[i]);
            }

            blocks.len = rdma->local_ram_blocks.nb_blocks
                                                * sizeof(RDMARemoteBlock);

            ret = qemu_rdma_post_send_control(rdma,
                                        (uint8_t *) rdma->block, &blocks);

            if (ret < 0) {
                fprintf(stderr, "rdma migration: error sending remote info!\n");
                goto out;
            }

            break;
        case RDMA_CONTROL_REGISTER_REQUEST:
            DDPRINTF("There are %d registration requests\n", head.repeat);

            reg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                uint64_t chunk;
                uint8_t *chunk_start, *chunk_end;

                reg = &registers[count];
                network_to_register(reg);

                reg_result = &results[count];

                DDPRINTF("Registration request (%d): index %d, current_addr %"
                         PRIu64 " chunks: %" PRIu64 "\n", count,
                         reg->current_index, reg->key.current_addr,
                         reg->chunks);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);
                if (block->is_ram_block) {
                    host_addr = (block->local_host_addr +
                                (reg->key.current_addr - block->offset));
                    chunk = ram_chunk_index(block->local_host_addr,
                                            (uint8_t *) host_addr);
                } else {
                    chunk = reg->key.chunk;
                    host_addr = block->local_host_addr +
                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
                }
                chunk_start = ram_chunk_start(block, chunk);
                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
                if (qemu_rdma_register_and_get_keys(rdma, block,
                            (uint8_t *)host_addr, NULL, &reg_result->rkey,
                            chunk, chunk_start, chunk_end)) {
                    fprintf(stderr, "cannot get rkey!\n");
                    ret = -EINVAL;
                    goto out;
                }

                reg_result->host_addr = (uint64_t) block->local_host_addr;

                DDPRINTF("Registered rkey for this request: %x\n",
                         reg_result->rkey);

                result_to_network(reg_result);
            }

            ret = qemu_rdma_post_send_control(rdma,
                            (uint8_t *) results, &reg_resp);

            if (ret < 0) {
                fprintf(stderr, "Failed to send control buffer!\n");
                goto out;
            }
            break;
        case RDMA_CONTROL_UNREGISTER_REQUEST:
            DDPRINTF("There are %d unregistration requests\n", head.repeat);
            unreg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                reg = &registers[count];
                network_to_register(reg);

                DDPRINTF("Unregistration request (%d): "
                         " index %d, chunk %" PRIu64 "\n",
                         count, reg->current_index, reg->key.chunk);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);

                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
                block->pmr[reg->key.chunk] = NULL;

                if (ret != 0) {
                    perror("rdma unregistration chunk failed");
                    ret = -ret;
                    goto out;
                }

                rdma->total_registrations--;

                DDPRINTF("Unregistered chunk %" PRIu64 " successfully.\n",
                         reg->key.chunk);
            }

            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);

            if (ret < 0) {
                fprintf(stderr, "Failed to send control buffer!\n");
                goto out;
            }
            break;
        case RDMA_CONTROL_REGISTER_RESULT:
            fprintf(stderr, "Invalid RESULT message at dest.\n");
            ret = -EIO;
            goto out;
        default:
            fprintf(stderr, "Unknown control message %s\n",
                    control_desc[head.type]);
            ret = -EIO;
            goto out;
        }
    } while (1);
out:
    if (ret < 0) {
        rdma->error_state = ret;
    }
    return ret;
}

static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
                                        uint64_t flags)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;

    CHECK_ERROR_STATE();

    DDDPRINTF("start section: %" PRIu64 "\n", flags);
    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
    qemu_fflush(f);

    return 0;
}

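/*
 * Inform dest that dynamic registrations are done for now.
 *
 * First, flush writes, if any.
 */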
static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
                                       uint64_t flags)
{
    Error *local_err = NULL, **errp = &local_err;
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    RDMAControlHeader head = { .len = 0, .repeat = 1 };
    int ret = 0;

    CHECK_ERROR_STATE();

    qemu_fflush(f);
    ret = qemu_rdma_drain_cq(f, rdma);

    if (ret < 0) {
        goto err;
    }

    if (flags == RAM_CONTROL_SETUP) {
        RDMAControlHeader resp = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
        RDMALocalBlocks *local = &rdma->local_ram_blocks;
        int reg_result_idx, i, j, nb_remote_blocks;

        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
        DPRINTF("Sending registration setup for ram blocks...\n");

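        /*
         * Make sure that we parallelize the pinning on both sides.
         * For very large guests, doing this serially takes a really
         * long time, so we have to 'interleave' the pinning locally
         * with the control messages by performing the pinning on this
         * side before we receive the control response from the other
         * side that the pinning has completed.
         */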
        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                      &reg_result_idx, rdma->pin_all ?
                      qemu_rdma_reg_whole_ram_blocks : NULL);
        if (ret < 0) {
            ERROR(errp, "receiving remote info!");
            return ret;
        }

        nb_remote_blocks = resp.len / sizeof(RDMARemoteBlock);

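        /*
         * The protocol uses two different sets of rkeys (mutually exclusive):
         * 1. One key to represent the virtual address of the entire ram block.
         *    (dynamic chunk registration disabled - pin everything with one
         *    rkey.)
         * 2. One to represent individual chunks within a ram block.
         *    (dynamic chunk registration enabled - pin individual chunks.)
         *
         * Once the capability is successfully negotiated, the destination
         * transmits the keys to use (or sends them later) including the
         * virtual addresses and then propagates the remote ram block
         * descriptions to its local copy.
         */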
        if (local->nb_blocks != nb_remote_blocks) {
            ERROR(errp, "ram blocks mismatch #1! "
                        "Your QEMU command line parameters are probably "
                        "not identical on both the source and destination.");
            return -EINVAL;
        }

        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
        memcpy(rdma->block,
               rdma->wr_data[reg_result_idx].control_curr, resp.len);
        for (i = 0; i < nb_remote_blocks; i++) {
            network_to_remote_block(&rdma->block[i]);

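            /* search local ram blocks */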
            for (j = 0; j < local->nb_blocks; j++) {
                if (rdma->block[i].offset != local->block[j].offset) {
                    continue;
                }

                if (rdma->block[i].length != local->block[j].length) {
                    ERROR(errp, "ram blocks mismatch #2! "
                                "Your QEMU command line parameters are "
                                "probably not identical on both the source "
                                "and destination.");
                    return -EINVAL;
                }
                local->block[j].remote_host_addr =
                    rdma->block[i].remote_host_addr;
                local->block[j].remote_rkey = rdma->block[i].remote_rkey;
                break;
            }

            if (j >= local->nb_blocks) {
                ERROR(errp, "ram blocks mismatch #3! "
                            "Your QEMU command line parameters are probably "
                            "not identical on both the source and "
                            "destination.");
                return -EINVAL;
            }
        }
    }

    DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags);

    head.type = RDMA_CONTROL_REGISTER_FINISHED;
    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);

    if (ret < 0) {
        goto err;
    }

    return 0;
err:
    rdma->error_state = ret;
    return ret;
}

static int qemu_rdma_get_fd(void *opaque)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;

    return rdma->comp_channel->fd;
}

const QEMUFileOps rdma_read_ops = {
    .get_buffer = qemu_rdma_get_buffer,
    .get_fd = qemu_rdma_get_fd,
    .close = qemu_rdma_close,
    .hook_ram_load = qemu_rdma_registration_handle,
};

const QEMUFileOps rdma_write_ops = {
    .put_buffer = qemu_rdma_put_buffer,
    .close = qemu_rdma_close,
    .before_ram_iterate = qemu_rdma_registration_start,
    .after_ram_iterate = qemu_rdma_registration_stop,
    .save_page = qemu_rdma_save_page,
};

static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
{
    QEMUFileRDMA *r;

    if (qemu_file_mode_is_not_valid(mode)) {
        return NULL;
    }

    r = g_malloc0(sizeof(QEMUFileRDMA));
    r->rdma = rdma;

    if (mode[0] == 'w') {
        r->file = qemu_fopen_ops(r, &rdma_write_ops);
    } else {
        r->file = qemu_fopen_ops(r, &rdma_read_ops);
    }

    return r->file;
}

static void rdma_accept_incoming_migration(void *opaque)
{
    RDMAContext *rdma = opaque;
    int ret;
    QEMUFile *f;
    Error *local_err = NULL, **errp = &local_err;

    DPRINTF("Accepting rdma connection...\n");
    ret = qemu_rdma_accept(rdma);

    if (ret) {
        ERROR(errp, "RDMA Migration initialization failed!");
        return;
    }

    DPRINTF("Accepted migration\n");

    f = qemu_fopen_rdma(rdma, "rb");
    if (f == NULL) {
        ERROR(errp, "could not qemu_fopen_rdma!");
        qemu_rdma_cleanup(rdma);
        return;
    }

    rdma->migration_started_on_destination = 1;
    process_incoming_migration(f);
}

void rdma_start_incoming_migration(const char *host_port, Error **errp)
{
    int ret;
    RDMAContext *rdma;
    Error *local_err = NULL;

    DPRINTF("Starting RDMA-based incoming migration\n");
    rdma = qemu_rdma_data_init(host_port, &local_err);

    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_dest_init(rdma, &local_err);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_dest_init success\n");

    ret = rdma_listen(rdma->listen_id, 5);

    if (ret) {
        ERROR(errp, "listening on socket!");
        goto err;
    }

    DPRINTF("rdma_listen success\n");

    qemu_set_fd_handler2(rdma->channel->fd, NULL,
                         rdma_accept_incoming_migration, NULL,
                         (void *)(intptr_t) rdma);
    return;
err:
    error_propagate(errp, local_err);
    g_free(rdma);
}

void rdma_start_outgoing_migration(void *opaque,
                                   const char *host_port, Error **errp)
{
    MigrationState *s = opaque;
    Error *local_err = NULL, **temp = &local_err;
    RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err);
    int ret = 0;

    if (rdma == NULL) {
        ERROR(temp, "Failed to initialize RDMA data structures!");
        goto err;
    }

    ret = qemu_rdma_source_init(rdma, &local_err,
        s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_source_init success\n");
    ret = qemu_rdma_connect(rdma, &local_err);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_connect success\n");

    s->file = qemu_fopen_rdma(rdma, "wb");
    migrate_fd_connect(s);
    return;
err:
    error_propagate(errp, local_err);
    g_free(rdma);
    migrate_fd_error(s);
}