/*
 * RDMA-based live migration transport.
 *
 * Bulk RAM is transferred with one-sided RDMA WRITEs, while a bidirectional
 * control channel built on IB SEND/RECV carries everything else: device
 * state, RAM block lists, and dynamic registration requests and results.
 */
16#include "qemu/osdep.h"
17#include "qapi/error.h"
18#include "qemu-common.h"
19#include "qemu/cutils.h"
20#include "rdma.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "ram.h"
24#include "qemu-file-channel.h"
25#include "qemu/error-report.h"
26#include "qemu/main-loop.h"
27#include "qemu/sockets.h"
28#include "qemu/bitmap.h"
29#include "qemu/coroutine.h"
30#include <sys/socket.h>
31#include <netdb.h>
32#include <arpa/inet.h>
33#include <rdma/rdma_cma.h>
34#include "trace.h"

/*
 * Report an error on stderr and, if an Error pointer was supplied and is
 * still unset, propagate the same message through error_setg() as well.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)
46
47#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
52
#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */

/*
 * Device state (as opposed to RAM) is sent over the control channel in
 * SEND messages of at most this many bytes at a time.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size of a single control channel message; this is also the size
 * of the buffer registered for each control work request slot.
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1

/* Capability bits negotiated between source and destination at connect time. */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * The set of capability bits this build understands; the peer's requested
 * capabilities are checked against this mask.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
80
#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                error_report("RDMA is in an error state waiting for" \
                             " migration to abort!"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)

/*
 * A work request ID (wr_id) is 64 bits wide.  Three fields are packed into
 * it:
 *
 *   bits  0..15  RDMA_WRID_* message type
 *   bits 16..29  RAM block index
 *   bits 30..63  chunk number within that RAM block
 */
#define RDMA_WRID_TYPE_SHIFT 0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)

/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};
128
129static const char *wrid_desc[] = {
130 [RDMA_WRID_NONE] = "NONE",
131 [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
132 [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
133 [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
134};

/*
 * Work request IDs for IB SEND messages only (not RDMA writes).  These also
 * serve as indexes into RDMAContext.wr_data[]: one slot for the READY
 * handshake, one for an anticipated response, and one for outgoing control
 * messages.
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};

/*
 * Types of control messages exchanged over the IB SEND/RECV channel.
 */
153enum {
154 RDMA_CONTROL_NONE = 0,
155 RDMA_CONTROL_ERROR,
156 RDMA_CONTROL_READY,
157 RDMA_CONTROL_QEMU_FILE,
158 RDMA_CONTROL_RAM_BLOCKS_REQUEST,
159 RDMA_CONTROL_RAM_BLOCKS_RESULT,
160 RDMA_CONTROL_COMPRESS,
161 RDMA_CONTROL_REGISTER_REQUEST,
162 RDMA_CONTROL_REGISTER_RESULT,
163 RDMA_CONTROL_REGISTER_FINISHED,
164 RDMA_CONTROL_UNREGISTER_REQUEST,
165 RDMA_CONTROL_UNREGISTER_FINISHED,
166};

/*
 * One control channel work request slot: the registered buffer it sends
 * from or receives into, plus the current read position within it.
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;

/*
 * Capabilities are negotiated inside the rdma_cm private_data at connect
 * time, so the structure is converted to/from network byte order explicitly.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;
187
188static void caps_to_network(RDMACapabilities *cap)
189{
190 cap->version = htonl(cap->version);
191 cap->flags = htonl(cap->flags);
192}
193
194static void network_to_caps(RDMACapabilities *cap)
195{
196 cap->version = ntohl(cap->version);
197 cap->flags = ntohl(cap->flags);
198}

/*
 * Local representation of a RAM block: where it lives in host memory, its
 * whole-block and per-chunk memory registrations, and bitmaps tracking
 * chunks that are in transit or queued for unregistration.
 */
207typedef struct RDMALocalBlock {
208 char *block_name;
209 uint8_t *local_host_addr;
210 uint64_t remote_host_addr;
211 uint64_t offset;
212 uint64_t length;
213 struct ibv_mr **pmr;
214 struct ibv_mr *mr;
215 uint32_t *remote_keys;
216 uint32_t remote_rkey;
217 int index;
218 unsigned int src_index;
219 bool is_ram_block;
220 int nb_chunks;
221 unsigned long *transit_bitmap;
222 unsigned long *unregister_bitmap;
223} RDMALocalBlock;

/*
 * Wire representation of a RAM block as exchanged with the other side; all
 * fields are fixed-width and explicitly byte-swapped, hence QEMU_PACKED.
 */
232typedef struct QEMU_PACKED RDMADestBlock {
233 uint64_t remote_host_addr;
234 uint64_t offset;
235 uint64_t length;
236 uint32_t remote_rkey;
237 uint32_t padding;
238} RDMADestBlock;
239
240static const char *control_desc(unsigned int rdma_control)
241{
242 static const char *strs[] = {
243 [RDMA_CONTROL_NONE] = "NONE",
244 [RDMA_CONTROL_ERROR] = "ERROR",
245 [RDMA_CONTROL_READY] = "READY",
246 [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
247 [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
248 [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
249 [RDMA_CONTROL_COMPRESS] = "COMPRESS",
250 [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
251 [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
252 [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
253 [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
254 [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
255 };
256
257 if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
258 return "??BAD CONTROL VALUE??";
259 }
260
261 return strs[rdma_control];
262}
263
static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}

static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}
277
278static void dest_block_to_network(RDMADestBlock *db)
279{
280 db->remote_host_addr = htonll(db->remote_host_addr);
281 db->offset = htonll(db->offset);
282 db->length = htonll(db->length);
283 db->remote_rkey = htonl(db->remote_rkey);
284}
285
286static void network_to_dest_block(RDMADestBlock *db)
287{
288 db->remote_host_addr = ntohll(db->remote_host_addr);
289 db->offset = ntohll(db->offset);
290 db->length = ntohll(db->length);
291 db->remote_rkey = ntohl(db->remote_rkey);
292}

/*
 * The complete list of local RAM blocks.  "init" becomes true once the
 * initial RAM block list has been built, so any block added afterwards is
 * known not to be a main-memory RAM block.
 */
299typedef struct RDMALocalBlocks {
300 int nb_blocks;
301 bool init;
302 RDMALocalBlock *block;
303} RDMALocalBlocks;

/*
 * Principal structure for one RDMA migration connection: the rdma_cm and
 * verbs objects, the control channel buffers, the RAM block bookkeeping,
 * and the state of the write currently being coalesced.
 */
311typedef struct RDMAContext {
312 char *host;
313 int port;
314
315 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * Set when we expect the peer's next "READY" control message;
     * qemu_rdma_exchange_send() waits for and consumes it before posting
     * the next request.
     */
    int control_ready_expected;

    /* Number of outstanding, signalled RDMA WRITEs. */
    int nb_sent;

    /*
     * The write currently being coalesced: start address and length, plus
     * the RAM block index and chunk it falls into.  Pushed to the wire by
     * qemu_rdma_write_flush().
     */
    uint64_t current_addr;
    uint64_t current_length;
    int current_index;
    int current_chunk;
337
338 bool pin_all;

    /*
     * rdma_cm and verbs resources for the connection.  cm_id->verbs,
     * cm_id->channel and cm_id->qp are mirrored in the fields below once
     * they exist.
     */
347 struct rdma_cm_id *cm_id;
348 struct rdma_cm_id *listen_id;
349 bool connected;
350
351 struct ibv_context *verbs;
352 struct rdma_event_channel *channel;
353 struct ibv_qp *qp;
354 struct ibv_comp_channel *comp_channel;
355 struct ibv_pd *pd;
356 struct ibv_cq *cq;

    /*
     * Error handling: error_state latches the first failure, error_reported
     * makes sure it is only reported once, and received_error is set when
     * the peer sends an explicit RDMA_CONTROL_ERROR message.
     */
    int error_state;
    int error_reported;
    int received_error;

    /* Local and destination views of the RAM block list. */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock  *dest_blocks;

    /* Index of the next RAMBlock received during block registration. */
    unsigned int next_src_index;

    /*
     * True on the incoming (destination) side; changes how completion
     * events are waited for (coroutine yield vs. polling).
     */
    int migration_started_on_destination;
382
383 int total_registrations;
384 int total_writes;
385
386 int unregister_current, unregister_next;
387 uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
388
389 GHashTable *blockmap;
390} RDMAContext;
391
392#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
393#define QIO_CHANNEL_RDMA(obj) \
394 OBJECT_CHECK(QIOChannelRDMA, (obj), TYPE_QIO_CHANNEL_RDMA)
395
396typedef struct QIOChannelRDMA QIOChannelRDMA;
397
398
399struct QIOChannelRDMA {
400 QIOChannel parent;
401 RDMAContext *rdma;
402 QEMUFile *file;
403 size_t len;
404 bool blocking;
405};
406
407
408
409
410
411typedef struct QEMU_PACKED {
412 uint32_t len;
413 uint32_t type;
414 uint32_t repeat;
415 uint32_t padding;
416} RDMAControlHeader;
417
418static void control_to_network(RDMAControlHeader *control)
419{
420 control->type = htonl(control->type);
421 control->len = htonl(control->len);
422 control->repeat = htonl(control->repeat);
423}
424
425static void network_to_control(RDMAControlHeader *control)
426{
427 control->type = ntohl(control->type);
428 control->len = ntohl(control->len);
429 control->repeat = ntohl(control->repeat);
430}

/*
 * Payload of a (UN)REGISTER request: identifies a chunk (or, for RAM
 * blocks, a host address) within a given block.  Used by both sides.
 */
438typedef struct QEMU_PACKED {
439 union QEMU_PACKED {
440 uint64_t current_addr;
441 uint64_t chunk;
442 } key;
443 uint32_t current_index;
444 uint32_t padding;
445 uint64_t chunks;
446} RDMARegister;
447
448static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
449{
450 RDMALocalBlock *local_block;
451 local_block = &rdma->local_ram_blocks.block[reg->current_index];
452
453 if (local_block->is_ram_block) {
454
455
456
457
458 reg->key.current_addr -= local_block->offset;
459 reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
460 }
461 reg->key.current_addr = htonll(reg->key.current_addr);
462 reg->current_index = htonl(reg->current_index);
463 reg->chunks = htonll(reg->chunks);
464}
465
466static void network_to_register(RDMARegister *reg)
467{
468 reg->key.current_addr = ntohll(reg->key.current_addr);
469 reg->current_index = ntohl(reg->current_index);
470 reg->chunks = ntohll(reg->chunks);
471}
472
473typedef struct QEMU_PACKED {
474 uint32_t value;
475 uint32_t block_idx;
476 uint64_t offset;
477 uint64_t length;
478} RDMACompress;
479
480static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
481{
482 comp->value = htonl(comp->value);
483
484
485
486
487 comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
488 comp->offset += rdma->dest_blocks[comp->block_idx].offset;
489 comp->block_idx = htonl(comp->block_idx);
490 comp->offset = htonll(comp->offset);
491 comp->length = htonll(comp->length);
492}
493
494static void network_to_compress(RDMACompress *comp)
495{
496 comp->value = ntohl(comp->value);
497 comp->block_idx = ntohl(comp->block_idx);
498 comp->offset = ntohll(comp->offset);
499 comp->length = ntohll(comp->length);
500}
501
502
503
504
505
506
507typedef struct QEMU_PACKED {
508 uint32_t rkey;
509 uint32_t padding;
510 uint64_t host_addr;
511} RDMARegisterResult;
512
static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}
524
525const char *print_wrid(int wrid);
526static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
527 uint8_t *data, RDMAControlHeader *resp,
528 int *resp_idx,
529 int (*callback)(RDMAContext *rdma));
530
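/*
 * Helpers mapping between host addresses inside a RAM block and fixed-size
 * (1 << RDMA_REG_CHUNK_SHIFT byte) chunk indexes.  For example, with the
 * default 1 MB chunks, an address 3.5 MB into a block lies in chunk 3, and
 * ram_chunk_end() clamps the chunk's end to the end of the block.
 */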
531static inline uint64_t ram_chunk_index(const uint8_t *start,
532 const uint8_t *host)
533{
534 return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
535}
536
537static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
538 uint64_t i)
539{
540 return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
541 (i << RDMA_REG_CHUNK_SHIFT));
542}
543
544static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
545 uint64_t i)
546{
547 uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
548 (1UL << RDMA_REG_CHUNK_SHIFT);
549
550 if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
551 result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
552 }
553
554 return result;
555}
556
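/*
 * Append one block to the local RAM block array, rebuilding the
 * offset -> block hash table (if it exists) since the array may have been
 * reallocated.
 */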
557static int rdma_add_block(RDMAContext *rdma, const char *block_name,
558 void *host_addr,
559 ram_addr_t block_offset, uint64_t length)
560{
561 RDMALocalBlocks *local = &rdma->local_ram_blocks;
562 RDMALocalBlock *block;
563 RDMALocalBlock *old = local->block;
564
565 local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
566
567 if (local->nb_blocks) {
568 int x;
569
570 if (rdma->blockmap) {
571 for (x = 0; x < local->nb_blocks; x++) {
572 g_hash_table_remove(rdma->blockmap,
573 (void *)(uintptr_t)old[x].offset);
574 g_hash_table_insert(rdma->blockmap,
575 (void *)(uintptr_t)old[x].offset,
576 &local->block[x]);
577 }
578 }
579 memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
580 g_free(old);
581 }
582
583 block = &local->block[local->nb_blocks];
584
585 block->block_name = g_strdup(block_name);
586 block->local_host_addr = host_addr;
587 block->offset = block_offset;
588 block->length = length;
589 block->index = local->nb_blocks;
590 block->src_index = ~0U;
591 block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
592 block->transit_bitmap = bitmap_new(block->nb_chunks);
593 bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
594 block->unregister_bitmap = bitmap_new(block->nb_chunks);
595 bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
596 block->remote_keys = g_new0(uint32_t, block->nb_chunks);
597
598 block->is_ram_block = local->init ? false : true;
599
600 if (rdma->blockmap) {
601 g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
602 }
603
604 trace_rdma_add_block(block_name, local->nb_blocks,
605 (uintptr_t) block->local_host_addr,
606 block->offset, block->length,
607 (uintptr_t) (block->local_host_addr + block->length),
608 BITS_TO_LONGS(block->nb_chunks) *
609 sizeof(unsigned long) * 8,
610 block->nb_chunks);
611
612 local->nb_blocks++;
613
614 return 0;
615}
616
617
618
619
620
621
622static int qemu_rdma_init_one_block(const char *block_name, void *host_addr,
623 ram_addr_t block_offset, ram_addr_t length, void *opaque)
624{
625 return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
626}
627
628
629
630
631
632
633static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
634{
635 RDMALocalBlocks *local = &rdma->local_ram_blocks;
636
637 assert(rdma->blockmap == NULL);
638 memset(local, 0, sizeof *local);
639 qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
640 trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
641 rdma->dest_blocks = g_new0(RDMADestBlock,
642 rdma->local_ram_blocks.nb_blocks);
643 local->init = true;
644 return 0;
645}
646
647
648
649
650
651static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
652{
653 RDMALocalBlocks *local = &rdma->local_ram_blocks;
654 RDMALocalBlock *old = local->block;
655 int x;
656
657 if (rdma->blockmap) {
658 g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
659 }
660 if (block->pmr) {
661 int j;
662
663 for (j = 0; j < block->nb_chunks; j++) {
664 if (!block->pmr[j]) {
665 continue;
666 }
667 ibv_dereg_mr(block->pmr[j]);
668 rdma->total_registrations--;
669 }
670 g_free(block->pmr);
671 block->pmr = NULL;
672 }
673
674 if (block->mr) {
675 ibv_dereg_mr(block->mr);
676 rdma->total_registrations--;
677 block->mr = NULL;
678 }
679
680 g_free(block->transit_bitmap);
681 block->transit_bitmap = NULL;
682
683 g_free(block->unregister_bitmap);
684 block->unregister_bitmap = NULL;
685
686 g_free(block->remote_keys);
687 block->remote_keys = NULL;
688
689 g_free(block->block_name);
690 block->block_name = NULL;
691
692 if (rdma->blockmap) {
693 for (x = 0; x < local->nb_blocks; x++) {
694 g_hash_table_remove(rdma->blockmap,
695 (void *)(uintptr_t)old[x].offset);
696 }
697 }
698
699 if (local->nb_blocks > 1) {
700
701 local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
702
703 if (block->index) {
704 memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
705 }
706
707 if (block->index < (local->nb_blocks - 1)) {
708 memcpy(local->block + block->index, old + (block->index + 1),
709 sizeof(RDMALocalBlock) *
710 (local->nb_blocks - (block->index + 1)));
711 }
712 } else {
713 assert(block == local->block);
714 local->block = NULL;
715 }
716
717 trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
718 block->offset, block->length,
719 (uintptr_t)(block->local_host_addr + block->length),
720 BITS_TO_LONGS(block->nb_chunks) *
721 sizeof(unsigned long) * 8, block->nb_chunks);
722
723 g_free(old);
724
725 local->nb_blocks--;
726
727 if (local->nb_blocks && rdma->blockmap) {
728 for (x = 0; x < local->nb_blocks; x++) {
729 g_hash_table_insert(rdma->blockmap,
730 (void *)(uintptr_t)local->block[x].offset,
731 &local->block[x]);
732 }
733 }
734
735 return 0;
736}
737
738
739
740
741
742static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
743{
744 struct ibv_port_attr port;
745
746 if (ibv_query_port(verbs, 1, &port)) {
747 error_report("Failed to query port information");
748 return;
749 }
750
751 printf("%s RDMA Device opened: kernel name %s "
752 "uverbs device name %s, "
753 "infiniband_verbs class device path %s, "
754 "infiniband class device path %s, "
755 "transport: (%d) %s\n",
756 who,
757 verbs->device->name,
758 verbs->device->dev_name,
759 verbs->device->dev_path,
760 verbs->device->ibdev_path,
761 port.link_layer,
762 (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
763 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
764 ? "Ethernet" : "Unknown"));
765}
766
767
768
769
770
771
772static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
773{
774 char sgid[33];
775 char dgid[33];
776 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
777 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
778 trace_qemu_rdma_dump_gid(who, sgid, dgid);
779}

/*
 * As of this writing, IPv6 over RoCE / iWARP is not supported by the Linux
 * kernel.  When the management software requests an IPv6 address (or the
 * wildcard '[::]'), we therefore have to inspect the available devices:
 *
 *  - If only RoCE / iWARP (Ethernet link layer) devices are present, the
 *    connection cannot work and we fail right away.
 *  - In a mixed RoCE / InfiniBand environment we merely warn, because the
 *    migration may still go over the IB fabric.
 *
 * Note that 'verbs' may be NULL here: the device is only known once an
 * address has actually been bound or resolved, which never happens for a
 * wildcard listen.  In that case every device in the system is checked.
 */
824static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
825{
826 struct ibv_port_attr port_attr;
827
828
829#ifdef CONFIG_LINUX
830
831
832
833
834
835
836
837
838
839
840 if (!verbs) {
841 int num_devices, x;
842 struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
843 bool roce_found = false;
844 bool ib_found = false;
845
846 for (x = 0; x < num_devices; x++) {
847 verbs = ibv_open_device(dev_list[x]);
848 if (!verbs) {
849 if (errno == EPERM) {
850 continue;
851 } else {
852 return -EINVAL;
853 }
854 }
855
856 if (ibv_query_port(verbs, 1, &port_attr)) {
857 ibv_close_device(verbs);
858 ERROR(errp, "Could not query initial IB port");
859 return -EINVAL;
860 }
861
862 if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
863 ib_found = true;
864 } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
865 roce_found = true;
866 }
867
868 ibv_close_device(verbs);
869
870 }
871
872 if (roce_found) {
873 if (ib_found) {
874 fprintf(stderr, "WARN: migrations may fail:"
875 " IPv6 over RoCE / iWARP in linux"
876 " is broken. But since you appear to have a"
877 " mixed RoCE / IB environment, be sure to only"
878 " migrate over the IB fabric until the kernel "
879 " fixes the bug.\n");
880 } else {
881 ERROR(errp, "You only have RoCE / iWARP devices in your systems"
882 " and your management software has specified '[::]'"
883 ", but IPv6 over RoCE / iWARP is not supported in Linux.");
884 return -ENONET;
885 }
886 }
887
888 return 0;
889 }
890
891
892
893
894
895
896
897
898 if (ibv_query_port(verbs, 1, &port_attr)) {
899 ERROR(errp, "Could not query initial IB port");
900 return -EINVAL;
901 }
902
903 if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
904 ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
905 "(but patches on linux-rdma in progress)");
906 return -ENONET;
907 }
908
909#endif
910
911 return 0;
912}
913
/*
 * Resolve the destination address, pick the RDMA device that routes to it,
 * and create the rdma_cm id / event channel used for the connection.
 */
919static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
920{
921 int ret;
922 struct rdma_addrinfo *res;
923 char port_str[16];
924 struct rdma_cm_event *cm_event;
925 char ip[40] = "unknown";
926 struct rdma_addrinfo *e;
927
928 if (rdma->host == NULL || !strcmp(rdma->host, "")) {
929 ERROR(errp, "RDMA hostname has not been set");
930 return -EINVAL;
931 }
932
933
934 rdma->channel = rdma_create_event_channel();
935 if (!rdma->channel) {
936 ERROR(errp, "could not create CM channel");
937 return -EINVAL;
938 }
939
940
941 ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
942 if (ret) {
943 ERROR(errp, "could not create channel id");
944 goto err_resolve_create_id;
945 }
946
947 snprintf(port_str, 16, "%d", rdma->port);
948 port_str[15] = '\0';
949
950 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
951 if (ret < 0) {
952 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
953 goto err_resolve_get_addr;
954 }
955
956 for (e = res; e != NULL; e = e->ai_next) {
957 inet_ntop(e->ai_family,
958 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
959 trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
960
961 ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
962 RDMA_RESOLVE_TIMEOUT_MS);
963 if (!ret) {
964 if (e->ai_family == AF_INET6) {
965 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
966 if (ret) {
967 continue;
968 }
969 }
970 goto route;
971 }
972 }
973
974 ERROR(errp, "could not resolve address %s", rdma->host);
975 goto err_resolve_get_addr;
976
977route:
978 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
979
980 ret = rdma_get_cm_event(rdma->channel, &cm_event);
981 if (ret) {
982 ERROR(errp, "could not perform event_addr_resolved");
983 goto err_resolve_get_addr;
984 }
985
986 if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
987 ERROR(errp, "result not equal to event_addr_resolved %s",
988 rdma_event_str(cm_event->event));
989 perror("rdma_resolve_addr");
990 rdma_ack_cm_event(cm_event);
991 ret = -EINVAL;
992 goto err_resolve_get_addr;
993 }
994 rdma_ack_cm_event(cm_event);
995
996
997 ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
998 if (ret) {
999 ERROR(errp, "could not resolve rdma route");
1000 goto err_resolve_get_addr;
1001 }
1002
1003 ret = rdma_get_cm_event(rdma->channel, &cm_event);
1004 if (ret) {
1005 ERROR(errp, "could not perform event_route_resolved");
1006 goto err_resolve_get_addr;
1007 }
1008 if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
1009 ERROR(errp, "result not equal to event_route_resolved: %s",
1010 rdma_event_str(cm_event->event));
1011 rdma_ack_cm_event(cm_event);
1012 ret = -EINVAL;
1013 goto err_resolve_get_addr;
1014 }
1015 rdma_ack_cm_event(cm_event);
1016 rdma->verbs = rdma->cm_id->verbs;
1017 qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1018 qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1019 return 0;
1020
1021err_resolve_get_addr:
1022 rdma_destroy_id(rdma->cm_id);
1023 rdma->cm_id = NULL;
1024err_resolve_create_id:
1025 rdma_destroy_event_channel(rdma->channel);
1026 rdma->channel = NULL;
1027 return ret;
1028}
1029
1030
1031
1032
1033static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
1034{
1035
1036 rdma->pd = ibv_alloc_pd(rdma->verbs);
1037 if (!rdma->pd) {
1038 error_report("failed to allocate protection domain");
1039 return -1;
1040 }
1041
1042
1043 rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
1044 if (!rdma->comp_channel) {
1045 error_report("failed to allocate completion channel");
1046 goto err_alloc_pd_cq;
1047 }
1048
1049
1050
1051
1052
1053 rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1054 NULL, rdma->comp_channel, 0);
1055 if (!rdma->cq) {
1056 error_report("failed to allocate completion queue");
1057 goto err_alloc_pd_cq;
1058 }
1059
1060 return 0;
1061
1062err_alloc_pd_cq:
1063 if (rdma->pd) {
1064 ibv_dealloc_pd(rdma->pd);
1065 }
1066 if (rdma->comp_channel) {
1067 ibv_destroy_comp_channel(rdma->comp_channel);
1068 }
1069 rdma->pd = NULL;
1070 rdma->comp_channel = NULL;
1071 return -1;
1072
1073}
1074
1075
1076
1077
1078static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1079{
1080 struct ibv_qp_init_attr attr = { 0 };
1081 int ret;
1082
1083 attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1084 attr.cap.max_recv_wr = 3;
1085 attr.cap.max_send_sge = 1;
1086 attr.cap.max_recv_sge = 1;
1087 attr.send_cq = rdma->cq;
1088 attr.recv_cq = rdma->cq;
1089 attr.qp_type = IBV_QPT_RC;
1090
1091 ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1092 if (ret) {
1093 return -1;
1094 }
1095
1096 rdma->qp = rdma->cm_id->qp;
1097 return 0;
1098}
1099
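/*
 * Register (pin) every local RAM block up front.  This is only used when
 * the pin-all capability is in effect; on any failure, the registrations
 * made so far are rolled back.
 */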
1100static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
1101{
1102 int i;
1103 RDMALocalBlocks *local = &rdma->local_ram_blocks;
1104
1105 for (i = 0; i < local->nb_blocks; i++) {
1106 local->block[i].mr =
1107 ibv_reg_mr(rdma->pd,
1108 local->block[i].local_host_addr,
1109 local->block[i].length,
1110 IBV_ACCESS_LOCAL_WRITE |
1111 IBV_ACCESS_REMOTE_WRITE
1112 );
1113 if (!local->block[i].mr) {
            perror("Failed to register local dest ram block");
1115 break;
1116 }
1117 rdma->total_registrations++;
1118 }
1119
1120 if (i >= local->nb_blocks) {
1121 return 0;
1122 }
1123
1124 for (i--; i >= 0; i--) {
1125 ibv_dereg_mr(local->block[i].mr);
1126 rdma->total_registrations--;
1127 }
1128
1129 return -1;
1130
1131}

/*
 * Find the RAM block and chunk that contain the page QEMU asked us to
 * transmit.  The caller guarantees the (block_offset, offset, length)
 * triple lies inside a known block, so this lookup must not fail.
 */
1142static int qemu_rdma_search_ram_block(RDMAContext *rdma,
1143 uintptr_t block_offset,
1144 uint64_t offset,
1145 uint64_t length,
1146 uint64_t *block_index,
1147 uint64_t *chunk_index)
1148{
1149 uint64_t current_addr = block_offset + offset;
1150 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1151 (void *) block_offset);
1152 assert(block);
1153 assert(current_addr >= block->offset);
1154 assert((current_addr + length) <= (block->offset + block->length));
1155
1156 *block_index = block->index;
1157 *chunk_index = ram_chunk_index(block->local_host_addr,
1158 block->local_host_addr + (current_addr - block->offset));
1159
1160 return 0;
1161}
1162
/*
 * Register a chunk (or reuse an existing registration) and return the local
 * and/or remote keys needed to perform the RDMA operation on it.  When the
 * whole block is already registered, its keys are returned directly.
 */
1170static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
1171 RDMALocalBlock *block, uintptr_t host_addr,
1172 uint32_t *lkey, uint32_t *rkey, int chunk,
1173 uint8_t *chunk_start, uint8_t *chunk_end)
1174{
1175 if (block->mr) {
1176 if (lkey) {
1177 *lkey = block->mr->lkey;
1178 }
1179 if (rkey) {
1180 *rkey = block->mr->rkey;
1181 }
1182 return 0;
1183 }
1184
1185
1186 if (!block->pmr) {
1187 block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1188 }
1189
1190
1191
1192
1193
1194
1195 if (!block->pmr[chunk]) {
1196 uint64_t len = chunk_end - chunk_start;
1197
1198 trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1199
1200 block->pmr[chunk] = ibv_reg_mr(rdma->pd,
1201 chunk_start, len,
1202 (rkey ? (IBV_ACCESS_LOCAL_WRITE |
1203 IBV_ACCESS_REMOTE_WRITE) : 0));
1204
1205 if (!block->pmr[chunk]) {
1206 perror("Failed to register chunk!");
1207 fprintf(stderr, "Chunk details: block: %d chunk index %d"
1208 " start %" PRIuPTR " end %" PRIuPTR
1209 " host %" PRIuPTR
1210 " local %" PRIuPTR " registrations: %d\n",
1211 block->index, chunk, (uintptr_t)chunk_start,
1212 (uintptr_t)chunk_end, host_addr,
1213 (uintptr_t)block->local_host_addr,
1214 rdma->total_registrations);
1215 return -1;
1216 }
1217 rdma->total_registrations++;
1218 }
1219
1220 if (lkey) {
1221 *lkey = block->pmr[chunk]->lkey;
1222 }
1223 if (rkey) {
1224 *rkey = block->pmr[chunk]->rkey;
1225 }
1226 return 0;
1227}
1228
1229
1230
1231
1232
1233static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1234{
1235 rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1236 rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1237 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1238 if (rdma->wr_data[idx].control_mr) {
1239 rdma->total_registrations++;
1240 return 0;
1241 }
1242 error_report("qemu_rdma_reg_control failed");
1243 return -1;
1244}
1245
1246const char *print_wrid(int wrid)
1247{
1248 if (wrid >= RDMA_WRID_RECV_CONTROL) {
1249 return wrid_desc[RDMA_WRID_RECV_CONTROL];
1250 }
1251 return wrid_desc[wrid];
1252}

/*
 * RDMA requires memory registration (i.e. pinning), which does not play
 * well with memory overcommitment.  Ideally, once we know that a chunk of
 * memory will not be touched again soon, we would unregister (unpin) it.
 *
 * We do not have such working-set information yet, but the compile-time
 * option RDMA_UNREGISTRATION_EXAMPLE exercises the mechanism: every chunk
 * is queued for unregistration as soon as its write completes, and the
 * queue is drained here before new writes are posted.  Chunks whose writes
 * are still in flight are skipped here and handled once the corresponding
 * completion is seen.
 */
1289static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1290{
1291 while (rdma->unregistrations[rdma->unregister_current]) {
1292 int ret;
1293 uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1294 uint64_t chunk =
1295 (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1296 uint64_t index =
1297 (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1298 RDMALocalBlock *block =
1299 &(rdma->local_ram_blocks.block[index]);
1300 RDMARegister reg = { .current_index = index };
1301 RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1302 };
1303 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1304 .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1305 .repeat = 1,
1306 };
1307
1308 trace_qemu_rdma_unregister_waiting_proc(chunk,
1309 rdma->unregister_current);
1310
1311 rdma->unregistrations[rdma->unregister_current] = 0;
1312 rdma->unregister_current++;
1313
1314 if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1315 rdma->unregister_current = 0;
1316 }
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326 clear_bit(chunk, block->unregister_bitmap);
1327
1328 if (test_bit(chunk, block->transit_bitmap)) {
1329 trace_qemu_rdma_unregister_waiting_inflight(chunk);
1330 continue;
1331 }
1332
1333 trace_qemu_rdma_unregister_waiting_send(chunk);
1334
1335 ret = ibv_dereg_mr(block->pmr[chunk]);
1336 block->pmr[chunk] = NULL;
1337 block->remote_keys[chunk] = 0;
1338
1339 if (ret != 0) {
1340 perror("unregistration chunk failed");
1341 return -ret;
1342 }
1343 rdma->total_registrations--;
1344
1345 reg.key.chunk = chunk;
        register_to_network(rdma, &reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                &resp, NULL, NULL);
1349 if (ret < 0) {
1350 return ret;
1351 }
1352
1353 trace_qemu_rdma_unregister_waiting_complete(chunk);
1354 }
1355
1356 return 0;
1357}
1358
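/*
 * Pack a (type, block index, chunk number) triple into a single 64-bit
 * work request ID.  For example, an RDMA WRITE of chunk 5 in block 2 gets
 * wr_id = RDMA_WRID_RDMA_WRITE | (2 << RDMA_WRID_BLOCK_SHIFT) |
 * (5UL << RDMA_WRID_CHUNK_SHIFT); qemu_rdma_poll() takes it apart again
 * with the RDMA_WRID_*_MASK macros when the completion arrives.
 */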
1359static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1360 uint64_t chunk)
1361{
1362 uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1363
1364 result |= (index << RDMA_WRID_BLOCK_SHIFT);
1365 result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1366
1367 return result;
1368}
1369
1370
1371
1372
1373
1374static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1375 uint64_t chunk, uint64_t wr_id)
1376{
1377 if (rdma->unregistrations[rdma->unregister_next] != 0) {
1378 error_report("rdma migration: queue is full");
1379 } else {
1380 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1381
1382 if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1383 trace_qemu_rdma_signal_unregister_append(chunk,
1384 rdma->unregister_next);
1385
1386 rdma->unregistrations[rdma->unregister_next++] =
1387 qemu_rdma_make_wrid(wr_id, index, chunk);
1388
1389 if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1390 rdma->unregister_next = 0;
1391 }
1392 } else {
1393 trace_qemu_rdma_signal_unregister_already(chunk);
1394 }
1395 }
1396}
1397
/*
 * Poll the completion queue once.  On success, *wr_id_out receives the
 * completed work request ID (RDMA_WRID_NONE if nothing was ready) and
 * per-completion bookkeeping (transit bitmap, nb_sent) is updated.
 */
1403static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
1404 uint32_t *byte_len)
1405{
1406 int ret;
1407 struct ibv_wc wc;
1408 uint64_t wr_id;
1409
1410 ret = ibv_poll_cq(rdma->cq, 1, &wc);
1411
1412 if (!ret) {
1413 *wr_id_out = RDMA_WRID_NONE;
1414 return 0;
1415 }
1416
1417 if (ret < 0) {
1418 error_report("ibv_poll_cq return %d", ret);
1419 return ret;
1420 }
1421
1422 wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1423
1424 if (wc.status != IBV_WC_SUCCESS) {
1425 fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1426 wc.status, ibv_wc_status_str(wc.status));
1427 fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1428
1429 return -1;
1430 }
1431
1432 if (rdma->control_ready_expected &&
1433 (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1434 trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
1435 wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
1436 rdma->control_ready_expected = 0;
1437 }
1438
1439 if (wr_id == RDMA_WRID_RDMA_WRITE) {
1440 uint64_t chunk =
1441 (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1442 uint64_t index =
1443 (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1444 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1445
1446 trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
1447 index, chunk, block->local_host_addr,
1448 (void *)(uintptr_t)block->remote_host_addr);
1449
1450 clear_bit(chunk, block->transit_bitmap);
1451
1452 if (rdma->nb_sent > 0) {
1453 rdma->nb_sent--;
1454 }
1455
1456 if (!rdma->pin_all) {
1457
1458
1459
1460
1461
1462
1463#ifdef RDMA_UNREGISTRATION_EXAMPLE
1464 qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
1465#endif
1466 }
1467 } else {
1468 trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
1469 }
1470
1471 *wr_id_out = wc.wr_id;
1472 if (byte_len) {
1473 *byte_len = wc.byte_len;
1474 }
1475
1476 return 0;
1477}
1478
1479
1480
1481
1482static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
1483{
1484
1485
1486
1487
1488 if (rdma->migration_started_on_destination) {
1489 yield_until_fd_readable(rdma->comp_channel->fd);
1490 } else {
1491
1492
1493
1494
1495
1496
1497 while (!rdma->error_state && !rdma->received_error) {
1498 GPollFD pfds[1];
1499 pfds[0].fd = rdma->comp_channel->fd;
1500 pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1501
1502 switch (qemu_poll_ns(pfds, 1, 100 * 1000 * 1000)) {
1503 case 1:
1504 return 0;
1505
1506 case 0:
1507 break;
1508
1509 default:
1510
1511
1512 error_report("%s: poll failed", __func__);
1513 return -EPIPE;
1514 }
1515
1516 if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
1517
1518 return -EPIPE;
1519 }
1520 }
1521 }
1522
1523 if (rdma->received_error) {
1524 return -EPIPE;
1525 }
1526 return rdma->error_state;
1527}
1528
/*
 * Block until a work request with the requested ID has completed.
 *
 * The CQ is polled first, since the completion may already be there;
 * otherwise we wait on the completion channel and drain the CQ again.
 *
 * Completions for other IDs encountered along the way are tolerated only
 * for RDMA WRITEs, whose completions merely need to be recorded.
 */
1542static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
1543 uint32_t *byte_len)
1544{
1545 int num_cq_events = 0, ret = 0;
1546 struct ibv_cq *cq;
1547 void *cq_ctx;
1548 uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1549
1550 if (ibv_req_notify_cq(rdma->cq, 0)) {
1551 return -1;
1552 }
1553
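    /* Poll first: the work request we are waiting for may already be done. */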
1554 while (wr_id != wrid_requested) {
1555 ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1556 if (ret < 0) {
1557 return ret;
1558 }
1559
1560 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1561
1562 if (wr_id == RDMA_WRID_NONE) {
1563 break;
1564 }
1565 if (wr_id != wrid_requested) {
1566 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1567 wrid_requested, print_wrid(wr_id), wr_id);
1568 }
1569 }
1570
1571 if (wr_id == wrid_requested) {
1572 return 0;
1573 }
1574
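    /*
     * Nothing yet: wait on the completion channel, re-arm the notification,
     * and drain the completion queue again until the requested work request
     * shows up.
     */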
1575 while (1) {
1576 ret = qemu_rdma_wait_comp_channel(rdma);
1577 if (ret) {
1578 goto err_block_for_wrid;
1579 }
1580
1581 ret = ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx);
1582 if (ret) {
1583 perror("ibv_get_cq_event");
1584 goto err_block_for_wrid;
1585 }
1586
1587 num_cq_events++;
1588
1589 ret = -ibv_req_notify_cq(cq, 0);
1590 if (ret) {
1591 goto err_block_for_wrid;
1592 }
1593
1594 while (wr_id != wrid_requested) {
1595 ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1596 if (ret < 0) {
1597 goto err_block_for_wrid;
1598 }
1599
1600 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1601
1602 if (wr_id == RDMA_WRID_NONE) {
1603 break;
1604 }
1605 if (wr_id != wrid_requested) {
1606 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1607 wrid_requested, print_wrid(wr_id), wr_id);
1608 }
1609 }
1610
1611 if (wr_id == wrid_requested) {
1612 goto success_block_for_wrid;
1613 }
1614 }
1615
1616success_block_for_wrid:
1617 if (num_cq_events) {
1618 ibv_ack_cq_events(cq, num_cq_events);
1619 }
1620 return 0;
1621
1622err_block_for_wrid:
1623 if (num_cq_events) {
1624 ibv_ack_cq_events(cq, num_cq_events);
1625 }
1626
1627 rdma->error_state = ret;
1628 return ret;
1629}
1630
/*
 * Post a SEND on the control channel containing the given header (and
 * optional payload) and block until the send completes.
 */
1635static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1636 RDMAControlHeader *head)
1637{
1638 int ret = 0;
1639 RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1640 struct ibv_send_wr *bad_wr;
1641 struct ibv_sge sge = {
1642 .addr = (uintptr_t)(wr->control),
1643 .length = head->len + sizeof(RDMAControlHeader),
1644 .lkey = wr->control_mr->lkey,
1645 };
1646 struct ibv_send_wr send_wr = {
1647 .wr_id = RDMA_WRID_SEND_CONTROL,
1648 .opcode = IBV_WR_SEND,
1649 .send_flags = IBV_SEND_SIGNALED,
1650 .sg_list = &sge,
1651 .num_sge = 1,
1652 };
1653
1654 trace_qemu_rdma_post_send_control(control_desc(head->type));
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664 assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1665 memcpy(wr->control, head, sizeof(RDMAControlHeader));
1666 control_to_network((void *) wr->control);
1667
1668 if (buf) {
1669 memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1670 }
1671
1672
1673 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1674
1675 if (ret > 0) {
1676 error_report("Failed to use post IB SEND for control");
1677 return -ret;
1678 }
1679
1680 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1681 if (ret < 0) {
1682 error_report("rdma migration: send polling control error");
1683 }
1684
1685 return ret;
1686}
1687
1688
1689
1690
1691
1692static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1693{
1694 struct ibv_recv_wr *bad_wr;
1695 struct ibv_sge sge = {
1696 .addr = (uintptr_t)(rdma->wr_data[idx].control),
1697 .length = RDMA_CONTROL_MAX_BUFFER,
1698 .lkey = rdma->wr_data[idx].control_mr->lkey,
1699 };
1700
1701 struct ibv_recv_wr recv_wr = {
1702 .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1703 .sg_list = &sge,
1704 .num_sge = 1,
1705 };
1706
1707
1708 if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1709 return -1;
1710 }
1711
1712 return 0;
1713}
1714
1715
1716
1717
1718static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1719 RDMAControlHeader *head, int expecting, int idx)
1720{
1721 uint32_t byte_len;
1722 int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1723 &byte_len);
1724
1725 if (ret < 0) {
1726 error_report("rdma migration: recv polling control error!");
1727 return ret;
1728 }
1729
1730 network_to_control((void *) rdma->wr_data[idx].control);
1731 memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1732
1733 trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1734
1735 if (expecting == RDMA_CONTROL_NONE) {
1736 trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1737 head->type);
1738 } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1739 error_report("Was expecting a %s (%d) control message"
1740 ", but got: %s (%d), length: %d",
1741 control_desc(expecting), expecting,
1742 control_desc(head->type), head->type, head->len);
1743 if (head->type == RDMA_CONTROL_ERROR) {
1744 rdma->received_error = true;
1745 }
1746 return -EIO;
1747 }
1748 if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
1749 error_report("too long length: %d", head->len);
1750 return -EINVAL;
1751 }
1752 if (sizeof(*head) + head->len != byte_len) {
1753 error_report("Malformed length: %d byte_len %d", head->len, byte_len);
1754 return -EINVAL;
1755 }
1756
1757 return 0;
1758}
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1769 RDMAControlHeader *head)
1770{
1771 rdma->wr_data[idx].control_len = head->len;
1772 rdma->wr_data[idx].control_curr =
1773 rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1774}
1775
/*
 * 'Atomic' high-level helper that delivers one control channel message.
 *
 * If the caller expects a reply, 'resp' describes the anticipated response
 * type: an extra receive is posted up front and the function blocks until
 * the response arrives, placing its buffer index in *resp_idx.  The
 * optional callback runs after the request has been posted but before we
 * block for the response.
 */
1789static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1790 uint8_t *data, RDMAControlHeader *resp,
1791 int *resp_idx,
1792 int (*callback)(RDMAContext *rdma))
1793{
1794 int ret = 0;
1795
1796
1797
1798
1799
1800 if (rdma->control_ready_expected) {
1801 RDMAControlHeader resp;
1802 ret = qemu_rdma_exchange_get_response(rdma,
1803 &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1804 if (ret < 0) {
1805 return ret;
1806 }
1807 }
1808
1809
1810
1811
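    /*
     * If the caller expects a response, post the receive for it before the
     * request goes out, so the reply can never race ahead of our receive.
     */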
1812 if (resp) {
1813 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1814 if (ret) {
1815 error_report("rdma migration: error posting"
1816 " extra control recv for anticipated result!");
1817 return ret;
1818 }
1819 }
1820
1821
1822
1823
1824 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1825 if (ret) {
1826 error_report("rdma migration: error posting first control recv!");
1827 return ret;
1828 }
1829
1830
1831
1832
1833 ret = qemu_rdma_post_send_control(rdma, data, head);
1834
1835 if (ret < 0) {
1836 error_report("Failed to send control buffer!");
1837 return ret;
1838 }
1839
1840
1841
1842
1843 if (resp) {
1844 if (callback) {
1845 trace_qemu_rdma_exchange_send_issue_callback();
1846 ret = callback(rdma);
1847 if (ret < 0) {
1848 return ret;
1849 }
1850 }
1851
1852 trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
1853 ret = qemu_rdma_exchange_get_response(rdma, resp,
1854 resp->type, RDMA_WRID_DATA);
1855
1856 if (ret < 0) {
1857 return ret;
1858 }
1859
1860 qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1861 if (resp_idx) {
1862 *resp_idx = RDMA_WRID_DATA;
1863 }
1864 trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
1865 }
1866
1867 rdma->control_ready_expected = 1;
1868
1869 return 0;
1870}
1871
1872
1873
1874
1875
1876static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1877 int expecting)
1878{
1879 RDMAControlHeader ready = {
1880 .len = 0,
1881 .type = RDMA_CONTROL_READY,
1882 .repeat = 1,
1883 };
1884 int ret;
1885
1886
1887
1888
1889 ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
1890
1891 if (ret < 0) {
1892 error_report("Failed to send control buffer!");
1893 return ret;
1894 }
1895
1896
1897
1898
1899 ret = qemu_rdma_exchange_get_response(rdma, head,
1900 expecting, RDMA_WRID_READY);
1901
1902 if (ret < 0) {
1903 return ret;
1904 }
1905
1906 qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1907
1908
1909
1910
1911 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1912 if (ret) {
1913 error_report("rdma migration: error posting second control recv!");
1914 return ret;
1915 }
1916
1917 return 0;
1918}
1919
/*
 * Write one coalesced run of memory to the destination as an RDMA WRITE.
 *
 * With dynamic registration, the destination is first asked to register the
 * chunk (unless the data is entirely zero, in which case only a COMPRESS
 * control message is sent); the source registers its side of the chunk too.
 */
1926static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
1927 int current_index, uint64_t current_addr,
1928 uint64_t length)
1929{
1930 struct ibv_sge sge;
1931 struct ibv_send_wr send_wr = { 0 };
1932 struct ibv_send_wr *bad_wr;
1933 int reg_result_idx, ret, count = 0;
1934 uint64_t chunk, chunks;
1935 uint8_t *chunk_start, *chunk_end;
1936 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
1937 RDMARegister reg;
1938 RDMARegisterResult *reg_result;
1939 RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
1940 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1941 .type = RDMA_CONTROL_REGISTER_REQUEST,
1942 .repeat = 1,
1943 };
1944
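    /*
     * Work out which chunks this write covers; we come back here via
     * 'retry' if the send queue was full and we had to drain completions.
     */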
1945retry:
1946 sge.addr = (uintptr_t)(block->local_host_addr +
1947 (current_addr - block->offset));
1948 sge.length = length;
1949
1950 chunk = ram_chunk_index(block->local_host_addr,
1951 (uint8_t *)(uintptr_t)sge.addr);
1952 chunk_start = ram_chunk_start(block, chunk);
1953
1954 if (block->is_ram_block) {
1955 chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
1956
1957 if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
1958 chunks--;
1959 }
1960 } else {
1961 chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
1962
1963 if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
1964 chunks--;
1965 }
1966 }
1967
1968 trace_qemu_rdma_write_one_top(chunks + 1,
1969 (chunks + 1) *
1970 (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
1971
1972 chunk_end = ram_chunk_end(block, chunk + chunks);
1973
1974 if (!rdma->pin_all) {
1975#ifdef RDMA_UNREGISTRATION_EXAMPLE
1976 qemu_rdma_unregister_waiting(rdma);
1977#endif
1978 }
1979
1980 while (test_bit(chunk, block->transit_bitmap)) {
1981 (void)count;
1982 trace_qemu_rdma_write_one_block(count++, current_index, chunk,
1983 sge.addr, length, rdma->nb_sent, block->nb_chunks);
1984
1985 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
1986
1987 if (ret < 0) {
1988 error_report("Failed to Wait for previous write to complete "
1989 "block %d chunk %" PRIu64
1990 " current %" PRIu64 " len %" PRIu64 " %d",
1991 current_index, chunk, sge.addr, length, rdma->nb_sent);
1992 return ret;
1993 }
1994 }
1995
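    /*
     * With dynamic registration (or for non-RAM blocks) the destination may
     * not have registered this chunk yet: either short-circuit an all-zero
     * chunk with a COMPRESS message, or ask the peer to register it and
     * remember the returned rkey.
     */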
1996 if (!rdma->pin_all || !block->is_ram_block) {
1997 if (!block->remote_keys[chunk]) {
1998
1999
2000
2001
2002
2003
2004 if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2005 RDMACompress comp = {
2006 .offset = current_addr,
2007 .value = 0,
2008 .block_idx = current_index,
2009 .length = length,
2010 };
2011
2012 head.len = sizeof(comp);
2013 head.type = RDMA_CONTROL_COMPRESS;
2014
2015 trace_qemu_rdma_write_one_zero(chunk, sge.length,
2016 current_index, current_addr);
2017
2018 compress_to_network(rdma, &comp);
2019 ret = qemu_rdma_exchange_send(rdma, &head,
2020 (uint8_t *) &comp, NULL, NULL, NULL);
2021
2022 if (ret < 0) {
2023 return -EIO;
2024 }
2025
2026 acct_update_position(f, sge.length, true);
2027
2028 return 1;
2029 }
2030
2031
2032
2033
2034 reg.current_index = current_index;
2035 if (block->is_ram_block) {
2036 reg.key.current_addr = current_addr;
2037 } else {
2038 reg.key.chunk = chunk;
2039 }
2040 reg.chunks = chunks;
2041
2042 trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2043 current_addr);
2044
            register_to_network(rdma, &reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                    &resp, &reg_result_idx, NULL);
2048 if (ret < 0) {
2049 return ret;
2050 }
2051
2052
2053 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2054 &sge.lkey, NULL, chunk,
2055 chunk_start, chunk_end)) {
2056 error_report("cannot get lkey");
2057 return -EINVAL;
2058 }
2059
2060 reg_result = (RDMARegisterResult *)
2061 rdma->wr_data[reg_result_idx].control_curr;
2062
2063 network_to_result(reg_result);
2064
2065 trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2066 reg_result->rkey, chunk);
2067
2068 block->remote_keys[chunk] = reg_result->rkey;
2069 block->remote_host_addr = reg_result->host_addr;
2070 } else {
2071
2072 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2073 &sge.lkey, NULL, chunk,
2074 chunk_start, chunk_end)) {
2075 error_report("cannot get lkey!");
2076 return -EINVAL;
2077 }
2078 }
2079
2080 send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2081 } else {
2082 send_wr.wr.rdma.rkey = block->remote_rkey;
2083
2084 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2085 &sge.lkey, NULL, chunk,
2086 chunk_start, chunk_end)) {
2087 error_report("cannot get lkey!");
2088 return -EINVAL;
2089 }
2090 }
2091
2092
2093
2094
2095
2096
2097
2098 send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2099 current_index, chunk);
2100
2101 send_wr.opcode = IBV_WR_RDMA_WRITE;
2102 send_wr.send_flags = IBV_SEND_SIGNALED;
2103 send_wr.sg_list = &sge;
2104 send_wr.num_sge = 1;
2105 send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2106 (current_addr - block->offset);
2107
2108 trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2109 sge.length);
2110
2111
2112
2113
2114
2115 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2116
2117 if (ret == ENOMEM) {
2118 trace_qemu_rdma_write_one_queue_full();
2119 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2120 if (ret < 0) {
2121 error_report("rdma migration: failed to make "
2122 "room in full send queue! %d", ret);
2123 return ret;
2124 }
2125
2126 goto retry;
2127
2128 } else if (ret > 0) {
2129 perror("rdma migration: post rdma write failed");
2130 return -ret;
2131 }
2132
2133 set_bit(chunk, block->transit_bitmap);
2134 acct_update_position(f, sge.length, false);
2135 rdma->total_writes++;
2136
2137 return 0;
2138}
2139
/*
 * Push out the RDMA WRITE that is currently being coalesced, if any, and
 * count it as an outstanding (signalled) send.
 */
2146static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2147{
2148 int ret;
2149
2150 if (!rdma->current_length) {
2151 return 0;
2152 }
2153
2154 ret = qemu_rdma_write_one(f, rdma,
2155 rdma->current_index, rdma->current_addr, rdma->current_length);
2156
2157 if (ret < 0) {
2158 return ret;
2159 }
2160
2161 if (ret == 0) {
2162 rdma->nb_sent++;
2163 trace_qemu_rdma_write_flush(rdma->nb_sent);
2164 }
2165
2166 rdma->current_length = 0;
2167 rdma->current_addr = 0;
2168
2169 return 0;
2170}
2171
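/*
 * Decide whether a new write at 'offset' can be appended to the write being
 * coalesced: it must be contiguous with it and stay within both the current
 * RAM block and the current chunk.
 */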
2172static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2173 uint64_t offset, uint64_t len)
2174{
2175 RDMALocalBlock *block;
2176 uint8_t *host_addr;
2177 uint8_t *chunk_end;
2178
2179 if (rdma->current_index < 0) {
2180 return 0;
2181 }
2182
2183 if (rdma->current_chunk < 0) {
2184 return 0;
2185 }
2186
2187 block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2188 host_addr = block->local_host_addr + (offset - block->offset);
2189 chunk_end = ram_chunk_end(block, rdma->current_chunk);
2190
2191 if (rdma->current_length == 0) {
2192 return 0;
2193 }
2194
2195
2196
2197
2198 if (offset != (rdma->current_addr + rdma->current_length)) {
2199 return 0;
2200 }
2201
2202 if (offset < block->offset) {
2203 return 0;
2204 }
2205
2206 if ((offset + len) > (block->offset + block->length)) {
2207 return 0;
2208 }
2209
2210 if ((host_addr + len) > chunk_end) {
2211 return 0;
2212 }
2213
2214 return 1;
2215}
2216
/*
 * Queue 'len' bytes at (block_offset + offset) for transmission.  If the
 * range is contiguous with the write currently being coalesced it is simply
 * merged; otherwise the pending write is flushed first and a new one is
 * started.  Once the coalesced write reaches RDMA_MERGE_MAX it is flushed
 * immediately.
 */
2227static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
2228 uint64_t block_offset, uint64_t offset,
2229 uint64_t len)
2230{
2231 uint64_t current_addr = block_offset + offset;
2232 uint64_t index = rdma->current_index;
2233 uint64_t chunk = rdma->current_chunk;
2234 int ret;
2235
2236
2237 if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
2238 ret = qemu_rdma_write_flush(f, rdma);
2239 if (ret) {
2240 return ret;
2241 }
2242 rdma->current_length = 0;
2243 rdma->current_addr = current_addr;
2244
2245 ret = qemu_rdma_search_ram_block(rdma, block_offset,
2246 offset, len, &index, &chunk);
2247 if (ret) {
2248 error_report("ram block search failed");
2249 return ret;
2250 }
2251 rdma->current_index = index;
2252 rdma->current_chunk = chunk;
2253 }
2254
2255
2256 rdma->current_length += len;
2257
2258
2259 if (rdma->current_length >= RDMA_MERGE_MAX) {
2260 return qemu_rdma_write_flush(f, rdma);
2261 }
2262
2263 return 0;
2264}
2265
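/*
 * Tear the connection down: tell the peer about an early error if needed,
 * disconnect, and release every verbs / rdma_cm resource we still hold.
 */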
2266static void qemu_rdma_cleanup(RDMAContext *rdma)
2267{
2268 struct rdma_cm_event *cm_event;
2269 int ret, idx;
2270
2271 if (rdma->cm_id && rdma->connected) {
2272 if ((rdma->error_state ||
2273 migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
2274 !rdma->received_error) {
2275 RDMAControlHeader head = { .len = 0,
2276 .type = RDMA_CONTROL_ERROR,
2277 .repeat = 1,
2278 };
2279 error_report("Early error. Sending error.");
2280 qemu_rdma_post_send_control(rdma, NULL, &head);
2281 }
2282
2283 ret = rdma_disconnect(rdma->cm_id);
2284 if (!ret) {
2285 trace_qemu_rdma_cleanup_waiting_for_disconnect();
2286 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2287 if (!ret) {
2288 rdma_ack_cm_event(cm_event);
2289 }
2290 }
2291 trace_qemu_rdma_cleanup_disconnect();
2292 rdma->connected = false;
2293 }
2294
2295 g_free(rdma->dest_blocks);
2296 rdma->dest_blocks = NULL;
2297
2298 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2299 if (rdma->wr_data[idx].control_mr) {
2300 rdma->total_registrations--;
2301 ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2302 }
2303 rdma->wr_data[idx].control_mr = NULL;
2304 }
2305
2306 if (rdma->local_ram_blocks.block) {
2307 while (rdma->local_ram_blocks.nb_blocks) {
2308 rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2309 }
2310 }
2311
2312 if (rdma->qp) {
2313 rdma_destroy_qp(rdma->cm_id);
2314 rdma->qp = NULL;
2315 }
2316 if (rdma->cq) {
2317 ibv_destroy_cq(rdma->cq);
2318 rdma->cq = NULL;
2319 }
2320 if (rdma->comp_channel) {
2321 ibv_destroy_comp_channel(rdma->comp_channel);
2322 rdma->comp_channel = NULL;
2323 }
2324 if (rdma->pd) {
2325 ibv_dealloc_pd(rdma->pd);
2326 rdma->pd = NULL;
2327 }
2328 if (rdma->cm_id) {
2329 rdma_destroy_id(rdma->cm_id);
2330 rdma->cm_id = NULL;
2331 }
2332 if (rdma->listen_id) {
2333 rdma_destroy_id(rdma->listen_id);
2334 rdma->listen_id = NULL;
2335 }
2336 if (rdma->channel) {
2337 rdma_destroy_event_channel(rdma->channel);
2338 rdma->channel = NULL;
2339 }
2340 g_free(rdma->host);
2341 rdma->host = NULL;
2342}
2343
2344
2345static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2346{
2347 int ret, idx;
2348 Error *local_err = NULL, **temp = &local_err;
2349
2350
2351
2352
2353
2354 rdma->pin_all = pin_all;
2355
2356 ret = qemu_rdma_resolve_host(rdma, temp);
2357 if (ret) {
2358 goto err_rdma_source_init;
2359 }
2360
2361 ret = qemu_rdma_alloc_pd_cq(rdma);
2362 if (ret) {
2363 ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
2364 " limits may be too low. Please check $ ulimit -a # and "
2365 "search for 'ulimit -l' in the output");
2366 goto err_rdma_source_init;
2367 }
2368
2369 ret = qemu_rdma_alloc_qp(rdma);
2370 if (ret) {
2371 ERROR(temp, "rdma migration: error allocating qp!");
2372 goto err_rdma_source_init;
2373 }
2374
2375 ret = qemu_rdma_init_ram_blocks(rdma);
2376 if (ret) {
2377 ERROR(temp, "rdma migration: error initializing ram blocks!");
2378 goto err_rdma_source_init;
2379 }
2380
2381
2382 rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2383 for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
2384 g_hash_table_insert(rdma->blockmap,
2385 (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
2386 &rdma->local_ram_blocks.block[idx]);
2387 }
2388
2389 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2390 ret = qemu_rdma_reg_control(rdma, idx);
2391 if (ret) {
2392 ERROR(temp, "rdma migration: error registering %d control!",
2393 idx);
2394 goto err_rdma_source_init;
2395 }
2396 }
2397
2398 return 0;
2399
2400err_rdma_source_init:
2401 error_propagate(errp, local_err);
2402 qemu_rdma_cleanup(rdma);
2403 return -1;
2404}
2405
2406static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
2407{
2408 RDMACapabilities cap = {
2409 .version = RDMA_CONTROL_VERSION_CURRENT,
2410 .flags = 0,
2411 };
2412 struct rdma_conn_param conn_param = { .initiator_depth = 2,
2413 .retry_count = 5,
2414 .private_data = &cap,
2415 .private_data_len = sizeof(cap),
2416 };
2417 struct rdma_cm_event *cm_event;
2418 int ret;
2419
2420
2421
2422
2423
2424 if (rdma->pin_all) {
2425 trace_qemu_rdma_connect_pin_all_requested();
2426 cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2427 }
2428
2429 caps_to_network(&cap);
2430
2431 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2432 if (ret) {
2433 ERROR(errp, "posting second control recv");
2434 goto err_rdma_source_connect;
2435 }
2436
2437 ret = rdma_connect(rdma->cm_id, &conn_param);
2438 if (ret) {
2439 perror("rdma_connect");
2440 ERROR(errp, "connecting to destination!");
2441 goto err_rdma_source_connect;
2442 }
2443
2444 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2445 if (ret) {
2446 perror("rdma_get_cm_event after rdma_connect");
2447 ERROR(errp, "connecting to destination!");
2448 rdma_ack_cm_event(cm_event);
2449 goto err_rdma_source_connect;
2450 }
2451
2452 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2453 perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2454 ERROR(errp, "connecting to destination!");
2455 rdma_ack_cm_event(cm_event);
2456 goto err_rdma_source_connect;
2457 }
2458 rdma->connected = true;
2459
2460 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2461 network_to_caps(&cap);
2462
2463
2464
2465
2466
2467 if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2468 ERROR(errp, "Server cannot support pinning all memory. "
2469 "Will register memory dynamically.");
2470 rdma->pin_all = false;
2471 }
2472
2473 trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2474
2475 rdma_ack_cm_event(cm_event);
2476
2477 rdma->control_ready_expected = 1;
2478 rdma->nb_sent = 0;
2479 return 0;
2480
2481err_rdma_source_connect:
2482 qemu_rdma_cleanup(rdma);
2483 return -1;
2484}
2485
2486static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2487{
2488 int ret, idx;
2489 struct rdma_cm_id *listen_id;
2490 char ip[40] = "unknown";
2491 struct rdma_addrinfo *res, *e;
2492 char port_str[16];
2493
2494 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2495 rdma->wr_data[idx].control_len = 0;
2496 rdma->wr_data[idx].control_curr = NULL;
2497 }
2498
2499 if (!rdma->host || !rdma->host[0]) {
2500 ERROR(errp, "RDMA host is not set!");
2501 rdma->error_state = -EINVAL;
2502 return -1;
2503 }
2504
2505 rdma->channel = rdma_create_event_channel();
2506 if (!rdma->channel) {
2507 ERROR(errp, "could not create rdma event channel");
2508 rdma->error_state = -EINVAL;
2509 return -1;
2510 }
2511
2512
2513 ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2514 if (ret) {
2515 ERROR(errp, "could not create cm_id!");
2516 goto err_dest_init_create_listen_id;
2517 }
2518
2519 snprintf(port_str, 16, "%d", rdma->port);
2520 port_str[15] = '\0';
2521
2522 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2523 if (ret < 0) {
2524 ERROR(errp, "rdma_getaddrinfo failed for address %s", rdma->host);
2525 goto err_dest_init_bind_addr;
2526 }
2527
2528 for (e = res; e != NULL; e = e->ai_next) {
2529 inet_ntop(e->ai_family,
2530 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2531 trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2532 ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2533 if (ret) {
2534 continue;
2535 }
2536 if (e->ai_family == AF_INET6) {
2537 ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
2538 if (ret) {
2539 continue;
2540 }
2541 }
2542 break;
2543 }
2544
2545 if (!e) {
2546 ERROR(errp, "could not rdma_bind_addr!");
2547 goto err_dest_init_bind_addr;
2548 }
2549
2550 rdma->listen_id = listen_id;
2551 qemu_rdma_dump_gid("dest_init", listen_id);
2552 return 0;
2553
2554err_dest_init_bind_addr:
2555 rdma_destroy_id(listen_id);
2556err_dest_init_create_listen_id:
2557 rdma_destroy_event_channel(rdma->channel);
2558 rdma->channel = NULL;
2559 rdma->error_state = ret;
2560 return ret;
2561
2562}
2563
2564static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2565{
2566 RDMAContext *rdma = NULL;
2567 InetSocketAddress *addr;
2568
2569 if (host_port) {
2570 rdma = g_new0(RDMAContext, 1);
2571 rdma->current_index = -1;
2572 rdma->current_chunk = -1;
2573
2574 addr = g_new(InetSocketAddress, 1);
2575 if (!inet_parse(addr, host_port, NULL)) {
2576 rdma->port = atoi(addr->port);
2577 rdma->host = g_strdup(addr->host);
2578 } else {
2579 ERROR(errp, "bad RDMA migration address '%s'", host_port);
2580 g_free(rdma);
2581 rdma = NULL;
2582 }
2583
2584 qapi_free_InetSocketAddress(addr);
2585 }
2586
2587 return rdma;
2588}
2589
2590
2591
2592
2593
2594
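/*
 * QEMUFile interface to the control channel.
 * The outgoing byte stream is chopped into RDMA_SEND_INCREMENT sized
 * pieces and delivered as RDMA_CONTROL_QEMU_FILE messages; guest RAM
 * pages themselves travel over the separate RDMA write path instead.
 */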
2595static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
2596 const struct iovec *iov,
2597 size_t niov,
2598 int *fds,
2599 size_t nfds,
2600 Error **errp)
2601{
2602 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2603 QEMUFile *f = rioc->file;
2604 RDMAContext *rdma = rioc->rdma;
2605 int ret;
2606 ssize_t done = 0;
2607 size_t i;
2608
2609 CHECK_ERROR_STATE();
2610
2611
2612
2613
2614
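    /*
     * Push out any RDMA writes still queued up for the VM's RAM before
     * transmitting control data.
     */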
2615 ret = qemu_rdma_write_flush(f, rdma);
2616 if (ret < 0) {
2617 rdma->error_state = ret;
2618 return ret;
2619 }
2620
2621 for (i = 0; i < niov; i++) {
2622 size_t remaining = iov[i].iov_len;
2623 uint8_t * data = (void *)iov[i].iov_base;
2624 while (remaining) {
2625 RDMAControlHeader head;
2626
2627 rioc->len = MIN(remaining, RDMA_SEND_INCREMENT);
2628 remaining -= rioc->len;
2629
2630 head.len = rioc->len;
2631 head.type = RDMA_CONTROL_QEMU_FILE;
2632
2633 ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2634
2635 if (ret < 0) {
2636 rdma->error_state = ret;
2637 return ret;
2638 }
2639
2640 data += rioc->len;
2641 done += rioc->len;
2642 }
2643 }
2644
2645 return done;
2646}
2647
2648static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2649 size_t size, int idx)
2650{
2651 size_t len = 0;
2652
2653 if (rdma->wr_data[idx].control_len) {
2654 trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2655
2656 len = MIN(size, rdma->wr_data[idx].control_len);
2657 memcpy(buf, rdma->wr_data[idx].control_curr, len);
2658 rdma->wr_data[idx].control_curr += len;
2659 rdma->wr_data[idx].control_len -= len;
2660 }
2661
2662 return len;
2663}
2664
2665
2666
2667
2668
2669
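/*
 * QEMUFile read side of the control channel.
 * RDMA does not give us a byte stream, so we hand back bytes from the
 * most recently received RDMA_CONTROL_QEMU_FILE message and only block
 * for a new one once that buffer has been drained.
 */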
2670static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
2671 const struct iovec *iov,
2672 size_t niov,
2673 int **fds,
2674 size_t *nfds,
2675 Error **errp)
2676{
2677 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2678 RDMAContext *rdma = rioc->rdma;
2679 RDMAControlHeader head;
2680 int ret = 0;
2681 ssize_t i;
2682 size_t done = 0;
2683
2684 CHECK_ERROR_STATE();
2685
2686 for (i = 0; i < niov; i++) {
2687 size_t want = iov[i].iov_len;
2688 uint8_t *data = (void *)iov[i].iov_base;
2689
2690
2691
2692
2693
2694
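        /* First consume whatever is left over from the last message. */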
2695 ret = qemu_rdma_fill(rioc->rdma, data, want, 0);
2696 done += ret;
2697 want -= ret;
2698
2699 if (want == 0) {
2700 continue;
2701 }
2702
2703
2704
2705 if (done > 0) {
2706 break;
2707 }
2708
2709
2710
2711
2712
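        /*
         * Nothing buffered and nothing copied yet: block until the source
         * sends another QEMU_FILE control message.
         */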
2713 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2714
2715 if (ret < 0) {
2716 rdma->error_state = ret;
2717 return ret;
2718 }
2719
2720
2721
2722
2723 ret = qemu_rdma_fill(rioc->rdma, data, want, 0);
2724 done += ret;
2725 want -= ret;
2726
2727
2728 if (want) {
2729 if (done == 0) {
2730 return QIO_CHANNEL_ERR_BLOCK;
2731 } else {
2732 break;
2733 }
2734 }
2735 }
2736 rioc->len = done;
2737 return rioc->len;
2738}
2739
2740
2741
2742
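/*
 * Block until all outstanding RDMA writes have completed and process any
 * unregistrations that were deferred while writes were in flight.
 */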
2743static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2744{
2745 int ret;
2746
2747 if (qemu_rdma_write_flush(f, rdma) < 0) {
2748 return -EIO;
2749 }
2750
2751 while (rdma->nb_sent) {
2752 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2753 if (ret < 0) {
2754 error_report("rdma migration: complete polling error!");
2755 return -EIO;
2756 }
2757 }
2758
2759 qemu_rdma_unregister_waiting(rdma);
2760
2761 return 0;
2762}
2763
2764
2765static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
2766 bool blocking,
2767 Error **errp)
2768{
2769 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2770
2771 rioc->blocking = blocking;
2772 return 0;
2773}
2774
2775
2776typedef struct QIOChannelRDMASource QIOChannelRDMASource;
2777struct QIOChannelRDMASource {
2778 GSource parent;
2779 QIOChannelRDMA *rioc;
2780 GIOCondition condition;
2781};
2782
2783static gboolean
2784qio_channel_rdma_source_prepare(GSource *source,
2785 gint *timeout)
2786{
2787 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2788 RDMAContext *rdma = rsource->rioc->rdma;
2789 GIOCondition cond = 0;
2790 *timeout = -1;
2791
2792 if (rdma->wr_data[0].control_len) {
2793 cond |= G_IO_IN;
2794 }
2795 cond |= G_IO_OUT;
2796
2797 return cond & rsource->condition;
2798}
2799
2800static gboolean
2801qio_channel_rdma_source_check(GSource *source)
2802{
2803 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2804 RDMAContext *rdma = rsource->rioc->rdma;
2805 GIOCondition cond = 0;
2806
2807 if (rdma->wr_data[0].control_len) {
2808 cond |= G_IO_IN;
2809 }
2810 cond |= G_IO_OUT;
2811
2812 return cond & rsource->condition;
2813}
2814
2815static gboolean
2816qio_channel_rdma_source_dispatch(GSource *source,
2817 GSourceFunc callback,
2818 gpointer user_data)
2819{
2820 QIOChannelFunc func = (QIOChannelFunc)callback;
2821 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2822 RDMAContext *rdma = rsource->rioc->rdma;
2823 GIOCondition cond = 0;
2824
2825 if (rdma->wr_data[0].control_len) {
2826 cond |= G_IO_IN;
2827 }
2828 cond |= G_IO_OUT;
2829
2830 return (*func)(QIO_CHANNEL(rsource->rioc),
2831 (cond & rsource->condition),
2832 user_data);
2833}
2834
2835static void
2836qio_channel_rdma_source_finalize(GSource *source)
2837{
2838 QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
2839
2840 object_unref(OBJECT(ssource->rioc));
2841}
2842
2843GSourceFuncs qio_channel_rdma_source_funcs = {
2844 qio_channel_rdma_source_prepare,
2845 qio_channel_rdma_source_check,
2846 qio_channel_rdma_source_dispatch,
2847 qio_channel_rdma_source_finalize
2848};
2849
2850static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
2851 GIOCondition condition)
2852{
2853 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2854 QIOChannelRDMASource *ssource;
2855 GSource *source;
2856
2857 source = g_source_new(&qio_channel_rdma_source_funcs,
2858 sizeof(QIOChannelRDMASource));
2859 ssource = (QIOChannelRDMASource *)source;
2860
2861 ssource->rioc = rioc;
2862 object_ref(OBJECT(rioc));
2863
2864 ssource->condition = condition;
2865
2866 return source;
2867}
2868
2869
2870static int qio_channel_rdma_close(QIOChannel *ioc,
2871 Error **errp)
2872{
2873 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2874 trace_qemu_rdma_close();
2875 if (rioc->rdma) {
2876 if (!rioc->rdma->error_state) {
2877 rioc->rdma->error_state = qemu_file_get_error(rioc->file);
2878 }
2879 qemu_rdma_cleanup(rioc->rdma);
2880 g_free(rioc->rdma);
2881 rioc->rdma = NULL;
2882 }
2883 return 0;
2884}
2885
2886
2919
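/*
 * save_page hook used by ram.c when migrating over RDMA.
 *
 * size > 0 : queue the page at (block_offset + offset) for an RDMA write.
 *            The transfer is asynchronous, so *bytes_sent is set to 1 and
 *            the real accounting happens when the chunk is flushed.
 * size == 0: request that the chunk backing this page be unregistered.
 *
 * Returns RAM_SAVE_CONTROL_DELAYED on success to tell the caller that the
 * page has been accepted but not yet transmitted.
 */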
2920static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
2921 ram_addr_t block_offset, ram_addr_t offset,
2922 size_t size, uint64_t *bytes_sent)
2923{
2924 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
2925 RDMAContext *rdma = rioc->rdma;
2926 int ret;
2927
2928 CHECK_ERROR_STATE();
2929
2930 qemu_fflush(f);
2931
2932 if (size > 0) {
2933
2934
2935
2936
2937
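        /*
         * Add this page to the current chunk; qemu_rdma_write() only
         * issues an actual RDMA write once the chunk fills up or the
         * page does not belong to the chunk being merged.
         */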
2938 ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
2939 if (ret < 0) {
2940 error_report("rdma migration: write error! %d", ret);
2941 goto err;
2942 }
2943
2944
2945
2946
2947
2948
2949
2950
2951
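        /*
         * The protocol is asynchronous, so we cannot know yet how many
         * bytes this page really cost; report a single byte here and
         * account for the real transfer when the write completes.
         */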
2952 if (bytes_sent) {
2953 *bytes_sent = 1;
2954 }
2955 } else {
2956 uint64_t index, chunk;
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
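        /*
         * A zero-sized request means the caller wants the chunk backing
         * this page unregistered; look the chunk up and signal the
         * (asynchronous) unregistration.
         */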
2969 ret = qemu_rdma_search_ram_block(rdma, block_offset,
2970 offset, size, &index, &chunk);
2971
2972 if (ret) {
2973 error_report("ram block search failed");
2974 goto err;
2975 }
2976
2977 qemu_rdma_signal_unregister(rdma, index, chunk, 0);
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987 }
2988
2989
2990
2991
2992
2993
2994
2995
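    /*
     * Drain the completion queue opportunistically without blocking, so
     * finished writes are retired and the send queue does not overflow.
     */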
2996 while (1) {
2997 uint64_t wr_id, wr_id_in;
2998 int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
2999 if (ret < 0) {
3000 error_report("rdma migration: polling error! %d", ret);
3001 goto err;
3002 }
3003
3004 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3005
3006 if (wr_id == RDMA_WRID_NONE) {
3007 break;
3008 }
3009 }
3010
3011 return RAM_SAVE_CONTROL_DELAYED;
3012err:
3013 rdma->error_state = ret;
3014 return ret;
3015}
3016
3017static int qemu_rdma_accept(RDMAContext *rdma)
3018{
3019 RDMACapabilities cap;
3020 struct rdma_conn_param conn_param = {
3021 .responder_resources = 2,
3022 .private_data = &cap,
3023 .private_data_len = sizeof(cap),
3024 };
3025 struct rdma_cm_event *cm_event;
3026 struct ibv_context *verbs;
3027 int ret = -EINVAL;
3028 int idx;
3029
3030 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3031 if (ret) {
3032 goto err_rdma_dest_wait;
3033 }
3034
3035 if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3036 rdma_ack_cm_event(cm_event);
3037 goto err_rdma_dest_wait;
3038 }
3039
3040 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3041
3042 network_to_caps(&cap);
3043
3044 if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3045 error_report("Unknown source RDMA version: %d, bailing...",
3046 cap.version);
3047 rdma_ack_cm_event(cm_event);
3048 goto err_rdma_dest_wait;
3049 }
3050
3051
3052
3053
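    /*
     * Only keep the capability bits that this version of QEMU knows
     * about; anything else is silently dropped.
     */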
3054 cap.flags &= known_capabilities;
3055
3056
3057
3058
3059
3060 if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3061 rdma->pin_all = true;
3062 }
3063
3064 rdma->cm_id = cm_event->id;
3065 verbs = cm_event->id->verbs;
3066
3067 rdma_ack_cm_event(cm_event);
3068
3069 trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3070
3071 caps_to_network(&cap);
3072
3073 trace_qemu_rdma_accept_pin_verbsc(verbs);
3074
3075 if (!rdma->verbs) {
3076 rdma->verbs = verbs;
3077 } else if (rdma->verbs != verbs) {
3078 error_report("ibv context not matching %p, %p!", rdma->verbs,
3079 verbs);
3080 goto err_rdma_dest_wait;
3081 }
3082
3083 qemu_rdma_dump_id("dest_init", verbs);
3084
3085 ret = qemu_rdma_alloc_pd_cq(rdma);
3086 if (ret) {
3087 error_report("rdma migration: error allocating pd and cq!");
3088 goto err_rdma_dest_wait;
3089 }
3090
3091 ret = qemu_rdma_alloc_qp(rdma);
3092 if (ret) {
3093 error_report("rdma migration: error allocating qp!");
3094 goto err_rdma_dest_wait;
3095 }
3096
3097 ret = qemu_rdma_init_ram_blocks(rdma);
3098 if (ret) {
3099 error_report("rdma migration: error initializing ram blocks!");
3100 goto err_rdma_dest_wait;
3101 }
3102
3103 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
3104 ret = qemu_rdma_reg_control(rdma, idx);
3105 if (ret) {
3106 error_report("rdma: error registering %d control", idx);
3107 goto err_rdma_dest_wait;
3108 }
3109 }
3110
3111 qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
3112
3113 ret = rdma_accept(rdma->cm_id, &conn_param);
3114 if (ret) {
3115 error_report("rdma_accept returns %d", ret);
3116 goto err_rdma_dest_wait;
3117 }
3118
3119 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3120 if (ret) {
3121 error_report("rdma_accept get_cm_event failed %d", ret);
3122 goto err_rdma_dest_wait;
3123 }
3124
3125 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3126 error_report("rdma_accept: no RDMA_CM_EVENT_ESTABLISHED event");
3127 rdma_ack_cm_event(cm_event);
3128 goto err_rdma_dest_wait;
3129 }
3130
3131 rdma_ack_cm_event(cm_event);
3132 rdma->connected = true;
3133
3134 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
3135 if (ret) {
3136 error_report("rdma migration: error posting second control recv");
3137 goto err_rdma_dest_wait;
3138 }
3139
3140 qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3141
3142 return 0;
3143
3144err_rdma_dest_wait:
3145 rdma->error_state = ret;
3146 qemu_rdma_cleanup(rdma);
3147 return ret;
3148}
3149
3150static int dest_ram_sort_func(const void *a, const void *b)
3151{
3152 unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3153 unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3154
3155 return (a_index < b_index) ? -1 : (a_index != b_index);
3156}
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
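/*
 * Destination side: service control messages from the source during each
 * migration round (RAM block requests, chunk registration and
 * unregistration) until a REGISTER_FINISHED message arrives.
 */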
3167static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
3168{
3169 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3170 .type = RDMA_CONTROL_REGISTER_RESULT,
3171 .repeat = 0,
3172 };
3173 RDMAControlHeader unreg_resp = { .len = 0,
3174 .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3175 .repeat = 0,
3176 };
3177 RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3178 .repeat = 1 };
3179 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3180 RDMAContext *rdma = rioc->rdma;
3181 RDMALocalBlocks *local = &rdma->local_ram_blocks;
3182 RDMAControlHeader head;
3183 RDMARegister *reg, *registers;
3184 RDMACompress *comp;
3185 RDMARegisterResult *reg_result;
3186 static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3187 RDMALocalBlock *block;
3188 void *host_addr;
3189 int ret = 0;
3190 int idx = 0;
3191 int count = 0;
3192 int i = 0;
3193
3194 CHECK_ERROR_STATE();
3195
3196 do {
3197 trace_qemu_rdma_registration_handle_wait();
3198
3199 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
3200
3201 if (ret < 0) {
3202 break;
3203 }
3204
3205 if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3206 error_report("rdma: Too many requests in this message (%d). "
3207 "Bailing.", head.repeat);
3208 ret = -EIO;
3209 break;
3210 }
3211
3212 switch (head.type) {
3213 case RDMA_CONTROL_COMPRESS:
3214 comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3215 network_to_compress(comp);
3216
3217 trace_qemu_rdma_registration_handle_compress(comp->length,
3218 comp->block_idx,
3219 comp->offset);
3220 if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3221 error_report("rdma: 'compress' bad block index %u (vs %d)",
3222 (unsigned int)comp->block_idx,
3223 rdma->local_ram_blocks.nb_blocks);
3224 ret = -EIO;
3225 goto out;
3226 }
3227 block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3228
3229 host_addr = block->local_host_addr +
3230 (comp->offset - block->offset);
3231
3232 ram_handle_compressed(host_addr, comp->value, comp->length);
3233 break;
3234
3235 case RDMA_CONTROL_REGISTER_FINISHED:
3236 trace_qemu_rdma_registration_handle_finished();
3237 goto out;
3238
3239 case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3240 trace_qemu_rdma_registration_handle_ram_blocks();
3241
3242
3243
3244
3245
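            /*
             * Sort our local RAM block list into the source's ordering
             * (src_index) so both sides address blocks by the same index.
             */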
3246 qsort(rdma->local_ram_blocks.block,
3247 rdma->local_ram_blocks.nb_blocks,
3248 sizeof(RDMALocalBlock), dest_ram_sort_func);
3249 if (rdma->pin_all) {
3250 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
3251 if (ret) {
3252 error_report("rdma migration: error dest "
3253 "registering ram blocks");
3254 goto out;
3255 }
3256 }
3257
3258
3259
3260
3261
3262
3263
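            /*
             * Describe each local block (host address, offset, length and,
             * when pinning everything, its rkey) in network byte order so
             * the result can be sent straight back to the source.
             */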
3264 for (i = 0; i < local->nb_blocks; i++) {
3265 rdma->dest_blocks[i].remote_host_addr =
3266 (uintptr_t)(local->block[i].local_host_addr);
3267
3268 if (rdma->pin_all) {
3269 rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3270 }
3271
3272 rdma->dest_blocks[i].offset = local->block[i].offset;
3273 rdma->dest_blocks[i].length = local->block[i].length;
3274
3275 dest_block_to_network(&rdma->dest_blocks[i]);
3276 trace_qemu_rdma_registration_handle_ram_blocks_loop(
3277 local->block[i].block_name,
3278 local->block[i].offset,
3279 local->block[i].length,
3280 local->block[i].local_host_addr,
3281 local->block[i].src_index);
3282 }
3283
3284 blocks.len = rdma->local_ram_blocks.nb_blocks
3285 * sizeof(RDMADestBlock);
3286
3287
3288 ret = qemu_rdma_post_send_control(rdma,
3289 (uint8_t *) rdma->dest_blocks, &blocks);
3290
3291 if (ret < 0) {
3292 error_report("rdma migration: error sending remote info");
3293 goto out;
3294 }
3295
3296 break;
3297 case RDMA_CONTROL_REGISTER_REQUEST:
3298 trace_qemu_rdma_registration_handle_register(head.repeat);
3299
3300 reg_resp.repeat = head.repeat;
3301 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3302
3303 for (count = 0; count < head.repeat; count++) {
3304 uint64_t chunk;
3305 uint8_t *chunk_start, *chunk_end;
3306
3307 reg = &registers[count];
3308 network_to_register(reg);
3309
3310 reg_result = &results[count];
3311
3312 trace_qemu_rdma_registration_handle_register_loop(count,
3313 reg->current_index, reg->key.current_addr, reg->chunks);
3314
3315 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3316 error_report("rdma: 'register' bad block index %u (vs %d)",
3317 (unsigned int)reg->current_index,
3318 rdma->local_ram_blocks.nb_blocks);
3319 ret = -ENOENT;
3320 goto out;
3321 }
3322 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3323 if (block->is_ram_block) {
3324 if (block->offset > reg->key.current_addr) {
3325 error_report("rdma: bad register address for block %s"
3326 " offset: %" PRIx64 " current_addr: %" PRIx64,
3327 block->block_name, block->offset,
3328 reg->key.current_addr);
3329 ret = -ERANGE;
3330 goto out;
3331 }
3332 host_addr = (block->local_host_addr +
3333 (reg->key.current_addr - block->offset));
3334 chunk = ram_chunk_index(block->local_host_addr,
3335 (uint8_t *) host_addr);
3336 } else {
3337 chunk = reg->key.chunk;
3338 host_addr = block->local_host_addr +
3339 (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3340
3341 if (host_addr < (void *)block->local_host_addr) {
3342 error_report("rdma: bad chunk for block %s"
3343 " chunk: %" PRIx64,
3344 block->block_name, reg->key.chunk);
3345 ret = -ERANGE;
3346 goto out;
3347 }
3348 }
3349 chunk_start = ram_chunk_start(block, chunk);
3350 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3351 if (qemu_rdma_register_and_get_keys(rdma, block,
3352 (uintptr_t)host_addr, NULL, &reg_result->rkey,
3353 chunk, chunk_start, chunk_end)) {
3354 error_report("cannot get rkey");
3355 ret = -EINVAL;
3356 goto out;
3357 }
3358
3359 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3360
3361 trace_qemu_rdma_registration_handle_register_rkey(
3362 reg_result->rkey);
3363
3364 result_to_network(reg_result);
3365 }
3366
3367 ret = qemu_rdma_post_send_control(rdma,
3368 (uint8_t *) results, &reg_resp);
3369
3370 if (ret < 0) {
3371 error_report("Failed to send control buffer");
3372 goto out;
3373 }
3374 break;
3375 case RDMA_CONTROL_UNREGISTER_REQUEST:
3376 trace_qemu_rdma_registration_handle_unregister(head.repeat);
3377 unreg_resp.repeat = head.repeat;
3378 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3379
3380 for (count = 0; count < head.repeat; count++) {
3381 reg = &registers[count];
3382 network_to_register(reg);
3383
3384 trace_qemu_rdma_registration_handle_unregister_loop(count,
3385 reg->current_index, reg->key.chunk);
3386
3387 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3388
3389 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3390 block->pmr[reg->key.chunk] = NULL;
3391
3392 if (ret != 0) {
3393 perror("rdma unregistration chunk failed");
3394 ret = -ret;
3395 goto out;
3396 }
3397
3398 rdma->total_registrations--;
3399
3400 trace_qemu_rdma_registration_handle_unregister_success(
3401 reg->key.chunk);
3402 }
3403
3404 ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
3405
3406 if (ret < 0) {
3407 error_report("Failed to send control buffer");
3408 goto out;
3409 }
3410 break;
3411 case RDMA_CONTROL_REGISTER_RESULT:
3412 error_report("Invalid RESULT message at dest.");
3413 ret = -EIO;
3414 goto out;
3415 default:
3416 error_report("Unknown control message %s", control_desc(head.type));
3417 ret = -EIO;
3418 goto out;
3419 }
3420 } while (1);
3421out:
3422 if (ret < 0) {
3423 rdma->error_state = ret;
3424 }
3425 return ret;
3426}
3427
3428
3429
3430
3431
3432
3433
3434
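/*
 * Destination: called via the ram load hook for every RAMBlock name the
 * source announces, recording the source's block ordering in src_index.
 */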
3435static int
3436rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
3437{
3438 RDMAContext *rdma = rioc->rdma;
3439 int curr;
3440 int found = -1;
3441
3442
3443 for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3444 if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3445 found = curr;
3446 break;
3447 }
3448 }
3449
3450 if (found == -1) {
3451 error_report("RAMBlock '%s' not found on destination", name);
3452 return -ENOENT;
3453 }
3454
3455 rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3456 trace_rdma_block_notification_handle(name, rdma->next_src_index);
3457 rdma->next_src_index++;
3458
3459 return 0;
3460}
3461
3462static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
3463{
3464 switch (flags) {
3465 case RAM_CONTROL_BLOCK_REG:
3466 return rdma_block_notification_handle(opaque, data);
3467
3468 case RAM_CONTROL_HOOK:
3469 return qemu_rdma_registration_handle(f, opaque);
3470
3471 default:
3472
3473 abort();
3474 }
3475}
3476
3477static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
3478 uint64_t flags, void *data)
3479{
3480 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3481 RDMAContext *rdma = rioc->rdma;
3482
3483 CHECK_ERROR_STATE();
3484
3485 trace_qemu_rdma_registration_start(flags);
3486 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3487 qemu_fflush(f);
3488
3489 return 0;
3490}
3491
3492
3493
3494
3495
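/*
 * Source side: flush all outstanding writes and tell the destination that
 * dynamic page registrations are finished for this iteration.
 */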
3496static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
3497 uint64_t flags, void *data)
3498{
3499 Error *local_err = NULL, **errp = &local_err;
3500 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3501 RDMAContext *rdma = rioc->rdma;
3502 RDMAControlHeader head = { .len = 0, .repeat = 1 };
3503 int ret = 0;
3504
3505 CHECK_ERROR_STATE();
3506
3507 qemu_fflush(f);
3508 ret = qemu_rdma_drain_cq(f, rdma);
3509
3510 if (ret < 0) {
3511 goto err;
3512 }
3513
3514 if (flags == RAM_CONTROL_SETUP) {
3515 RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3516 RDMALocalBlocks *local = &rdma->local_ram_blocks;
3517 int reg_result_idx, i, nb_dest_blocks;
3518
3519 head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3520 trace_qemu_rdma_registration_stop_ram();
3521
3522
3523
3524
3525
3526
3527
3528
3529
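        /*
         * On the setup round, ask the destination for its RAM block list.
         * When pin-all is enabled, the callback registers all of our RAM
         * while the request is in flight so both sides pin in parallel.
         */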
3530 ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3531 &reg_result_idx, rdma->pin_all ?
3532 qemu_rdma_reg_whole_ram_blocks : NULL);
3533 if (ret < 0) {
3534 ERROR(errp, "receiving remote info!");
3535 return ret;
3536 }
3537
3538 nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
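        /*
         * The destination replies with its own RAM block list; both sides
         * must agree on the number and ordering of blocks.
         */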
3552 if (local->nb_blocks != nb_dest_blocks) {
3553 ERROR(errp, "ram blocks mismatch (Number of blocks %d vs %d). "
3554 "Your QEMU command line parameters are probably "
3555 "not identical on both the source and destination.",
3556 local->nb_blocks, nb_dest_blocks);
3557 rdma->error_state = -EINVAL;
3558 return -EINVAL;
3559 }
3560
3561 qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3562 memcpy(rdma->dest_blocks,
3563 rdma->wr_data[reg_result_idx].control_curr, resp.len);
3564 for (i = 0; i < nb_dest_blocks; i++) {
3565 network_to_dest_block(&rdma->dest_blocks[i]);
3566
3567
3568 if (rdma->dest_blocks[i].length != local->block[i].length) {
3569 ERROR(errp, "Block %s/%d has a different length %" PRIu64
3570 " vs %" PRIu64, local->block[i].block_name, i,
3571 local->block[i].length,
3572 rdma->dest_blocks[i].length);
3573 rdma->error_state = -EINVAL;
3574 return -EINVAL;
3575 }
3576 local->block[i].remote_host_addr =
3577 rdma->dest_blocks[i].remote_host_addr;
3578 local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3579 }
3580 }
3581
3582 trace_qemu_rdma_registration_stop(flags);
3583
3584 head.type = RDMA_CONTROL_REGISTER_FINISHED;
3585 ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3586
3587 if (ret < 0) {
3588 goto err;
3589 }
3590
3591 return 0;
3592err:
3593 rdma->error_state = ret;
3594 return ret;
3595}
3596
3597static const QEMUFileHooks rdma_read_hooks = {
3598 .hook_ram_load = rdma_load_hook,
3599};
3600
3601static const QEMUFileHooks rdma_write_hooks = {
3602 .before_ram_iterate = qemu_rdma_registration_start,
3603 .after_ram_iterate = qemu_rdma_registration_stop,
3604 .save_page = qemu_rdma_save_page,
3605};
3606
3607
3608static void qio_channel_rdma_finalize(Object *obj)
3609{
3610 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
3611 if (rioc->rdma) {
3612 qemu_rdma_cleanup(rioc->rdma);
3613 g_free(rioc->rdma);
3614 rioc->rdma = NULL;
3615 }
3616}
3617
3618static void qio_channel_rdma_class_init(ObjectClass *klass,
3619 void *class_data G_GNUC_UNUSED)
3620{
3621 QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
3622
3623 ioc_klass->io_writev = qio_channel_rdma_writev;
3624 ioc_klass->io_readv = qio_channel_rdma_readv;
3625 ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
3626 ioc_klass->io_close = qio_channel_rdma_close;
3627 ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
3628}
3629
3630static const TypeInfo qio_channel_rdma_info = {
3631 .parent = TYPE_QIO_CHANNEL,
3632 .name = TYPE_QIO_CHANNEL_RDMA,
3633 .instance_size = sizeof(QIOChannelRDMA),
3634 .instance_finalize = qio_channel_rdma_finalize,
3635 .class_init = qio_channel_rdma_class_init,
3636};
3637
3638static void qio_channel_rdma_register_types(void)
3639{
3640 type_register_static(&qio_channel_rdma_info);
3641}
3642
3643type_init(qio_channel_rdma_register_types);
3644
3645static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
3646{
3647 QIOChannelRDMA *rioc;
3648
3649 if (qemu_file_mode_is_not_valid(mode)) {
3650 return NULL;
3651 }
3652
3653 rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
3654 rioc->rdma = rdma;
3655
3656 if (mode[0] == 'w') {
3657 rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc));
3658 qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
3659 } else {
3660 rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc));
3661 qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
3662 }
3663
3664 return rioc->file;
3665}
3666
3667static void rdma_accept_incoming_migration(void *opaque)
3668{
3669 RDMAContext *rdma = opaque;
3670 int ret;
3671 QEMUFile *f;
3672 Error *local_err = NULL, **errp = &local_err;
3673
3674 trace_qemu_rdma_accept_incoming_migration();
3675 ret = qemu_rdma_accept(rdma);
3676
3677 if (ret) {
3678 ERROR(errp, "RDMA Migration initialization failed!");
3679 return;
3680 }
3681
3682 trace_qemu_rdma_accept_incoming_migration_accepted();
3683
3684 f = qemu_fopen_rdma(rdma, "rb");
3685 if (f == NULL) {
3686 ERROR(errp, "could not qemu_fopen_rdma!");
3687 qemu_rdma_cleanup(rdma);
3688 return;
3689 }
3690
3691 rdma->migration_started_on_destination = 1;
3692 migration_fd_process_incoming(f);
3693}
3694
3695void rdma_start_incoming_migration(const char *host_port, Error **errp)
3696{
3697 int ret;
3698 RDMAContext *rdma;
3699 Error *local_err = NULL;
3700
3701 trace_rdma_start_incoming_migration();
3702 rdma = qemu_rdma_data_init(host_port, &local_err);
3703
3704 if (rdma == NULL) {
3705 goto err;
3706 }
3707
3708 ret = qemu_rdma_dest_init(rdma, &local_err);
3709
3710 if (ret) {
3711 goto err;
3712 }
3713
3714 trace_rdma_start_incoming_migration_after_dest_init();
3715
3716 ret = rdma_listen(rdma->listen_id, 5);
3717
3718 if (ret) {
3719 ERROR(errp, "listening on socket!");
3720 goto err;
3721 }
3722
3723 trace_rdma_start_incoming_migration_after_rdma_listen();
3724
3725 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
3726 NULL, (void *)(intptr_t)rdma);
3727 return;
3728err:
3729 error_propagate(errp, local_err);
3730 g_free(rdma);
3731}
3732
3733void rdma_start_outgoing_migration(void *opaque,
3734 const char *host_port, Error **errp)
3735{
3736 MigrationState *s = opaque;
3737 RDMAContext *rdma = qemu_rdma_data_init(host_port, errp);
3738 int ret = 0;
3739
3740 if (rdma == NULL) {
3741 goto err;
3742 }
3743
3744 ret = qemu_rdma_source_init(rdma,
3745 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
3746
3747 if (ret) {
3748 goto err;
3749 }
3750
3751 trace_rdma_start_outgoing_migration_after_rdma_source_init();
3752 ret = qemu_rdma_connect(rdma, errp);
3753
3754 if (ret) {
3755 goto err;
3756 }
3757
3758 trace_rdma_start_outgoing_migration_after_rdma_connect();
3759
3760 s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
3761 migrate_fd_connect(s);
3762 return;
3763err:
3764 g_free(rdma);
3765}
3766