/*
 * RDMA protocol and interfaces
 *
 * This file implements QEMU's RDMA-based live migration transport: the
 * control-channel protocol, dynamic memory registration and the
 * QIOChannel glue used by the migration core.
 */
17#include "qemu/osdep.h"
18#include "qapi/error.h"
19#include "qemu/cutils.h"
20#include "rdma.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "ram.h"
24#include "qemu-file-channel.h"
25#include "qemu/error-report.h"
26#include "qemu/main-loop.h"
27#include "qemu/module.h"
28#include "qemu/rcu.h"
29#include "qemu/sockets.h"
30#include "qemu/bitmap.h"
31#include "qemu/coroutine.h"
32#include "exec/memory.h"
33#include <sys/socket.h>
34#include <netdb.h>
35#include <arpa/inet.h>
36#include <rdma/rdma_cma.h>
37#include "trace.h"
38#include "qom/object.h"
39#include <poll.h>

/*
 * Print an error to stderr and also set the Error object, if one was
 * provided and is not already set.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)
51
/* How long to wait for address and route resolution (in milliseconds). */
#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

/* Registration chunk size: 1 << 20 == 1 MB */
#define RDMA_REG_CHUNK_SHIFT 20

/*
 * Device state (QEMUFile data) is sent via SEND messages, rather than
 * RDMA writes, in increments of this size.
 */
#define RDMA_SEND_INCREMENT 32768

/* Maximum size of a single control-channel message. */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1

/* Capabilities negotiated between the two sides of the migration. */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/* The capabilities this implementation understands. */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;

/*
 * Abort the current RDMA operation if the context has entered an error
 * state, reporting the situation only once.
 */
#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                error_report("RDMA is in an error state waiting for " \
                             "migration to abort!"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)

/*
 * A work request ID is a 64-bit value split into three fields:
 *
 *   bits  0-15 : type of control message (RDMA_WRID_*)
 *   bits 16-29 : RAM block index
 *   bits 30-63 : RAM block chunk number
 *
 * The block and chunk fields are only used for RDMA writes, so that their
 * completion (and possible unregistration) can be tracked per chunk.
 * For example, an RDMA write of chunk 5 of block 2 is encoded as
 * (5UL << RDMA_WRID_CHUNK_SHIFT) | (2UL << RDMA_WRID_BLOCK_SHIFT) |
 * RDMA_WRID_RDMA_WRITE.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
121
122
123
124
125
126
127enum {
128 RDMA_WRID_NONE = 0,
129 RDMA_WRID_RDMA_WRITE = 1,
130 RDMA_WRID_SEND_CONTROL = 2000,
131 RDMA_WRID_RECV_CONTROL = 4000,
132};
133
134static const char *wrid_desc[] = {
135 [RDMA_WRID_NONE] = "NONE",
136 [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
137 [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
138 [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
139};
140
141
142
143
144
145
146
147
148enum {
149 RDMA_WRID_READY = 0,
150 RDMA_WRID_DATA,
151 RDMA_WRID_CONTROL,
152 RDMA_WRID_MAX,
153};
154
155
156
157
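/* Types of control-channel messages exchanged between source and dest. */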
158enum {
159 RDMA_CONTROL_NONE = 0,
160 RDMA_CONTROL_ERROR,
161 RDMA_CONTROL_READY,
162 RDMA_CONTROL_QEMU_FILE,
163 RDMA_CONTROL_RAM_BLOCKS_REQUEST,
164 RDMA_CONTROL_RAM_BLOCKS_RESULT,
165 RDMA_CONTROL_COMPRESS,
166 RDMA_CONTROL_REGISTER_REQUEST,
167 RDMA_CONTROL_REGISTER_RESULT,
168 RDMA_CONTROL_REGISTER_FINISHED,
169 RDMA_CONTROL_UNREGISTER_REQUEST,
170 RDMA_CONTROL_UNREGISTER_FINISHED,
171};
172
173
174
175
176
177
178typedef struct {
179 uint8_t control[RDMA_CONTROL_MAX_BUFFER];
180 struct ibv_mr *control_mr;
181 size_t control_len;
182 uint8_t *control_curr;
183} RDMAWorkRequestData;
184
185
186
187
188typedef struct {
189 uint32_t version;
190 uint32_t flags;
191} RDMACapabilities;
192
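/* Capabilities are exchanged in network byte order. */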
193static void caps_to_network(RDMACapabilities *cap)
194{
195 cap->version = htonl(cap->version);
196 cap->flags = htonl(cap->flags);
197}
198
199static void network_to_caps(RDMACapabilities *cap)
200{
201 cap->version = ntohl(cap->version);
202 cap->flags = ntohl(cap->flags);
203}
204
205
206
207
208
209
210
211
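/*
 * Local representation of a RAM block: registration state (a whole-block
 * MR or per-chunk MRs), the remote keys returned by the destination, and
 * per-chunk bitmaps tracking in-flight writes and pending unregistrations.
 */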
212typedef struct RDMALocalBlock {
213 char *block_name;
214 uint8_t *local_host_addr;
215 uint64_t remote_host_addr;
216 uint64_t offset;
217 uint64_t length;
218 struct ibv_mr **pmr;
219 struct ibv_mr *mr;
220 uint32_t *remote_keys;
221 uint32_t remote_rkey;
222 int index;
223 unsigned int src_index;
224 bool is_ram_block;
225 int nb_chunks;
226 unsigned long *transit_bitmap;
227 unsigned long *unregister_bitmap;
228} RDMALocalBlock;
229
230
231
232
233
234
235
236
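/*
 * Description of a RAM block as sent over the control channel; kept packed
 * and fixed-width because it crosses the wire.
 */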
237typedef struct QEMU_PACKED RDMADestBlock {
238 uint64_t remote_host_addr;
239 uint64_t offset;
240 uint64_t length;
241 uint32_t remote_rkey;
242 uint32_t padding;
243} RDMADestBlock;
244
245static const char *control_desc(unsigned int rdma_control)
246{
247 static const char *strs[] = {
248 [RDMA_CONTROL_NONE] = "NONE",
249 [RDMA_CONTROL_ERROR] = "ERROR",
250 [RDMA_CONTROL_READY] = "READY",
251 [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
252 [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
253 [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
254 [RDMA_CONTROL_COMPRESS] = "COMPRESS",
255 [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
256 [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
257 [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
258 [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
259 [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
260 };
261
262 if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
263 return "??BAD CONTROL VALUE??";
264 }
265
266 return strs[rdma_control];
267}
268
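/* 64-bit byte-order helpers, since <arpa/inet.h> only covers 32 bits. */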
269static uint64_t htonll(uint64_t v)
270{
271 union { uint32_t lv[2]; uint64_t llv; } u;
272 u.lv[0] = htonl(v >> 32);
273 u.lv[1] = htonl(v & 0xFFFFFFFFULL);
274 return u.llv;
275}
276
277static uint64_t ntohll(uint64_t v)
278{
279 union { uint32_t lv[2]; uint64_t llv; } u;
280 u.llv = v;
281 return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
282}
283
284static void dest_block_to_network(RDMADestBlock *db)
285{
286 db->remote_host_addr = htonll(db->remote_host_addr);
287 db->offset = htonll(db->offset);
288 db->length = htonll(db->length);
289 db->remote_rkey = htonl(db->remote_rkey);
290}
291
292static void network_to_dest_block(RDMADestBlock *db)
293{
294 db->remote_host_addr = ntohll(db->remote_host_addr);
295 db->offset = ntohll(db->offset);
296 db->length = ntohll(db->length);
297 db->remote_rkey = ntohl(db->remote_rkey);
298}
299
300
301
302
303
304
305typedef struct RDMALocalBlocks {
306 int nb_blocks;
307 bool init;
308 RDMALocalBlock *block;
309} RDMALocalBlocks;
310
311
312
313
314
315
316
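/*
 * Principal state of an RDMA migration connection; one instance per
 * direction (main path and return path).
 */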
317typedef struct RDMAContext {
318 char *host;
319 int port;
320 char *host_port;
321
322 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
323
324
325
326
327
328
329
330
331 int control_ready_expected;
332
333
334 int nb_sent;
335
336
337
338 uint64_t current_addr;
339 uint64_t current_length;
340
341 int current_index;
342
343 int current_chunk;
344
345 bool pin_all;
346
347
348
349
350
351
352
353
354 struct rdma_cm_id *cm_id;
355 struct rdma_cm_id *listen_id;
356 bool connected;
357
358 struct ibv_context *verbs;
359 struct rdma_event_channel *channel;
360 struct ibv_qp *qp;
361 struct ibv_comp_channel *recv_comp_channel;
362 struct ibv_comp_channel *send_comp_channel;
363 struct ibv_pd *pd;
364 struct ibv_cq *recv_cq;
365 struct ibv_cq *send_cq;
366
367
368
369
370
371
372 int error_state;
373 int error_reported;
374 int received_error;
375
376
377
378
379 RDMALocalBlocks local_ram_blocks;
380 RDMADestBlock *dest_blocks;
381
382
383 unsigned int next_src_index;
384
385
386
387
388
389
390 int migration_started_on_destination;
391
392 int total_registrations;
393 int total_writes;
394
395 int unregister_current, unregister_next;
396 uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
397
398 GHashTable *blockmap;
399
400
401 struct RDMAContext *return_path;
402 bool is_return_path;
403} RDMAContext;
404
405#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
406OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)
407
408
409
410struct QIOChannelRDMA {
411 QIOChannel parent;
412 RDMAContext *rdmain;
413 RDMAContext *rdmaout;
414 QEMUFile *file;
415 bool blocking;
416};
417
418
419
420
421
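/* Header prepended to every control-channel message (network byte order). */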
422typedef struct QEMU_PACKED {
423 uint32_t len;
424 uint32_t type;
425 uint32_t repeat;
426 uint32_t padding;
427} RDMAControlHeader;
428
429static void control_to_network(RDMAControlHeader *control)
430{
431 control->type = htonl(control->type);
432 control->len = htonl(control->len);
433 control->repeat = htonl(control->repeat);
434}
435
436static void network_to_control(RDMAControlHeader *control)
437{
438 control->type = ntohl(control->type);
439 control->len = ntohl(control->len);
440 control->repeat = ntohl(control->repeat);
441}
442
443
444
445
446
447
448
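/*
 * Payload of REGISTER/UNREGISTER requests: identifies a block and either a
 * host address (for RAM blocks) or a chunk number.
 */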
449typedef struct QEMU_PACKED {
450 union QEMU_PACKED {
451 uint64_t current_addr;
452 uint64_t chunk;
453 } key;
454 uint32_t current_index;
455 uint32_t padding;
456 uint64_t chunks;
457} RDMARegister;
458
459static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
460{
461 RDMALocalBlock *local_block;
462 local_block = &rdma->local_ram_blocks.block[reg->current_index];
463
464 if (local_block->is_ram_block) {
465
466
467
468
469 reg->key.current_addr -= local_block->offset;
470 reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
471 }
472 reg->key.current_addr = htonll(reg->key.current_addr);
473 reg->current_index = htonl(reg->current_index);
474 reg->chunks = htonll(reg->chunks);
475}
476
477static void network_to_register(RDMARegister *reg)
478{
479 reg->key.current_addr = ntohll(reg->key.current_addr);
480 reg->current_index = ntohl(reg->current_index);
481 reg->chunks = ntohll(reg->chunks);
482}
483
484typedef struct QEMU_PACKED {
485 uint32_t value;
486 uint32_t block_idx;
487 uint64_t offset;
488 uint64_t length;
489} RDMACompress;
490
491static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
492{
493 comp->value = htonl(comp->value);
494
495
496
497
498 comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
499 comp->offset += rdma->dest_blocks[comp->block_idx].offset;
500 comp->block_idx = htonl(comp->block_idx);
501 comp->offset = htonll(comp->offset);
502 comp->length = htonll(comp->length);
503}
504
505static void network_to_compress(RDMACompress *comp)
506{
507 comp->value = ntohl(comp->value);
508 comp->block_idx = ntohl(comp->block_idx);
509 comp->offset = ntohll(comp->offset);
510 comp->length = ntohll(comp->length);
511}
512
513
514
515
516
517
518typedef struct QEMU_PACKED {
519 uint32_t rkey;
520 uint32_t padding;
521 uint64_t host_addr;
522} RDMARegisterResult;
523
static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}
535
536const char *print_wrid(int wrid);
537static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
538 uint8_t *data, RDMAControlHeader *resp,
539 int *resp_idx,
540 int (*callback)(RDMAContext *rdma));
541
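/* Helpers mapping host addresses to registration chunks and back. */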
542static inline uint64_t ram_chunk_index(const uint8_t *start,
543 const uint8_t *host)
544{
545 return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
546}
547
548static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
549 uint64_t i)
550{
551 return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
552 (i << RDMA_REG_CHUNK_SHIFT));
553}
554
555static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
556 uint64_t i)
557{
558 uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
559 (1UL << RDMA_REG_CHUNK_SHIFT);
560
561 if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
562 result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
563 }
564
565 return result;
566}
567
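/*
 * Append a RAM block to local_ram_blocks, growing the array and keeping the
 * offset-keyed blockmap hash table consistent.
 */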
568static int rdma_add_block(RDMAContext *rdma, const char *block_name,
569 void *host_addr,
570 ram_addr_t block_offset, uint64_t length)
571{
572 RDMALocalBlocks *local = &rdma->local_ram_blocks;
573 RDMALocalBlock *block;
574 RDMALocalBlock *old = local->block;
575
576 local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
577
578 if (local->nb_blocks) {
579 int x;
580
581 if (rdma->blockmap) {
582 for (x = 0; x < local->nb_blocks; x++) {
583 g_hash_table_remove(rdma->blockmap,
584 (void *)(uintptr_t)old[x].offset);
585 g_hash_table_insert(rdma->blockmap,
586 (void *)(uintptr_t)old[x].offset,
587 &local->block[x]);
588 }
589 }
590 memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
591 g_free(old);
592 }
593
594 block = &local->block[local->nb_blocks];
595
596 block->block_name = g_strdup(block_name);
597 block->local_host_addr = host_addr;
598 block->offset = block_offset;
599 block->length = length;
600 block->index = local->nb_blocks;
601 block->src_index = ~0U;
602 block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
603 block->transit_bitmap = bitmap_new(block->nb_chunks);
604 bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
605 block->unregister_bitmap = bitmap_new(block->nb_chunks);
606 bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
607 block->remote_keys = g_new0(uint32_t, block->nb_chunks);
608
609 block->is_ram_block = local->init ? false : true;
610
611 if (rdma->blockmap) {
612 g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
613 }
614
615 trace_rdma_add_block(block_name, local->nb_blocks,
616 (uintptr_t) block->local_host_addr,
617 block->offset, block->length,
618 (uintptr_t) (block->local_host_addr + block->length),
619 BITS_TO_LONGS(block->nb_chunks) *
620 sizeof(unsigned long) * 8,
621 block->nb_chunks);
622
623 local->nb_blocks++;
624
625 return 0;
626}
627
628
629
630
631
632
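/* foreach_not_ignored_block() callback: add one RAM block to our list. */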
633static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
634{
635 const char *block_name = qemu_ram_get_idstr(rb);
636 void *host_addr = qemu_ram_get_host_addr(rb);
637 ram_addr_t block_offset = qemu_ram_get_offset(rb);
638 ram_addr_t length = qemu_ram_get_used_length(rb);
639 return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
640}
641
642
643
644
645
646
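/* Build the local RAM block list by walking QEMU's (non-ignored) RAM blocks. */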
647static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
648{
649 RDMALocalBlocks *local = &rdma->local_ram_blocks;
650 int ret;
651
652 assert(rdma->blockmap == NULL);
653 memset(local, 0, sizeof *local);
654 ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
655 if (ret) {
656 return ret;
657 }
658 trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
659 rdma->dest_blocks = g_new0(RDMADestBlock,
660 rdma->local_ram_blocks.nb_blocks);
661 local->init = true;
662 return 0;
663}
664
665
666
667
668
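/*
 * Remove a block from local_ram_blocks, deregistering any memory regions and
 * renumbering the remaining blocks.
 */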
669static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
670{
671 RDMALocalBlocks *local = &rdma->local_ram_blocks;
672 RDMALocalBlock *old = local->block;
673 int x;
674
675 if (rdma->blockmap) {
676 g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
677 }
678 if (block->pmr) {
679 int j;
680
681 for (j = 0; j < block->nb_chunks; j++) {
682 if (!block->pmr[j]) {
683 continue;
684 }
685 ibv_dereg_mr(block->pmr[j]);
686 rdma->total_registrations--;
687 }
688 g_free(block->pmr);
689 block->pmr = NULL;
690 }
691
692 if (block->mr) {
693 ibv_dereg_mr(block->mr);
694 rdma->total_registrations--;
695 block->mr = NULL;
696 }
697
698 g_free(block->transit_bitmap);
699 block->transit_bitmap = NULL;
700
701 g_free(block->unregister_bitmap);
702 block->unregister_bitmap = NULL;
703
704 g_free(block->remote_keys);
705 block->remote_keys = NULL;
706
707 g_free(block->block_name);
708 block->block_name = NULL;
709
710 if (rdma->blockmap) {
711 for (x = 0; x < local->nb_blocks; x++) {
712 g_hash_table_remove(rdma->blockmap,
713 (void *)(uintptr_t)old[x].offset);
714 }
715 }
716
717 if (local->nb_blocks > 1) {
718
719 local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
720
721 if (block->index) {
722 memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
723 }
724
725 if (block->index < (local->nb_blocks - 1)) {
726 memcpy(local->block + block->index, old + (block->index + 1),
727 sizeof(RDMALocalBlock) *
728 (local->nb_blocks - (block->index + 1)));
729 for (x = block->index; x < local->nb_blocks - 1; x++) {
730 local->block[x].index--;
731 }
732 }
733 } else {
734 assert(block == local->block);
735 local->block = NULL;
736 }
737
738 trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
739 block->offset, block->length,
740 (uintptr_t)(block->local_host_addr + block->length),
741 BITS_TO_LONGS(block->nb_chunks) *
742 sizeof(unsigned long) * 8, block->nb_chunks);
743
744 g_free(old);
745
746 local->nb_blocks--;
747
748 if (local->nb_blocks && rdma->blockmap) {
749 for (x = 0; x < local->nb_blocks; x++) {
750 g_hash_table_insert(rdma->blockmap,
751 (void *)(uintptr_t)local->block[x].offset,
752 &local->block[x]);
753 }
754 }
755
756 return 0;
757}
758
759
760
761
762
763static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
764{
765 struct ibv_port_attr port;
766
767 if (ibv_query_port(verbs, 1, &port)) {
768 error_report("Failed to query port information");
769 return;
770 }
771
772 printf("%s RDMA Device opened: kernel name %s "
773 "uverbs device name %s, "
774 "infiniband_verbs class device path %s, "
775 "infiniband class device path %s, "
776 "transport: (%d) %s\n",
777 who,
778 verbs->device->name,
779 verbs->device->dev_name,
780 verbs->device->dev_path,
781 verbs->device->ibdev_path,
782 port.link_layer,
783 (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
784 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
785 ? "Ethernet" : "Unknown"));
786}
787
788
789
790
791
792
793static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
794{
795 char sgid[33];
796 char dgid[33];
797 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
798 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
799 trace_qemu_rdma_dump_gid(who, sgid, dgid);
800}
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
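/*
 * IPv6 over RoCE / iWARP is not supported by the Linux kernel's RDMA stack;
 * detect configurations that would rely on it and warn or fail early.
 */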
845static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
846{
847
848#ifdef CONFIG_LINUX
849 struct ibv_port_attr port_attr;
850
851
852
853
854
855
856
857
858
859
860 if (!verbs) {
861 int num_devices, x;
862 struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
863 bool roce_found = false;
864 bool ib_found = false;
865
866 for (x = 0; x < num_devices; x++) {
867 verbs = ibv_open_device(dev_list[x]);
868 if (!verbs) {
869 if (errno == EPERM) {
870 continue;
871 } else {
872 return -EINVAL;
873 }
874 }
875
876 if (ibv_query_port(verbs, 1, &port_attr)) {
877 ibv_close_device(verbs);
878 ERROR(errp, "Could not query initial IB port");
879 return -EINVAL;
880 }
881
882 if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
883 ib_found = true;
884 } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
885 roce_found = true;
886 }
887
888 ibv_close_device(verbs);
889
890 }
891
892 if (roce_found) {
893 if (ib_found) {
894 fprintf(stderr, "WARN: migrations may fail:"
895 " IPv6 over RoCE / iWARP in linux"
896 " is broken. But since you appear to have a"
897 " mixed RoCE / IB environment, be sure to only"
                            " migrate over the IB fabric until the kernel"
                            " fixes the bug.\n");
900 } else {
901 ERROR(errp, "You only have RoCE / iWARP devices in your systems"
902 " and your management software has specified '[::]'"
903 ", but IPv6 over RoCE / iWARP is not supported in Linux.");
904 return -ENONET;
905 }
906 }
907
908 return 0;
909 }
910
911
912
913
914
915
916
917
918 if (ibv_query_port(verbs, 1, &port_attr)) {
919 ERROR(errp, "Could not query initial IB port");
920 return -EINVAL;
921 }
922
923 if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
924 ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
925 "(but patches on linux-rdma in progress)");
926 return -ENONET;
927 }
928
929#endif
930
931 return 0;
932}
933
934
935
936
937
938
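/*
 * Resolve the destination address and route, trying each address returned
 * by rdma_getaddrinfo() and skipping IPv6 ones affected by the RoCE/iWARP
 * kernel limitation above.
 */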
939static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
940{
941 int ret;
942 struct rdma_addrinfo *res;
943 char port_str[16];
944 struct rdma_cm_event *cm_event;
945 char ip[40] = "unknown";
946 struct rdma_addrinfo *e;
947
948 if (rdma->host == NULL || !strcmp(rdma->host, "")) {
949 ERROR(errp, "RDMA hostname has not been set");
950 return -EINVAL;
951 }
952
953
954 rdma->channel = rdma_create_event_channel();
955 if (!rdma->channel) {
956 ERROR(errp, "could not create CM channel");
957 return -EINVAL;
958 }
959
960
961 ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
962 if (ret) {
963 ERROR(errp, "could not create channel id");
964 goto err_resolve_create_id;
965 }
966
967 snprintf(port_str, 16, "%d", rdma->port);
968 port_str[15] = '\0';
969
970 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
971 if (ret < 0) {
972 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
973 goto err_resolve_get_addr;
974 }
975
976 for (e = res; e != NULL; e = e->ai_next) {
977 inet_ntop(e->ai_family,
978 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
979 trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
980
981 ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
982 RDMA_RESOLVE_TIMEOUT_MS);
983 if (!ret) {
984 if (e->ai_family == AF_INET6) {
985 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
986 if (ret) {
987 continue;
988 }
989 }
990 goto route;
991 }
992 }
993
994 rdma_freeaddrinfo(res);
995 ERROR(errp, "could not resolve address %s", rdma->host);
996 goto err_resolve_get_addr;
997
998route:
999 rdma_freeaddrinfo(res);
1000 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
1001
1002 ret = rdma_get_cm_event(rdma->channel, &cm_event);
1003 if (ret) {
1004 ERROR(errp, "could not perform event_addr_resolved");
1005 goto err_resolve_get_addr;
1006 }
1007
1008 if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
1009 ERROR(errp, "result not equal to event_addr_resolved %s",
1010 rdma_event_str(cm_event->event));
1011 error_report("rdma_resolve_addr");
1012 rdma_ack_cm_event(cm_event);
1013 ret = -EINVAL;
1014 goto err_resolve_get_addr;
1015 }
1016 rdma_ack_cm_event(cm_event);
1017
1018
1019 ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
1020 if (ret) {
1021 ERROR(errp, "could not resolve rdma route");
1022 goto err_resolve_get_addr;
1023 }
1024
1025 ret = rdma_get_cm_event(rdma->channel, &cm_event);
1026 if (ret) {
1027 ERROR(errp, "could not perform event_route_resolved");
1028 goto err_resolve_get_addr;
1029 }
1030 if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
1031 ERROR(errp, "result not equal to event_route_resolved: %s",
1032 rdma_event_str(cm_event->event));
1033 rdma_ack_cm_event(cm_event);
1034 ret = -EINVAL;
1035 goto err_resolve_get_addr;
1036 }
1037 rdma_ack_cm_event(cm_event);
1038 rdma->verbs = rdma->cm_id->verbs;
1039 qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1040 qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1041 return 0;
1042
1043err_resolve_get_addr:
1044 rdma_destroy_id(rdma->cm_id);
1045 rdma->cm_id = NULL;
1046err_resolve_create_id:
1047 rdma_destroy_event_channel(rdma->channel);
1048 rdma->channel = NULL;
1049 return ret;
1050}
1051
1052
1053
1054
1055static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
1056{
1057
1058 rdma->pd = ibv_alloc_pd(rdma->verbs);
1059 if (!rdma->pd) {
1060 error_report("failed to allocate protection domain");
1061 return -1;
1062 }
1063
1064
1065 rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
1066 if (!rdma->recv_comp_channel) {
1067 error_report("failed to allocate receive completion channel");
1068 goto err_alloc_pd_cq;
1069 }
1070
1071
1072
1073
1074 rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1075 NULL, rdma->recv_comp_channel, 0);
1076 if (!rdma->recv_cq) {
1077 error_report("failed to allocate receive completion queue");
1078 goto err_alloc_pd_cq;
1079 }
1080
1081
1082 rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
1083 if (!rdma->send_comp_channel) {
1084 error_report("failed to allocate send completion channel");
1085 goto err_alloc_pd_cq;
1086 }
1087
1088 rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1089 NULL, rdma->send_comp_channel, 0);
1090 if (!rdma->send_cq) {
1091 error_report("failed to allocate send completion queue");
1092 goto err_alloc_pd_cq;
1093 }
1094
1095 return 0;
1096
1097err_alloc_pd_cq:
1098 if (rdma->pd) {
1099 ibv_dealloc_pd(rdma->pd);
1100 }
1101 if (rdma->recv_comp_channel) {
1102 ibv_destroy_comp_channel(rdma->recv_comp_channel);
1103 }
1104 if (rdma->send_comp_channel) {
1105 ibv_destroy_comp_channel(rdma->send_comp_channel);
1106 }
1107 if (rdma->recv_cq) {
1108 ibv_destroy_cq(rdma->recv_cq);
1109 rdma->recv_cq = NULL;
1110 }
1111 rdma->pd = NULL;
1112 rdma->recv_comp_channel = NULL;
1113 rdma->send_comp_channel = NULL;
1114 return -1;
1115
1116}
1117
1118
1119
1120
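/* Create the queue pair used for both control messages and RDMA writes. */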
1121static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1122{
1123 struct ibv_qp_init_attr attr = { 0 };
1124 int ret;
1125
1126 attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1127 attr.cap.max_recv_wr = 3;
1128 attr.cap.max_send_sge = 1;
1129 attr.cap.max_recv_sge = 1;
1130 attr.send_cq = rdma->send_cq;
1131 attr.recv_cq = rdma->recv_cq;
1132 attr.qp_type = IBV_QPT_RC;
1133
1134 ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1135 if (ret) {
1136 return -1;
1137 }
1138
1139 rdma->qp = rdma->cm_id->qp;
1140 return 0;
1141}
1142
1143
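/* Check whether the device supports On-Demand Paging (ODP). */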
1144static bool rdma_support_odp(struct ibv_context *dev)
1145{
1146 struct ibv_device_attr_ex attr = {0};
1147 int ret = ibv_query_device_ex(dev, NULL, &attr);
1148 if (ret) {
1149 return false;
1150 }
1151
1152 if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
1153 return true;
1154 }
1155
1156 return false;
1157}
1158
1159
1160
1161
1162
1163
1164static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
1165 uint32_t len, uint32_t lkey,
1166 const char *name, bool wr)
1167{
1168#ifdef HAVE_IBV_ADVISE_MR
1169 int ret;
1170 int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
1171 IBV_ADVISE_MR_ADVICE_PREFETCH;
1172 struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};
1173
1174 ret = ibv_advise_mr(pd, advice,
1175 IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
1176
    if (ret) {
        trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
    } else {
        trace_qemu_rdma_advise_mr(name, len, addr, "succeeded");
    }
1182#endif
1183}
1184
1185static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
1186{
1187 int i;
1188 RDMALocalBlocks *local = &rdma->local_ram_blocks;
1189
1190 for (i = 0; i < local->nb_blocks; i++) {
1191 int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
1192
1193 local->block[i].mr =
1194 ibv_reg_mr(rdma->pd,
1195 local->block[i].local_host_addr,
1196 local->block[i].length, access
1197 );
1198
1199 if (!local->block[i].mr &&
1200 errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1201 access |= IBV_ACCESS_ON_DEMAND;
1202
1203 local->block[i].mr =
1204 ibv_reg_mr(rdma->pd,
1205 local->block[i].local_host_addr,
1206 local->block[i].length, access);
1207 trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
1208
1209 if (local->block[i].mr) {
1210 qemu_rdma_advise_prefetch_mr(rdma->pd,
1211 (uintptr_t)local->block[i].local_host_addr,
1212 local->block[i].length,
1213 local->block[i].mr->lkey,
1214 local->block[i].block_name,
1215 true);
1216 }
1217 }
1218
1219 if (!local->block[i].mr) {
1220 perror("Failed to register local dest ram block!");
1221 break;
1222 }
1223 rdma->total_registrations++;
1224 }
1225
1226 if (i >= local->nb_blocks) {
1227 return 0;
1228 }
1229
1230 for (i--; i >= 0; i--) {
1231 ibv_dereg_mr(local->block[i].mr);
1232 local->block[i].mr = NULL;
1233 rdma->total_registrations--;
1234 }
1235
1236 return -1;
1237
1238}
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249static int qemu_rdma_search_ram_block(RDMAContext *rdma,
1250 uintptr_t block_offset,
1251 uint64_t offset,
1252 uint64_t length,
1253 uint64_t *block_index,
1254 uint64_t *chunk_index)
1255{
1256 uint64_t current_addr = block_offset + offset;
1257 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1258 (void *) block_offset);
1259 assert(block);
1260 assert(current_addr >= block->offset);
1261 assert((current_addr + length) <= (block->offset + block->length));
1262
1263 *block_index = block->index;
1264 *chunk_index = ram_chunk_index(block->local_host_addr,
1265 block->local_host_addr + (current_addr - block->offset));
1266
1267 return 0;
1268}
1269
1270
1271
1272
1273
1274
1275
1276
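/*
 * Register (on demand) the memory backing a chunk and return its local
 * and/or remote keys; if the whole block is already registered, reuse that.
 */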
1277static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
1278 RDMALocalBlock *block, uintptr_t host_addr,
1279 uint32_t *lkey, uint32_t *rkey, int chunk,
1280 uint8_t *chunk_start, uint8_t *chunk_end)
1281{
1282 if (block->mr) {
1283 if (lkey) {
1284 *lkey = block->mr->lkey;
1285 }
1286 if (rkey) {
1287 *rkey = block->mr->rkey;
1288 }
1289 return 0;
1290 }
1291
1292
1293 if (!block->pmr) {
1294 block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1295 }
1296
1297
1298
1299
1300
1301
1302 if (!block->pmr[chunk]) {
1303 uint64_t len = chunk_end - chunk_start;
1304 int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
1305 0;
1306
1307 trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1308
1309 block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1310 if (!block->pmr[chunk] &&
1311 errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1312 access |= IBV_ACCESS_ON_DEMAND;
1313
1314 block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1315 trace_qemu_rdma_register_odp_mr(block->block_name);
1316
1317 if (block->pmr[chunk]) {
1318 qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
1319 len, block->pmr[chunk]->lkey,
1320 block->block_name, rkey);
1321
1322 }
1323 }
1324 }
1325 if (!block->pmr[chunk]) {
1326 perror("Failed to register chunk!");
1327 fprintf(stderr, "Chunk details: block: %d chunk index %d"
1328 " start %" PRIuPTR " end %" PRIuPTR
1329 " host %" PRIuPTR
1330 " local %" PRIuPTR " registrations: %d\n",
1331 block->index, chunk, (uintptr_t)chunk_start,
1332 (uintptr_t)chunk_end, host_addr,
1333 (uintptr_t)block->local_host_addr,
1334 rdma->total_registrations);
1335 return -1;
1336 }
1337 rdma->total_registrations++;
1338
1339 if (lkey) {
1340 *lkey = block->pmr[chunk]->lkey;
1341 }
1342 if (rkey) {
1343 *rkey = block->pmr[chunk]->rkey;
1344 }
1345 return 0;
1346}
1347
1348
1349
1350
1351
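/* Register the fixed buffer used for one control-channel slot. */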
1352static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1353{
1354 rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1355 rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1356 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1357 if (rdma->wr_data[idx].control_mr) {
1358 rdma->total_registrations++;
1359 return 0;
1360 }
1361 error_report("qemu_rdma_reg_control failed");
1362 return -1;
1363}
1364
1365const char *print_wrid(int wrid)
1366{
1367 if (wrid >= RDMA_WRID_RECV_CONTROL) {
1368 return wrid_desc[RDMA_WRID_RECV_CONTROL];
1369 }
1370 return wrid_desc[wrid];
1371}
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
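/*
 * Drain the queue of chunks flagged for unregistration: deregister each one
 * locally and ask the destination to do the same.
 */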
1408static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1409{
1410 while (rdma->unregistrations[rdma->unregister_current]) {
1411 int ret;
1412 uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1413 uint64_t chunk =
1414 (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1415 uint64_t index =
1416 (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1417 RDMALocalBlock *block =
1418 &(rdma->local_ram_blocks.block[index]);
1419 RDMARegister reg = { .current_index = index };
1420 RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1421 };
1422 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1423 .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1424 .repeat = 1,
1425 };
1426
1427 trace_qemu_rdma_unregister_waiting_proc(chunk,
1428 rdma->unregister_current);
1429
1430 rdma->unregistrations[rdma->unregister_current] = 0;
1431 rdma->unregister_current++;
1432
1433 if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1434 rdma->unregister_current = 0;
1435 }
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445 clear_bit(chunk, block->unregister_bitmap);
1446
1447 if (test_bit(chunk, block->transit_bitmap)) {
1448 trace_qemu_rdma_unregister_waiting_inflight(chunk);
1449 continue;
1450 }
1451
1452 trace_qemu_rdma_unregister_waiting_send(chunk);
1453
1454 ret = ibv_dereg_mr(block->pmr[chunk]);
1455 block->pmr[chunk] = NULL;
1456 block->remote_keys[chunk] = 0;
1457
1458 if (ret != 0) {
1459 perror("unregistration chunk failed");
1460 return -ret;
1461 }
1462 rdma->total_registrations--;
1463
1464 reg.key.chunk = chunk;
        register_to_network(rdma, &reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                      &resp, NULL, NULL);
1468 if (ret < 0) {
1469 return ret;
1470 }
1471
1472 trace_qemu_rdma_unregister_waiting_complete(chunk);
1473 }
1474
1475 return 0;
1476}
1477
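/*
 * Pack the message type, RAM block index and chunk number into a single
 * 64-bit work request ID (see the RDMA_WRID_*_SHIFT definitions above).
 */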
1478static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1479 uint64_t chunk)
1480{
1481 uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1482
1483 result |= (index << RDMA_WRID_BLOCK_SHIFT);
1484 result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1485
1486 return result;
1487}
1488
1489
1490
1491
1492
1493static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1494 uint64_t chunk, uint64_t wr_id)
1495{
1496 if (rdma->unregistrations[rdma->unregister_next] != 0) {
1497 error_report("rdma migration: queue is full");
1498 } else {
1499 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1500
1501 if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1502 trace_qemu_rdma_signal_unregister_append(chunk,
1503 rdma->unregister_next);
1504
1505 rdma->unregistrations[rdma->unregister_next++] =
1506 qemu_rdma_make_wrid(wr_id, index, chunk);
1507
1508 if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1509 rdma->unregister_next = 0;
1510 }
1511 } else {
1512 trace_qemu_rdma_signal_unregister_already(chunk);
1513 }
1514 }
1515}
1516
1517
1518
1519
1520
1521
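/*
 * Consume one completion from @cq, updating the transit bitmap and nb_sent
 * for RDMA writes and the READY handshake state for control receives.
 */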
1522static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
1523 uint64_t *wr_id_out, uint32_t *byte_len)
1524{
1525 int ret;
1526 struct ibv_wc wc;
1527 uint64_t wr_id;
1528
1529 ret = ibv_poll_cq(cq, 1, &wc);
1530
1531 if (!ret) {
1532 *wr_id_out = RDMA_WRID_NONE;
1533 return 0;
1534 }
1535
1536 if (ret < 0) {
1537 error_report("ibv_poll_cq return %d", ret);
1538 return ret;
1539 }
1540
1541 wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1542
1543 if (wc.status != IBV_WC_SUCCESS) {
1544 fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1545 wc.status, ibv_wc_status_str(wc.status));
1546 fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1547
1548 return -1;
1549 }
1550
1551 if (rdma->control_ready_expected &&
1552 (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1553 trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
1554 wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
1555 rdma->control_ready_expected = 0;
1556 }
1557
1558 if (wr_id == RDMA_WRID_RDMA_WRITE) {
1559 uint64_t chunk =
1560 (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1561 uint64_t index =
1562 (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1563 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1564
1565 trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
1566 index, chunk, block->local_host_addr,
1567 (void *)(uintptr_t)block->remote_host_addr);
1568
1569 clear_bit(chunk, block->transit_bitmap);
1570
1571 if (rdma->nb_sent > 0) {
1572 rdma->nb_sent--;
1573 }
1574
1575 if (!rdma->pin_all) {
1576
1577
1578
1579
1580
1581
1582#ifdef RDMA_UNREGISTRATION_EXAMPLE
1583 qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
1584#endif
1585 }
1586 } else {
1587 trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
1588 }
1589
1590 *wr_id_out = wc.wr_id;
1591 if (byte_len) {
1592 *byte_len = wc.byte_len;
1593 }
1594
1595 return 0;
1596}
1597
1598
1599
1600
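/*
 * Wait for activity on a completion channel while also watching the CM
 * event channel for disconnects and honouring migration cancellation.
 */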
1601static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
1602 struct ibv_comp_channel *comp_channel)
1603{
1604 struct rdma_cm_event *cm_event;
1605 int ret = -1;
1606
1607
1608
1609
1610
1611 if (rdma->migration_started_on_destination &&
1612 migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
1613 yield_until_fd_readable(comp_channel->fd);
1614 } else {
1615
1616
1617
1618
1619
1620
1621
1622 while (!rdma->error_state && !rdma->received_error) {
1623 GPollFD pfds[2];
1624 pfds[0].fd = comp_channel->fd;
1625 pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1626 pfds[0].revents = 0;
1627
1628 pfds[1].fd = rdma->channel->fd;
1629 pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1630 pfds[1].revents = 0;
1631
1632
1633 switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
1634 case 2:
1635 case 1:
1636 if (pfds[0].revents) {
1637 return 0;
1638 }
1639
1640 if (pfds[1].revents) {
1641 ret = rdma_get_cm_event(rdma->channel, &cm_event);
                    if (ret) {
                        error_report("failed to get a cm event while waiting "
                                     "on the completion channel");
                        return -EPIPE;
                    }

                    error_report("received a cm event while waiting on the "
                                 "completion channel, cm event is %d",
                                 cm_event->event);
1650 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
1651 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
1652 rdma_ack_cm_event(cm_event);
1653 return -EPIPE;
1654 }
1655 rdma_ack_cm_event(cm_event);
1656 }
1657 break;
1658
1659 case 0:
1660 break;
1661
1662 default:
1663
1664
1665 error_report("%s: poll failed", __func__);
1666 return -EPIPE;
1667 }
1668
1669 if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
1670
1671 return -EPIPE;
1672 }
1673 }
1674 }
1675
1676 if (rdma->received_error) {
1677 return -EPIPE;
1678 }
1679 return rdma->error_state;
1680}
1681
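/*
 * Control RECVs complete on the recv CQ/channel; control SENDs and RDMA
 * writes complete on the send side.
 */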
1682static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
1683{
1684 return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
1685 rdma->recv_comp_channel;
1686}
1687
1688static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
1689{
1690 return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
1691}
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
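/*
 * Poll (and, once the CQ is drained, sleep on the completion channel) until
 * the work request identified by @wrid_requested has completed.
 */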
1706static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
1707 uint32_t *byte_len)
1708{
1709 int num_cq_events = 0, ret = 0;
1710 struct ibv_cq *cq;
1711 void *cq_ctx;
1712 uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1713 struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
1714 struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
1715
1716 if (ibv_req_notify_cq(poll_cq, 0)) {
1717 return -1;
1718 }
1719
1720 while (wr_id != wrid_requested) {
1721 ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1722 if (ret < 0) {
1723 return ret;
1724 }
1725
1726 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1727
1728 if (wr_id == RDMA_WRID_NONE) {
1729 break;
1730 }
1731 if (wr_id != wrid_requested) {
1732 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1733 wrid_requested, print_wrid(wr_id), wr_id);
1734 }
1735 }
1736
1737 if (wr_id == wrid_requested) {
1738 return 0;
1739 }
1740
1741 while (1) {
1742 ret = qemu_rdma_wait_comp_channel(rdma, ch);
1743 if (ret) {
1744 goto err_block_for_wrid;
1745 }
1746
1747 ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
1748 if (ret) {
1749 perror("ibv_get_cq_event");
1750 goto err_block_for_wrid;
1751 }
1752
1753 num_cq_events++;
1754
1755 ret = -ibv_req_notify_cq(cq, 0);
1756 if (ret) {
1757 goto err_block_for_wrid;
1758 }
1759
1760 while (wr_id != wrid_requested) {
1761 ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1762 if (ret < 0) {
1763 goto err_block_for_wrid;
1764 }
1765
1766 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1767
1768 if (wr_id == RDMA_WRID_NONE) {
1769 break;
1770 }
1771 if (wr_id != wrid_requested) {
1772 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1773 wrid_requested, print_wrid(wr_id), wr_id);
1774 }
1775 }
1776
1777 if (wr_id == wrid_requested) {
1778 goto success_block_for_wrid;
1779 }
1780 }
1781
1782success_block_for_wrid:
1783 if (num_cq_events) {
1784 ibv_ack_cq_events(cq, num_cq_events);
1785 }
1786 return 0;
1787
1788err_block_for_wrid:
1789 if (num_cq_events) {
1790 ibv_ack_cq_events(cq, num_cq_events);
1791 }
1792
1793 rdma->error_state = ret;
1794 return ret;
1795}
1796
1797
1798
1799
1800
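/*
 * SEND one control message: copy the header and payload into the
 * pre-registered control buffer, post the SEND and wait for its completion.
 */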
1801static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1802 RDMAControlHeader *head)
1803{
1804 int ret = 0;
1805 RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1806 struct ibv_send_wr *bad_wr;
1807 struct ibv_sge sge = {
1808 .addr = (uintptr_t)(wr->control),
1809 .length = head->len + sizeof(RDMAControlHeader),
1810 .lkey = wr->control_mr->lkey,
1811 };
1812 struct ibv_send_wr send_wr = {
1813 .wr_id = RDMA_WRID_SEND_CONTROL,
1814 .opcode = IBV_WR_SEND,
1815 .send_flags = IBV_SEND_SIGNALED,
1816 .sg_list = &sge,
1817 .num_sge = 1,
1818 };
1819
1820 trace_qemu_rdma_post_send_control(control_desc(head->type));
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830 assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1831 memcpy(wr->control, head, sizeof(RDMAControlHeader));
1832 control_to_network((void *) wr->control);
1833
1834 if (buf) {
1835 memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1836 }
1837
1838
1839 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1840
1841 if (ret > 0) {
1842 error_report("Failed to use post IB SEND for control");
1843 return -ret;
1844 }
1845
1846 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1847 if (ret < 0) {
1848 error_report("rdma migration: send polling control error");
1849 }
1850
1851 return ret;
1852}
1853
1854
1855
1856
1857
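/* Post a RECV so the next incoming control message lands in wr_data[idx]. */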
1858static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1859{
1860 struct ibv_recv_wr *bad_wr;
1861 struct ibv_sge sge = {
1862 .addr = (uintptr_t)(rdma->wr_data[idx].control),
1863 .length = RDMA_CONTROL_MAX_BUFFER,
1864 .lkey = rdma->wr_data[idx].control_mr->lkey,
1865 };
1866
1867 struct ibv_recv_wr recv_wr = {
1868 .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1869 .sg_list = &sge,
1870 .num_sge = 1,
1871 };
1872
1873
1874 if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1875 return -1;
1876 }
1877
1878 return 0;
1879}
1880
1881
1882
1883
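/* Block until a control message arrives in slot @idx and validate it. */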
1884static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1885 RDMAControlHeader *head, int expecting, int idx)
1886{
1887 uint32_t byte_len;
1888 int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1889 &byte_len);
1890
1891 if (ret < 0) {
1892 error_report("rdma migration: recv polling control error!");
1893 return ret;
1894 }
1895
1896 network_to_control((void *) rdma->wr_data[idx].control);
1897 memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1898
1899 trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1900
1901 if (expecting == RDMA_CONTROL_NONE) {
1902 trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1903 head->type);
1904 } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1905 error_report("Was expecting a %s (%d) control message"
1906 ", but got: %s (%d), length: %d",
1907 control_desc(expecting), expecting,
1908 control_desc(head->type), head->type, head->len);
1909 if (head->type == RDMA_CONTROL_ERROR) {
1910 rdma->received_error = true;
1911 }
1912 return -EIO;
1913 }
1914 if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
1915 error_report("too long length: %d", head->len);
1916 return -EINVAL;
1917 }
1918 if (sizeof(*head) + head->len != byte_len) {
1919 error_report("Malformed length: %d byte_len %d", head->len, byte_len);
1920 return -EINVAL;
1921 }
1922
1923 return 0;
1924}
1925
1926
1927
1928
1929
1930
1931
1932
1933
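/*
 * Record where the payload of a freshly received control message starts and
 * how long it is, so callers can consume it via control_curr.
 */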
1934static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1935 RDMAControlHeader *head)
1936{
1937 rdma->wr_data[idx].control_len = head->len;
1938 rdma->wr_data[idx].control_curr =
1939 rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1940}
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
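/*
 * Send a control message and, if @resp is non-NULL, wait for a response of
 * that type.  The READY handshake ensures the peer has a RECV posted before
 * we SEND.
 */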
1955static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1956 uint8_t *data, RDMAControlHeader *resp,
1957 int *resp_idx,
1958 int (*callback)(RDMAContext *rdma))
1959{
1960 int ret = 0;
1961
1962
1963
1964
1965
1966 if (rdma->control_ready_expected) {
1967 RDMAControlHeader resp;
1968 ret = qemu_rdma_exchange_get_response(rdma,
1969 &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1970 if (ret < 0) {
1971 return ret;
1972 }
1973 }
1974
1975
1976
1977
1978 if (resp) {
1979 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1980 if (ret) {
1981 error_report("rdma migration: error posting"
1982 " extra control recv for anticipated result!");
1983 return ret;
1984 }
1985 }
1986
1987
1988
1989
1990 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1991 if (ret) {
1992 error_report("rdma migration: error posting first control recv!");
1993 return ret;
1994 }
1995
1996
1997
1998
1999 ret = qemu_rdma_post_send_control(rdma, data, head);
2000
2001 if (ret < 0) {
2002 error_report("Failed to send control buffer!");
2003 return ret;
2004 }
2005
2006
2007
2008
2009 if (resp) {
2010 if (callback) {
2011 trace_qemu_rdma_exchange_send_issue_callback();
2012 ret = callback(rdma);
2013 if (ret < 0) {
2014 return ret;
2015 }
2016 }
2017
2018 trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
2019 ret = qemu_rdma_exchange_get_response(rdma, resp,
2020 resp->type, RDMA_WRID_DATA);
2021
2022 if (ret < 0) {
2023 return ret;
2024 }
2025
2026 qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
2027 if (resp_idx) {
2028 *resp_idx = RDMA_WRID_DATA;
2029 }
2030 trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
2031 }
2032
2033 rdma->control_ready_expected = 1;
2034
2035 return 0;
2036}
2037
2038
2039
2040
2041
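/*
 * Destination side: announce READY, then block until a control message of
 * the expected type arrives.
 */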
2042static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
2043 int expecting)
2044{
2045 RDMAControlHeader ready = {
2046 .len = 0,
2047 .type = RDMA_CONTROL_READY,
2048 .repeat = 1,
2049 };
2050 int ret;
2051
2052
2053
2054
2055 ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
2056
2057 if (ret < 0) {
2058 error_report("Failed to send control buffer!");
2059 return ret;
2060 }
2061
2062
2063
2064
2065 ret = qemu_rdma_exchange_get_response(rdma, head,
2066 expecting, RDMA_WRID_READY);
2067
2068 if (ret < 0) {
2069 return ret;
2070 }
2071
2072 qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
2073
2074
2075
2076
2077 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2078 if (ret) {
2079 error_report("rdma migration: error posting second control recv!");
2080 return ret;
2081 }
2082
2083 return 0;
2084}
2085
2086
2087
2088
2089
2090
2091
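/*
 * Push out the current merged buffer as one RDMA write: register the
 * destination chunk on demand (or send a COMPRESS message if the data is all
 * zeroes), then post an IBV_WR_RDMA_WRITE, retrying if the send queue fills.
 */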
2092static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
2093 int current_index, uint64_t current_addr,
2094 uint64_t length)
2095{
2096 struct ibv_sge sge;
2097 struct ibv_send_wr send_wr = { 0 };
2098 struct ibv_send_wr *bad_wr;
2099 int reg_result_idx, ret, count = 0;
2100 uint64_t chunk, chunks;
2101 uint8_t *chunk_start, *chunk_end;
2102 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
2103 RDMARegister reg;
2104 RDMARegisterResult *reg_result;
2105 RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
2106 RDMAControlHeader head = { .len = sizeof(RDMARegister),
2107 .type = RDMA_CONTROL_REGISTER_REQUEST,
2108 .repeat = 1,
2109 };
2110
2111retry:
2112 sge.addr = (uintptr_t)(block->local_host_addr +
2113 (current_addr - block->offset));
2114 sge.length = length;
2115
2116 chunk = ram_chunk_index(block->local_host_addr,
2117 (uint8_t *)(uintptr_t)sge.addr);
2118 chunk_start = ram_chunk_start(block, chunk);
2119
2120 if (block->is_ram_block) {
2121 chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
2122
2123 if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2124 chunks--;
2125 }
2126 } else {
2127 chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
2128
2129 if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2130 chunks--;
2131 }
2132 }
2133
2134 trace_qemu_rdma_write_one_top(chunks + 1,
2135 (chunks + 1) *
2136 (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
2137
2138 chunk_end = ram_chunk_end(block, chunk + chunks);
2139
2140 if (!rdma->pin_all) {
2141#ifdef RDMA_UNREGISTRATION_EXAMPLE
2142 qemu_rdma_unregister_waiting(rdma);
2143#endif
2144 }
2145
2146 while (test_bit(chunk, block->transit_bitmap)) {
2147 (void)count;
2148 trace_qemu_rdma_write_one_block(count++, current_index, chunk,
2149 sge.addr, length, rdma->nb_sent, block->nb_chunks);
2150
2151 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2152
2153 if (ret < 0) {
            error_report("Failed to wait for previous write to complete "
2155 "block %d chunk %" PRIu64
2156 " current %" PRIu64 " len %" PRIu64 " %d",
2157 current_index, chunk, sge.addr, length, rdma->nb_sent);
2158 return ret;
2159 }
2160 }
2161
2162 if (!rdma->pin_all || !block->is_ram_block) {
2163 if (!block->remote_keys[chunk]) {
2164
2165
2166
2167
2168
2169
2170 if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2171 RDMACompress comp = {
2172 .offset = current_addr,
2173 .value = 0,
2174 .block_idx = current_index,
2175 .length = length,
2176 };
2177
2178 head.len = sizeof(comp);
2179 head.type = RDMA_CONTROL_COMPRESS;
2180
2181 trace_qemu_rdma_write_one_zero(chunk, sge.length,
2182 current_index, current_addr);
2183
2184 compress_to_network(rdma, &comp);
2185 ret = qemu_rdma_exchange_send(rdma, &head,
2186 (uint8_t *) &comp, NULL, NULL, NULL);
2187
2188 if (ret < 0) {
2189 return -EIO;
2190 }
2191
2192 acct_update_position(f, sge.length, true);
2193
2194 return 1;
2195 }
2196
2197
2198
2199
2200 reg.current_index = current_index;
2201 if (block->is_ram_block) {
2202 reg.key.current_addr = current_addr;
2203 } else {
2204 reg.key.chunk = chunk;
2205 }
2206 reg.chunks = chunks;
2207
2208 trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2209 current_addr);
2210
            register_to_network(rdma, &reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                          &resp, &reg_result_idx, NULL);
2214 if (ret < 0) {
2215 return ret;
2216 }
2217
2218
2219 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2220 &sge.lkey, NULL, chunk,
2221 chunk_start, chunk_end)) {
2222 error_report("cannot get lkey");
2223 return -EINVAL;
2224 }
2225
2226 reg_result = (RDMARegisterResult *)
2227 rdma->wr_data[reg_result_idx].control_curr;
2228
2229 network_to_result(reg_result);
2230
2231 trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2232 reg_result->rkey, chunk);
2233
2234 block->remote_keys[chunk] = reg_result->rkey;
2235 block->remote_host_addr = reg_result->host_addr;
2236 } else {
2237
2238 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2239 &sge.lkey, NULL, chunk,
2240 chunk_start, chunk_end)) {
2241 error_report("cannot get lkey!");
2242 return -EINVAL;
2243 }
2244 }
2245
2246 send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2247 } else {
2248 send_wr.wr.rdma.rkey = block->remote_rkey;
2249
2250 if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2251 &sge.lkey, NULL, chunk,
2252 chunk_start, chunk_end)) {
2253 error_report("cannot get lkey!");
2254 return -EINVAL;
2255 }
2256 }
2257
2258
2259
2260
2261
2262
2263
2264 send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2265 current_index, chunk);
2266
2267 send_wr.opcode = IBV_WR_RDMA_WRITE;
2268 send_wr.send_flags = IBV_SEND_SIGNALED;
2269 send_wr.sg_list = &sge;
2270 send_wr.num_sge = 1;
2271 send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2272 (current_addr - block->offset);
2273
2274 trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2275 sge.length);
2276
2277
2278
2279
2280
2281 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2282
2283 if (ret == ENOMEM) {
2284 trace_qemu_rdma_write_one_queue_full();
2285 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2286 if (ret < 0) {
2287 error_report("rdma migration: failed to make "
2288 "room in full send queue! %d", ret);
2289 return ret;
2290 }
2291
2292 goto retry;
2293
2294 } else if (ret > 0) {
2295 perror("rdma migration: post rdma write failed");
2296 return -ret;
2297 }
2298
2299 set_bit(chunk, block->transit_bitmap);
2300 acct_update_position(f, sge.length, false);
2301 rdma->total_writes++;
2302
2303 return 0;
2304}
2305
2306
2307
2308
2309
2310
2311
2312static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2313{
2314 int ret;
2315
2316 if (!rdma->current_length) {
2317 return 0;
2318 }
2319
2320 ret = qemu_rdma_write_one(f, rdma,
2321 rdma->current_index, rdma->current_addr, rdma->current_length);
2322
2323 if (ret < 0) {
2324 return ret;
2325 }
2326
2327 if (ret == 0) {
2328 rdma->nb_sent++;
2329 trace_qemu_rdma_write_flush(rdma->nb_sent);
2330 }
2331
2332 rdma->current_length = 0;
2333 rdma->current_addr = 0;
2334
2335 return 0;
2336}
2337
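/*
 * A new write can be merged into the pending one only if it is contiguous
 * with it, stays inside the same RAM block and does not cross a registration
 * chunk boundary.
 */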
2338static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2339 uint64_t offset, uint64_t len)
2340{
2341 RDMALocalBlock *block;
2342 uint8_t *host_addr;
2343 uint8_t *chunk_end;
2344
2345 if (rdma->current_index < 0) {
2346 return 0;
2347 }
2348
2349 if (rdma->current_chunk < 0) {
2350 return 0;
2351 }
2352
2353 block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2354 host_addr = block->local_host_addr + (offset - block->offset);
2355 chunk_end = ram_chunk_end(block, rdma->current_chunk);
2356
2357 if (rdma->current_length == 0) {
2358 return 0;
2359 }
2360
2361
2362
2363
2364 if (offset != (rdma->current_addr + rdma->current_length)) {
2365 return 0;
2366 }
2367
2368 if (offset < block->offset) {
2369 return 0;
2370 }
2371
2372 if ((offset + len) > (block->offset + block->length)) {
2373 return 0;
2374 }
2375
2376 if ((host_addr + len) > chunk_end) {
2377 return 0;
2378 }
2379
2380 return 1;
2381}
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
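/*
 * Queue (and possibly coalesce) a write of guest memory.  Nothing goes on
 * the wire until the buffer can no longer be merged or RDMA_MERGE_MAX is
 * reached, at which point qemu_rdma_write_flush() pushes it out.
 */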
2393static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
2394 uint64_t block_offset, uint64_t offset,
2395 uint64_t len)
2396{
2397 uint64_t current_addr = block_offset + offset;
2398 uint64_t index = rdma->current_index;
2399 uint64_t chunk = rdma->current_chunk;
2400 int ret;
2401
2402
2403 if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
2404 ret = qemu_rdma_write_flush(f, rdma);
2405 if (ret) {
2406 return ret;
2407 }
2408 rdma->current_length = 0;
2409 rdma->current_addr = current_addr;
2410
2411 ret = qemu_rdma_search_ram_block(rdma, block_offset,
2412 offset, len, &index, &chunk);
2413 if (ret) {
2414 error_report("ram block search failed");
2415 return ret;
2416 }
2417 rdma->current_index = index;
2418 rdma->current_chunk = chunk;
2419 }
2420
2421
2422 rdma->current_length += len;
2423
2424
2425 if (rdma->current_length >= RDMA_MERGE_MAX) {
2426 return qemu_rdma_write_flush(f, rdma);
2427 }
2428
2429 return 0;
2430}
2431
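/*
 * Tear down the connection and release everything that was allocated:
 * memory regions, QP, CQs, completion channels, PD, CM IDs and the event
 * channel.
 */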
2432static void qemu_rdma_cleanup(RDMAContext *rdma)
2433{
2434 int idx;
2435
2436 if (rdma->cm_id && rdma->connected) {
2437 if ((rdma->error_state ||
2438 migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
2439 !rdma->received_error) {
2440 RDMAControlHeader head = { .len = 0,
2441 .type = RDMA_CONTROL_ERROR,
2442 .repeat = 1,
2443 };
2444 error_report("Early error. Sending error.");
2445 qemu_rdma_post_send_control(rdma, NULL, &head);
2446 }
2447
2448 rdma_disconnect(rdma->cm_id);
2449 trace_qemu_rdma_cleanup_disconnect();
2450 rdma->connected = false;
2451 }
2452
2453 if (rdma->channel) {
2454 qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2455 }
2456 g_free(rdma->dest_blocks);
2457 rdma->dest_blocks = NULL;
2458
2459 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2460 if (rdma->wr_data[idx].control_mr) {
2461 rdma->total_registrations--;
2462 ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2463 }
2464 rdma->wr_data[idx].control_mr = NULL;
2465 }
2466
2467 if (rdma->local_ram_blocks.block) {
2468 while (rdma->local_ram_blocks.nb_blocks) {
2469 rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2470 }
2471 }
2472
2473 if (rdma->qp) {
2474 rdma_destroy_qp(rdma->cm_id);
2475 rdma->qp = NULL;
2476 }
2477 if (rdma->recv_cq) {
2478 ibv_destroy_cq(rdma->recv_cq);
2479 rdma->recv_cq = NULL;
2480 }
2481 if (rdma->send_cq) {
2482 ibv_destroy_cq(rdma->send_cq);
2483 rdma->send_cq = NULL;
2484 }
2485 if (rdma->recv_comp_channel) {
2486 ibv_destroy_comp_channel(rdma->recv_comp_channel);
2487 rdma->recv_comp_channel = NULL;
2488 }
2489 if (rdma->send_comp_channel) {
2490 ibv_destroy_comp_channel(rdma->send_comp_channel);
2491 rdma->send_comp_channel = NULL;
2492 }
2493 if (rdma->pd) {
2494 ibv_dealloc_pd(rdma->pd);
2495 rdma->pd = NULL;
2496 }
2497 if (rdma->cm_id) {
2498 rdma_destroy_id(rdma->cm_id);
2499 rdma->cm_id = NULL;
2500 }
2501
2502
2503 if (rdma->listen_id) {
2504 if (!rdma->is_return_path) {
2505 rdma_destroy_id(rdma->listen_id);
2506 }
2507 rdma->listen_id = NULL;
2508
2509 if (rdma->channel) {
2510 if (!rdma->is_return_path) {
2511 rdma_destroy_event_channel(rdma->channel);
2512 }
2513 rdma->channel = NULL;
2514 }
2515 }
2516
2517 if (rdma->channel) {
2518 rdma_destroy_event_channel(rdma->channel);
2519 rdma->channel = NULL;
2520 }
2521 g_free(rdma->host);
2522 g_free(rdma->host_port);
2523 rdma->host = NULL;
2524 rdma->host_port = NULL;
2525}
2526
2527
2528static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2529{
2530 int ret, idx;
2531 Error *local_err = NULL, **temp = &local_err;
2532
2533
2534
2535
2536
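/*
 * pin_all is only a request at this point; whether it is actually
 * used is re-negotiated with the destination in qemu_rdma_connect().
 */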
2537 rdma->pin_all = pin_all;
2538
2539 ret = qemu_rdma_resolve_host(rdma, temp);
2540 if (ret) {
2541 goto err_rdma_source_init;
2542 }
2543
2544 ret = qemu_rdma_alloc_pd_cq(rdma);
2545 if (ret) {
2546 ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
2547 " limits may be too low. Please check $ ulimit -a # and "
2548 "search for 'ulimit -l' in the output");
2549 goto err_rdma_source_init;
2550 }
2551
2552 ret = qemu_rdma_alloc_qp(rdma);
2553 if (ret) {
2554 ERROR(temp, "rdma migration: error allocating qp!");
2555 goto err_rdma_source_init;
2556 }
2557
2558 ret = qemu_rdma_init_ram_blocks(rdma);
2559 if (ret) {
2560 ERROR(temp, "rdma migration: error initializing ram blocks!");
2561 goto err_rdma_source_init;
2562 }
2563
2564
2565 rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2566 for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
2567 g_hash_table_insert(rdma->blockmap,
2568 (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
2569 &rdma->local_ram_blocks.block[idx]);
2570 }
2571
2572 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2573 ret = qemu_rdma_reg_control(rdma, idx);
2574 if (ret) {
2575 ERROR(temp, "rdma migration: error registering %d control!",
2576 idx);
2577 goto err_rdma_source_init;
2578 }
2579 }
2580
2581 return 0;
2582
2583err_rdma_source_init:
2584 error_propagate(errp, local_err);
2585 qemu_rdma_cleanup(rdma);
2586 return -1;
2587}
2588
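/*
 * Wait up to 'msec' milliseconds for a CM event on the channel fd, so
 * that a dead or unreachable peer cannot block us forever.
 */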
2589static int qemu_get_cm_event_timeout(RDMAContext *rdma,
2590 struct rdma_cm_event **cm_event,
2591 long msec, Error **errp)
2592{
2593 int ret;
2594 struct pollfd poll_fd = {
2595 .fd = rdma->channel->fd,
2596 .events = POLLIN,
2597 .revents = 0
2598 };
2599
2600 do {
2601 ret = poll(&poll_fd, 1, msec);
2602 } while (ret < 0 && errno == EINTR);
2603
2604 if (ret == 0) {
2605 ERROR(errp, "poll cm event timeout");
2606 return -1;
2607 } else if (ret < 0) {
2608 ERROR(errp, "failed to poll cm event, errno=%i", errno);
2609 return -1;
2610 } else if (poll_fd.revents & POLLIN) {
2611 return rdma_get_cm_event(rdma->channel, cm_event);
2612 } else {
2613 ERROR(errp, "no POLLIN event, revents=%x", poll_fd.revents);
2614 return -1;
2615 }
2616}
2617
2618static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path)
2619{
2620 RDMACapabilities cap = {
2621 .version = RDMA_CONTROL_VERSION_CURRENT,
2622 .flags = 0,
2623 };
2624 struct rdma_conn_param conn_param = { .initiator_depth = 2,
2625 .retry_count = 5,
2626 .private_data = &cap,
2627 .private_data_len = sizeof(cap),
2628 };
2629 struct rdma_cm_event *cm_event;
2630 int ret;
2631
2632
2633
2634
2635
2636 if (rdma->pin_all) {
2637 trace_qemu_rdma_connect_pin_all_requested();
2638 cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2639 }
2640
2641 caps_to_network(&cap);
2642
2643 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2644 if (ret) {
2645 ERROR(errp, "posting second control recv");
2646 goto err_rdma_source_connect;
2647 }
2648
2649 ret = rdma_connect(rdma->cm_id, &conn_param);
2650 if (ret) {
2651 perror("rdma_connect");
2652 ERROR(errp, "connecting to destination!");
2653 goto err_rdma_source_connect;
2654 }
2655
2656 if (return_path) {
2657 ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
2658 } else {
2659 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2660 }
2661 if (ret) {
2662 perror("rdma_get_cm_event after rdma_connect");
2663 ERROR(errp, "connecting to destination!");
2664 goto err_rdma_source_connect;
2665 }
2666
2667 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2668 error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2669 ERROR(errp, "connecting to destination!");
2670 rdma_ack_cm_event(cm_event);
2671 goto err_rdma_source_connect;
2672 }
2673 rdma->connected = true;
2674
2675 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2676 network_to_caps(&cap);
2677
2678
2679
2680
2681
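/*
 * Verify that the destination accepted the pin-all capability;
 * otherwise fall back to dynamic chunk registration.
 */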
2682 if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2683 ERROR(errp, "Server cannot support pinning all memory. "
2684 "Will register memory dynamically.");
2685 rdma->pin_all = false;
2686 }
2687
2688 trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2689
2690 rdma_ack_cm_event(cm_event);
2691
2692 rdma->control_ready_expected = 1;
2693 rdma->nb_sent = 0;
2694 return 0;
2695
2696err_rdma_source_connect:
2697 qemu_rdma_cleanup(rdma);
2698 return -1;
2699}
2700
2701static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2702{
2703 int ret, idx;
2704 struct rdma_cm_id *listen_id;
2705 char ip[40] = "unknown";
2706 struct rdma_addrinfo *res, *e;
2707 char port_str[16];
2708
2709 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2710 rdma->wr_data[idx].control_len = 0;
2711 rdma->wr_data[idx].control_curr = NULL;
2712 }
2713
2714 if (!rdma->host || !rdma->host[0]) {
2715 ERROR(errp, "RDMA host is not set!");
2716 rdma->error_state = -EINVAL;
2717 return -1;
2718 }
2719
2720 rdma->channel = rdma_create_event_channel();
2721 if (!rdma->channel) {
2722 ERROR(errp, "could not create rdma event channel");
2723 rdma->error_state = -EINVAL;
2724 return -1;
2725 }
2726
2727
2728 ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2729 if (ret) {
2730 ERROR(errp, "could not create cm_id!");
2731 goto err_dest_init_create_listen_id;
2732 }
2733
2734 snprintf(port_str, 16, "%d", rdma->port);
2735 port_str[15] = '\0';
2736
2737 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2738 if (ret < 0) {
2739 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
2740 goto err_dest_init_bind_addr;
2741 }
2742
2743 for (e = res; e != NULL; e = e->ai_next) {
2744 inet_ntop(e->ai_family,
2745 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2746 trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2747 ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2748 if (ret) {
2749 continue;
2750 }
2751 if (e->ai_family == AF_INET6) {
2752 ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
2753 if (ret) {
2754 continue;
2755 }
2756 }
2757 break;
2758 }
2759
2760 rdma_freeaddrinfo(res);
2761 if (!e) {
2762 ERROR(errp, "could not rdma_bind_addr!");
2763 goto err_dest_init_bind_addr;
2764 }
2765
2766 rdma->listen_id = listen_id;
2767 qemu_rdma_dump_gid("dest_init", listen_id);
2768 return 0;
2769
2770err_dest_init_bind_addr:
2771 rdma_destroy_id(listen_id);
2772err_dest_init_create_listen_id:
2773 rdma_destroy_event_channel(rdma->channel);
2774 rdma->channel = NULL;
2775 rdma->error_state = ret;
2776 return ret;
2777
2778}
2779
2780static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
2781 RDMAContext *rdma)
2782{
2783 int idx;
2784
2785 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2786 rdma_return_path->wr_data[idx].control_len = 0;
2787 rdma_return_path->wr_data[idx].control_curr = NULL;
2788 }
2789
2790
2791 rdma_return_path->channel = rdma->channel;
2792 rdma_return_path->listen_id = rdma->listen_id;
2793
2794 rdma->return_path = rdma_return_path;
2795 rdma_return_path->return_path = rdma;
2796 rdma_return_path->is_return_path = true;
2797}
2798
2799static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2800{
2801 RDMAContext *rdma = NULL;
2802 InetSocketAddress *addr;
2803
2804 if (host_port) {
2805 rdma = g_new0(RDMAContext, 1);
2806 rdma->current_index = -1;
2807 rdma->current_chunk = -1;
2808
2809 addr = g_new(InetSocketAddress, 1);
2810 if (!inet_parse(addr, host_port, NULL)) {
2811 rdma->port = atoi(addr->port);
2812 rdma->host = g_strdup(addr->host);
2813 rdma->host_port = g_strdup(host_port);
2814 } else {
2815 ERROR(errp, "bad RDMA migration address '%s'", host_port);
2816 g_free(rdma);
2817 rdma = NULL;
2818 }
2819
2820 qapi_free_InetSocketAddress(addr);
2821 }
2822
2823 return rdma;
2824}
2825
2826
2827
2828
2829
2830
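/*
 * QEMUFile write interface for the control channel.  The byte stream
 * is split into RDMA_SEND_INCREMENT sized pieces and each piece is
 * sent as an RDMA_CONTROL_QEMU_FILE message.  Guest RAM is *not* sent
 * this way; it goes through qemu_rdma_write()/qemu_rdma_save_page().
 */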
2831static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
2832 const struct iovec *iov,
2833 size_t niov,
2834 int *fds,
2835 size_t nfds,
2836 Error **errp)
2837{
2838 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2839 QEMUFile *f = rioc->file;
2840 RDMAContext *rdma;
2841 int ret;
2842 ssize_t done = 0;
2843 size_t i;
2844 size_t len = 0;
2845
2846 RCU_READ_LOCK_GUARD();
2847 rdma = qatomic_rcu_read(&rioc->rdmaout);
2848
2849 if (!rdma) {
2850 return -EIO;
2851 }
2852
2853 CHECK_ERROR_STATE();
2854
2855
2856
2857
2858
2859 ret = qemu_rdma_write_flush(f, rdma);
2860 if (ret < 0) {
2861 rdma->error_state = ret;
2862 return ret;
2863 }
2864
2865 for (i = 0; i < niov; i++) {
2866 size_t remaining = iov[i].iov_len;
2867 uint8_t *data = (void *)iov[i].iov_base;
2868 while (remaining) {
2869 RDMAControlHeader head;
2870
2871 len = MIN(remaining, RDMA_SEND_INCREMENT);
2872 remaining -= len;
2873
2874 head.len = len;
2875 head.type = RDMA_CONTROL_QEMU_FILE;
2876
2877 ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2878
2879 if (ret < 0) {
2880 rdma->error_state = ret;
2881 return ret;
2882 }
2883
2884 data += len;
2885 done += len;
2886 }
2887 }
2888
2889 return done;
2890}
2891
2892static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2893 size_t size, int idx)
2894{
2895 size_t len = 0;
2896
2897 if (rdma->wr_data[idx].control_len) {
2898 trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2899
2900 len = MIN(size, rdma->wr_data[idx].control_len);
2901 memcpy(buf, rdma->wr_data[idx].control_curr, len);
2902 rdma->wr_data[idx].control_curr += len;
2903 rdma->wr_data[idx].control_len -= len;
2904 }
2905
2906 return len;
2907}
2908
2909
2910
2911
2912
2913
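/*
 * QEMUFile read interface for the control channel.  Bytes left over
 * from the last RDMA_CONTROL_QEMU_FILE message are consumed first; we
 * only block for a new message if nothing has been copied yet.
 */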
2914static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
2915 const struct iovec *iov,
2916 size_t niov,
2917 int **fds,
2918 size_t *nfds,
2919 Error **errp)
2920{
2921 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2922 RDMAContext *rdma;
2923 RDMAControlHeader head;
2924 int ret = 0;
2925 ssize_t i;
2926 size_t done = 0;
2927
2928 RCU_READ_LOCK_GUARD();
2929 rdma = qatomic_rcu_read(&rioc->rdmain);
2930
2931 if (!rdma) {
2932 return -EIO;
2933 }
2934
2935 CHECK_ERROR_STATE();
2936
2937 for (i = 0; i < niov; i++) {
2938 size_t want = iov[i].iov_len;
2939 uint8_t *data = (void *)iov[i].iov_base;
2940
2941
2942
2943
2944
2945
2946 ret = qemu_rdma_fill(rdma, data, want, 0);
2947 done += ret;
2948 want -= ret;
2949
2950 if (want == 0) {
2951 continue;
2952 }
2953
2954
2955
2956 if (done > 0) {
2957 break;
2958 }
2959
2960
2961
2962
2963
2964 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2965
2966 if (ret < 0) {
2967 rdma->error_state = ret;
2968 return ret;
2969 }
2970
2971
2972
2973
2974 ret = qemu_rdma_fill(rdma, data, want, 0);
2975 done += ret;
2976 want -= ret;
2977
2978
2979 if (want) {
2980 if (done == 0) {
2981 return QIO_CHANNEL_ERR_BLOCK;
2982 } else {
2983 break;
2984 }
2985 }
2986 }
2987 return done;
2988}
2989
2990
2991
2992
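/*
 * Block until every outstanding RDMA write has completed, then issue
 * any pending unregistration requests.
 */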
2993static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2994{
2995 int ret;
2996
2997 if (qemu_rdma_write_flush(f, rdma) < 0) {
2998 return -EIO;
2999 }
3000
3001 while (rdma->nb_sent) {
3002 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
3003 if (ret < 0) {
3004 error_report("rdma migration: complete polling error!");
3005 return -EIO;
3006 }
3007 }
3008
3009 qemu_rdma_unregister_waiting(rdma);
3010
3011 return 0;
3012}
3013
3014
3015static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
3016 bool blocking,
3017 Error **errp)
3018{
3019 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3020
3021 rioc->blocking = blocking;
3022 return 0;
3023}
3024
3025
3026typedef struct QIOChannelRDMASource QIOChannelRDMASource;
3027struct QIOChannelRDMASource {
3028 GSource parent;
3029 QIOChannelRDMA *rioc;
3030 GIOCondition condition;
3031};
3032
3033static gboolean
3034qio_channel_rdma_source_prepare(GSource *source,
3035 gint *timeout)
3036{
3037 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3038 RDMAContext *rdma;
3039 GIOCondition cond = 0;
3040 *timeout = -1;
3041
3042 RCU_READ_LOCK_GUARD();
3043 if (rsource->condition == G_IO_IN) {
3044 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3045 } else {
3046 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3047 }
3048
3049 if (!rdma) {
3050 error_report("RDMAContext is NULL when preparing GSource");
3051 return FALSE;
3052 }
3053
3054 if (rdma->wr_data[0].control_len) {
3055 cond |= G_IO_IN;
3056 }
3057 cond |= G_IO_OUT;
3058
3059 return cond & rsource->condition;
3060}
3061
3062static gboolean
3063qio_channel_rdma_source_check(GSource *source)
3064{
3065 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3066 RDMAContext *rdma;
3067 GIOCondition cond = 0;
3068
3069 RCU_READ_LOCK_GUARD();
3070 if (rsource->condition == G_IO_IN) {
3071 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3072 } else {
3073 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3074 }
3075
3076 if (!rdma) {
3077 error_report("RDMAContext is NULL when checking GSource");
3078 return FALSE;
3079 }
3080
3081 if (rdma->wr_data[0].control_len) {
3082 cond |= G_IO_IN;
3083 }
3084 cond |= G_IO_OUT;
3085
3086 return cond & rsource->condition;
3087}
3088
3089static gboolean
3090qio_channel_rdma_source_dispatch(GSource *source,
3091 GSourceFunc callback,
3092 gpointer user_data)
3093{
3094 QIOChannelFunc func = (QIOChannelFunc)callback;
3095 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3096 RDMAContext *rdma;
3097 GIOCondition cond = 0;
3098
3099 RCU_READ_LOCK_GUARD();
3100 if (rsource->condition == G_IO_IN) {
3101 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3102 } else {
3103 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3104 }
3105
3106 if (!rdma) {
3107 error_report("RDMAContext is NULL when dispatching GSource");
3108 return FALSE;
3109 }
3110
3111 if (rdma->wr_data[0].control_len) {
3112 cond |= G_IO_IN;
3113 }
3114 cond |= G_IO_OUT;
3115
3116 return (*func)(QIO_CHANNEL(rsource->rioc),
3117 (cond & rsource->condition),
3118 user_data);
3119}
3120
3121static void
3122qio_channel_rdma_source_finalize(GSource *source)
3123{
3124 QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
3125
3126 object_unref(OBJECT(ssource->rioc));
3127}
3128
3129GSourceFuncs qio_channel_rdma_source_funcs = {
3130 qio_channel_rdma_source_prepare,
3131 qio_channel_rdma_source_check,
3132 qio_channel_rdma_source_dispatch,
3133 qio_channel_rdma_source_finalize
3134};
3135
3136static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
3137 GIOCondition condition)
3138{
3139 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3140 QIOChannelRDMASource *ssource;
3141 GSource *source;
3142
3143 source = g_source_new(&qio_channel_rdma_source_funcs,
3144 sizeof(QIOChannelRDMASource));
3145 ssource = (QIOChannelRDMASource *)source;
3146
3147 ssource->rioc = rioc;
3148 object_ref(OBJECT(rioc));
3149
3150 ssource->condition = condition;
3151
3152 return source;
3153}
3154
3155static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
3156 AioContext *ctx,
3157 IOHandler *io_read,
3158 IOHandler *io_write,
3159 void *opaque)
3160{
3161 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3162 if (io_read) {
3163 aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd,
3164 false, io_read, io_write, NULL, opaque);
3165 aio_set_fd_handler(ctx, rioc->rdmain->send_comp_channel->fd,
3166 false, io_read, io_write, NULL, opaque);
3167 } else {
3168 aio_set_fd_handler(ctx, rioc->rdmaout->recv_comp_channel->fd,
3169 false, io_read, io_write, NULL, opaque);
3170 aio_set_fd_handler(ctx, rioc->rdmaout->send_comp_channel->fd,
3171 false, io_read, io_write, NULL, opaque);
3172 }
3173}
3174
3175struct rdma_close_rcu {
3176 struct rcu_head rcu;
3177 RDMAContext *rdmain;
3178 RDMAContext *rdmaout;
3179};
3180
3181
3182static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3183{
3184 if (rcu->rdmain) {
3185 qemu_rdma_cleanup(rcu->rdmain);
3186 }
3187
3188 if (rcu->rdmaout) {
3189 qemu_rdma_cleanup(rcu->rdmaout);
3190 }
3191
3192 g_free(rcu->rdmain);
3193 g_free(rcu->rdmaout);
3194 g_free(rcu);
3195}
3196
3197static int qio_channel_rdma_close(QIOChannel *ioc,
3198 Error **errp)
3199{
3200 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3201 RDMAContext *rdmain, *rdmaout;
3202 struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3203
3204 trace_qemu_rdma_close();
3205
3206 rdmain = rioc->rdmain;
3207 if (rdmain) {
3208 qatomic_rcu_set(&rioc->rdmain, NULL);
3209 }
3210
3211 rdmaout = rioc->rdmaout;
3212 if (rdmaout) {
3213 qatomic_rcu_set(&rioc->rdmaout, NULL);
3214 }
3215
3216 rcu->rdmain = rdmain;
3217 rcu->rdmaout = rdmaout;
3218 call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
3219
3220 return 0;
3221}
3222
3223static int
3224qio_channel_rdma_shutdown(QIOChannel *ioc,
3225 QIOChannelShutdown how,
3226 Error **errp)
3227{
3228 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3229 RDMAContext *rdmain, *rdmaout;
3230
3231 RCU_READ_LOCK_GUARD();
3232
3233 rdmain = qatomic_rcu_read(&rioc->rdmain);
3234 rdmaout = qatomic_rcu_read(&rioc->rdmaout);
3235
3236 switch (how) {
3237 case QIO_CHANNEL_SHUTDOWN_READ:
3238 if (rdmain) {
3239 rdmain->error_state = -1;
3240 }
3241 break;
3242 case QIO_CHANNEL_SHUTDOWN_WRITE:
3243 if (rdmaout) {
3244 rdmaout->error_state = -1;
3245 }
3246 break;
3247 case QIO_CHANNEL_SHUTDOWN_BOTH:
3248 default:
3249 if (rdmain) {
3250 rdmain->error_state = -1;
3251 }
3252 if (rdmaout) {
3253 rdmaout->error_state = -1;
3254 }
3255 break;
3256 }
3257
3258 return 0;
3259}
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
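/*
 * save_page hook for RAM migration.
 *
 * For size > 0 the page is queued (and possibly merged) as an
 * asynchronous RDMA write and RAM_SAVE_CONTROL_DELAYED is returned.
 * For size == 0 the chunk covering the page is scheduled for
 * unregistration on the destination.  In both cases the completion
 * queues are polled opportunistically, without blocking, so that
 * completions keep flowing while more pages are produced.
 */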
3295static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
3296 ram_addr_t block_offset, ram_addr_t offset,
3297 size_t size, uint64_t *bytes_sent)
3298{
3299 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3300 RDMAContext *rdma;
3301 int ret;
3302
3303 RCU_READ_LOCK_GUARD();
3304 rdma = qatomic_rcu_read(&rioc->rdmaout);
3305
3306 if (!rdma) {
3307 return -EIO;
3308 }
3309
3310 CHECK_ERROR_STATE();
3311
3312 if (migration_in_postcopy()) {
3313 return RAM_SAVE_CONTROL_NOT_SUPP;
3314 }
3315
3316 qemu_fflush(f);
3317
3318 if (size > 0) {
3319
3320
3321
3322
3323
3324 ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
3325 if (ret < 0) {
3326 error_report("rdma migration: write error! %d", ret);
3327 goto err;
3328 }
3329
3330
3331
3332
3333
3334
3335
3336
3337
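/*
 * The RDMA transfer is asynchronous, so report a token value of one
 * byte here; the real byte accounting happens when the deferred write
 * is actually carried out.
 */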
3338 if (bytes_sent) {
3339 *bytes_sent = 1;
3340 }
3341 } else {
3342 uint64_t index, chunk;
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355 ret = qemu_rdma_search_ram_block(rdma, block_offset,
3356 offset, size, &index, &chunk);
3357
3358 if (ret) {
3359 error_report("ram block search failed");
3360 goto err;
3361 }
3362
3363 qemu_rdma_signal_unregister(rdma, index, chunk, 0);
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373 }
3374
3375
3376
3377
3378
3379
3380
3381
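/*
 * Drain both completion queues if possible, but do not block: just
 * poll until nothing more is ready.
 */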
3382 while (1) {
3383 uint64_t wr_id, wr_id_in;
3384 int ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
3385 if (ret < 0) {
3386 error_report("rdma migration: polling error! %d", ret);
3387 goto err;
3388 }
3389
3390 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3391
3392 if (wr_id == RDMA_WRID_NONE) {
3393 break;
3394 }
3395 }
3396
3397 while (1) {
3398 uint64_t wr_id, wr_id_in;
3399 int ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
3400 if (ret < 0) {
3401 error_report("rdma migration: polling error! %d", ret);
3402 goto err;
3403 }
3404
3405 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3406
3407 if (wr_id == RDMA_WRID_NONE) {
3408 break;
3409 }
3410 }
3411
3412 return RAM_SAVE_CONTROL_DELAYED;
3413err:
3414 rdma->error_state = ret;
3415 return ret;
3416}
3417
3418static void rdma_accept_incoming_migration(void *opaque);
3419
3420static void rdma_cm_poll_handler(void *opaque)
3421{
3422 RDMAContext *rdma = opaque;
3423 int ret;
3424 struct rdma_cm_event *cm_event;
3425 MigrationIncomingState *mis = migration_incoming_get_current();
3426
3427 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3428 if (ret) {
3429 error_report("get_cm_event failed %d", errno);
3430 return;
3431 }
3432
3433 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
3434 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3435 if (!rdma->error_state &&
3436 migration_incoming_get_current()->state !=
3437 MIGRATION_STATUS_COMPLETED) {
3438 error_report("received cm event %d, setting migration error state", cm_event->event);
3439 rdma->error_state = -EPIPE;
3440 if (rdma->return_path) {
3441 rdma->return_path->error_state = -EPIPE;
3442 }
3443 }
3444 rdma_ack_cm_event(cm_event);
3445
3446 if (mis->migration_incoming_co) {
3447 qemu_coroutine_enter(mis->migration_incoming_co);
3448 }
3449 return;
3450 }
3451 rdma_ack_cm_event(cm_event);
3452}
3453
3454static int qemu_rdma_accept(RDMAContext *rdma)
3455{
3456 RDMACapabilities cap;
3457 struct rdma_conn_param conn_param = {
3458 .responder_resources = 2,
3459 .private_data = &cap,
3460 .private_data_len = sizeof(cap),
3461 };
3462 RDMAContext *rdma_return_path = NULL;
3463 struct rdma_cm_event *cm_event;
3464 struct ibv_context *verbs;
3465 int ret = -EINVAL;
3466 int idx;
3467
3468 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3469 if (ret) {
3470 goto err_rdma_dest_wait;
3471 }
3472
3473 if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3474 rdma_ack_cm_event(cm_event);
3475 goto err_rdma_dest_wait;
3476 }
3477
3478
3479
3480
3481
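/*
 * For postcopy we need a second connection for the return path, so
 * prepare its RDMAContext as soon as the first connect request is
 * seen on the primary channel.
 */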
3482 if (migrate_postcopy() && !rdma->is_return_path) {
3483 rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
3484 if (rdma_return_path == NULL) {
3485 rdma_ack_cm_event(cm_event);
3486 goto err_rdma_dest_wait;
3487 }
3488
3489 qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
3490 }
3491
3492 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3493
3494 network_to_caps(&cap);
3495
3496 if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3497 error_report("Unknown source RDMA version: %d, bailing...",
3498 cap.version);
3499 rdma_ack_cm_event(cm_event);
3500 goto err_rdma_dest_wait;
3501 }
3502
3503
3504
3505
3506 cap.flags &= known_capabilities;
3507
3508
3509
3510
3511
3512 if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3513 rdma->pin_all = true;
3514 }
3515
3516 rdma->cm_id = cm_event->id;
3517 verbs = cm_event->id->verbs;
3518
3519 rdma_ack_cm_event(cm_event);
3520
3521 trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3522
3523 caps_to_network(&cap);
3524
3525 trace_qemu_rdma_accept_pin_verbsc(verbs);
3526
3527 if (!rdma->verbs) {
3528 rdma->verbs = verbs;
3529 } else if (rdma->verbs != verbs) {
3530 error_report("ibv context not matching %p, %p!", rdma->verbs,
3531 verbs);
3532 goto err_rdma_dest_wait;
3533 }
3534
3535 qemu_rdma_dump_id("dest_init", verbs);
3536
3537 ret = qemu_rdma_alloc_pd_cq(rdma);
3538 if (ret) {
3539 error_report("rdma migration: error allocating pd and cq!");
3540 goto err_rdma_dest_wait;
3541 }
3542
3543 ret = qemu_rdma_alloc_qp(rdma);
3544 if (ret) {
3545 error_report("rdma migration: error allocating qp!");
3546 goto err_rdma_dest_wait;
3547 }
3548
3549 ret = qemu_rdma_init_ram_blocks(rdma);
3550 if (ret) {
3551 error_report("rdma migration: error initializing ram blocks!");
3552 goto err_rdma_dest_wait;
3553 }
3554
3555 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
3556 ret = qemu_rdma_reg_control(rdma, idx);
3557 if (ret) {
3558 error_report("rdma: error registering %d control", idx);
3559 goto err_rdma_dest_wait;
3560 }
3561 }
3562
3563
3564 if (migrate_postcopy() && !rdma->is_return_path) {
3565 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
3566 NULL,
3567 (void *)(intptr_t)rdma->return_path);
3568 } else {
3569 qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
3570 NULL, rdma);
3571 }
3572
3573 ret = rdma_accept(rdma->cm_id, &conn_param);
3574 if (ret) {
3575 error_report("rdma_accept returns %d", ret);
3576 goto err_rdma_dest_wait;
3577 }
3578
3579 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3580 if (ret) {
3581 error_report("rdma_accept get_cm_event failed %d", ret);
3582 goto err_rdma_dest_wait;
3583 }
3584
3585 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3586 error_report("rdma_accept: connection not established");
3587 rdma_ack_cm_event(cm_event);
3588 goto err_rdma_dest_wait;
3589 }
3590
3591 rdma_ack_cm_event(cm_event);
3592 rdma->connected = true;
3593
3594 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
3595 if (ret) {
3596 error_report("rdma migration: error posting second control recv");
3597 goto err_rdma_dest_wait;
3598 }
3599
3600 qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3601
3602 return 0;
3603
3604err_rdma_dest_wait:
3605 rdma->error_state = ret;
3606 qemu_rdma_cleanup(rdma);
3607 g_free(rdma_return_path);
3608 return ret;
3609}
3610
3611static int dest_ram_sort_func(const void *a, const void *b)
3612{
3613 unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3614 unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3615
3616 return (a_index < b_index) ? -1 : (a_index != b_index);
3617}
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
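/*
 * Destination-side handler for the dynamic page registration protocol.
 * Processes COMPRESS, RAM_BLOCKS_REQUEST, REGISTER_REQUEST and
 * UNREGISTER_REQUEST control messages in a loop until the source sends
 * REGISTER_FINISHED.
 */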
3628static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
3629{
3630 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3631 .type = RDMA_CONTROL_REGISTER_RESULT,
3632 .repeat = 0,
3633 };
3634 RDMAControlHeader unreg_resp = { .len = 0,
3635 .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3636 .repeat = 0,
3637 };
3638 RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3639 .repeat = 1 };
3640 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3641 RDMAContext *rdma;
3642 RDMALocalBlocks *local;
3643 RDMAControlHeader head;
3644 RDMARegister *reg, *registers;
3645 RDMACompress *comp;
3646 RDMARegisterResult *reg_result;
3647 static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3648 RDMALocalBlock *block;
3649 void *host_addr;
3650 int ret = 0;
3651 int idx = 0;
3652 int count = 0;
3653 int i = 0;
3654
3655 RCU_READ_LOCK_GUARD();
3656 rdma = qatomic_rcu_read(&rioc->rdmain);
3657
3658 if (!rdma) {
3659 return -EIO;
3660 }
3661
3662 CHECK_ERROR_STATE();
3663
3664 local = &rdma->local_ram_blocks;
3665 do {
3666 trace_qemu_rdma_registration_handle_wait();
3667
3668 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
3669
3670 if (ret < 0) {
3671 break;
3672 }
3673
3674 if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3675 error_report("rdma: Too many requests in this message (%d). "
3676 "Bailing.", head.repeat);
3677 ret = -EIO;
3678 break;
3679 }
3680
3681 switch (head.type) {
3682 case RDMA_CONTROL_COMPRESS:
3683 comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3684 network_to_compress(comp);
3685
3686 trace_qemu_rdma_registration_handle_compress(comp->length,
3687 comp->block_idx,
3688 comp->offset);
3689 if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3690 error_report("rdma: 'compress' bad block index %u (vs %d)",
3691 (unsigned int)comp->block_idx,
3692 rdma->local_ram_blocks.nb_blocks);
3693 ret = -EIO;
3694 goto out;
3695 }
3696 block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3697
3698 host_addr = block->local_host_addr +
3699 (comp->offset - block->offset);
3700
3701 ram_handle_compressed(host_addr, comp->value, comp->length);
3702 break;
3703
3704 case RDMA_CONTROL_REGISTER_FINISHED:
3705 trace_qemu_rdma_registration_handle_finished();
3706 goto out;
3707
3708 case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3709 trace_qemu_rdma_registration_handle_ram_blocks();
3710
3711
3712
3713
3714
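/*
 * Sort our local RAM blocks into the same order the source announced
 * them (by src_index) so that block indexes match on both sides.
 */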
3715 qsort(rdma->local_ram_blocks.block,
3716 rdma->local_ram_blocks.nb_blocks,
3717 sizeof(RDMALocalBlock), dest_ram_sort_func);
3718 for (i = 0; i < local->nb_blocks; i++) {
3719 local->block[i].index = i;
3720 }
3721
3722 if (rdma->pin_all) {
3723 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
3724 if (ret) {
3725 error_report("rdma migration: error dest "
3726 "registering ram blocks");
3727 goto out;
3728 }
3729 }
3730
3731
3732
3733
3734
3735
3736
3737 for (i = 0; i < local->nb_blocks; i++) {
3738 rdma->dest_blocks[i].remote_host_addr =
3739 (uintptr_t)(local->block[i].local_host_addr);
3740
3741 if (rdma->pin_all) {
3742 rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3743 }
3744
3745 rdma->dest_blocks[i].offset = local->block[i].offset;
3746 rdma->dest_blocks[i].length = local->block[i].length;
3747
3748 dest_block_to_network(&rdma->dest_blocks[i]);
3749 trace_qemu_rdma_registration_handle_ram_blocks_loop(
3750 local->block[i].block_name,
3751 local->block[i].offset,
3752 local->block[i].length,
3753 local->block[i].local_host_addr,
3754 local->block[i].src_index);
3755 }
3756
3757 blocks.len = rdma->local_ram_blocks.nb_blocks
3758 * sizeof(RDMADestBlock);
3759
3760
3761 ret = qemu_rdma_post_send_control(rdma,
3762 (uint8_t *) rdma->dest_blocks, &blocks);
3763
3764 if (ret < 0) {
3765 error_report("rdma migration: error sending remote info");
3766 goto out;
3767 }
3768
3769 break;
3770 case RDMA_CONTROL_REGISTER_REQUEST:
3771 trace_qemu_rdma_registration_handle_register(head.repeat);
3772
3773 reg_resp.repeat = head.repeat;
3774 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3775
3776 for (count = 0; count < head.repeat; count++) {
3777 uint64_t chunk;
3778 uint8_t *chunk_start, *chunk_end;
3779
3780 reg = &registers[count];
3781 network_to_register(reg);
3782
3783 reg_result = &results[count];
3784
3785 trace_qemu_rdma_registration_handle_register_loop(count,
3786 reg->current_index, reg->key.current_addr, reg->chunks);
3787
3788 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3789 error_report("rdma: 'register' bad block index %u (vs %d)",
3790 (unsigned int)reg->current_index,
3791 rdma->local_ram_blocks.nb_blocks);
3792 ret = -ENOENT;
3793 goto out;
3794 }
3795 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3796 if (block->is_ram_block) {
3797 if (block->offset > reg->key.current_addr) {
3798 error_report("rdma: bad register address for block %s"
3799 " offset: %" PRIx64 " current_addr: %" PRIx64,
3800 block->block_name, block->offset,
3801 reg->key.current_addr);
3802 ret = -ERANGE;
3803 goto out;
3804 }
3805 host_addr = (block->local_host_addr +
3806 (reg->key.current_addr - block->offset));
3807 chunk = ram_chunk_index(block->local_host_addr,
3808 (uint8_t *) host_addr);
3809 } else {
3810 chunk = reg->key.chunk;
3811 host_addr = block->local_host_addr +
3812 (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3813
3814 if (host_addr < (void *)block->local_host_addr) {
3815 error_report("rdma: bad chunk for block %s"
3816 " chunk: %" PRIx64,
3817 block->block_name, reg->key.chunk);
3818 ret = -ERANGE;
3819 goto out;
3820 }
3821 }
3822 chunk_start = ram_chunk_start(block, chunk);
3823 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3824
3825 uint32_t tmp_rkey = 0;
3826 if (qemu_rdma_register_and_get_keys(rdma, block,
3827 (uintptr_t)host_addr, NULL, &tmp_rkey,
3828 chunk, chunk_start, chunk_end)) {
3829 error_report("cannot get rkey");
3830 ret = -EINVAL;
3831 goto out;
3832 }
3833 reg_result->rkey = tmp_rkey;
3834
3835 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3836
3837 trace_qemu_rdma_registration_handle_register_rkey(
3838 reg_result->rkey);
3839
3840 result_to_network(reg_result);
3841 }
3842
3843 ret = qemu_rdma_post_send_control(rdma,
3844 (uint8_t *) results, &reg_resp);
3845
3846 if (ret < 0) {
3847 error_report("Failed to send control buffer");
3848 goto out;
3849 }
3850 break;
3851 case RDMA_CONTROL_UNREGISTER_REQUEST:
3852 trace_qemu_rdma_registration_handle_unregister(head.repeat);
3853 unreg_resp.repeat = head.repeat;
3854 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3855
3856 for (count = 0; count < head.repeat; count++) {
3857 reg = &registers[count];
3858 network_to_register(reg);
3859
3860 trace_qemu_rdma_registration_handle_unregister_loop(count,
3861 reg->current_index, reg->key.chunk);
3862
3863 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3864
3865 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3866 block->pmr[reg->key.chunk] = NULL;
3867
3868 if (ret != 0) {
3869 perror("rdma unregistration chunk failed");
3870 ret = -ret;
3871 goto out;
3872 }
3873
3874 rdma->total_registrations--;
3875
3876 trace_qemu_rdma_registration_handle_unregister_success(
3877 reg->key.chunk);
3878 }
3879
3880 ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
3881
3882 if (ret < 0) {
3883 error_report("Failed to send control buffer");
3884 goto out;
3885 }
3886 break;
3887 case RDMA_CONTROL_REGISTER_RESULT:
3888 error_report("Invalid RESULT message at dest.");
3889 ret = -EIO;
3890 goto out;
3891 default:
3892 error_report("Unknown control message %s", control_desc(head.type));
3893 ret = -EIO;
3894 goto out;
3895 }
3896 } while (1);
3897out:
3898 if (ret < 0) {
3899 rdma->error_state = ret;
3900 }
3901 return ret;
3902}
3903
3904
3905
3906
3907
3908
3909
3910
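/*
 * Called once per RAMBlock name announced by the source; remember the
 * source's index for that block so registration requests can be
 * matched to the right local block later.
 */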
3911static int
3912rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
3913{
3914 RDMAContext *rdma;
3915 int curr;
3916 int found = -1;
3917
3918 RCU_READ_LOCK_GUARD();
3919 rdma = qatomic_rcu_read(&rioc->rdmain);
3920
3921 if (!rdma) {
3922 return -EIO;
3923 }
3924
3925
3926 for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3927 if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3928 found = curr;
3929 break;
3930 }
3931 }
3932
3933 if (found == -1) {
3934 error_report("RAMBlock '%s' not found on destination", name);
3935 return -ENOENT;
3936 }
3937
3938 rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3939 trace_rdma_block_notification_handle(name, rdma->next_src_index);
3940 rdma->next_src_index++;
3941
3942 return 0;
3943}
3944
3945static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
3946{
3947 switch (flags) {
3948 case RAM_CONTROL_BLOCK_REG:
3949 return rdma_block_notification_handle(opaque, data);
3950
3951 case RAM_CONTROL_HOOK:
3952 return qemu_rdma_registration_handle(f, opaque);
3953
3954 default:
3955
3956 abort();
3957 }
3958}
3959
3960static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
3961 uint64_t flags, void *data)
3962{
3963 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3964 RDMAContext *rdma;
3965
3966 RCU_READ_LOCK_GUARD();
3967 rdma = qatomic_rcu_read(&rioc->rdmaout);
3968 if (!rdma) {
3969 return -EIO;
3970 }
3971
3972 CHECK_ERROR_STATE();
3973
3974 if (migration_in_postcopy()) {
3975 return 0;
3976 }
3977
3978 trace_qemu_rdma_registration_start(flags);
3979 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3980 qemu_fflush(f);
3981
3982 return 0;
3983}
3984
3985
3986
3987
3988
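/*
 * Called after each RAM iteration: flush and drain outstanding writes,
 * exchange the RAM block list on the first (RAM_CONTROL_SETUP) pass,
 * and tell the destination that this registration round is finished.
 */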
3989static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
3990 uint64_t flags, void *data)
3991{
3992 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3993 RDMAContext *rdma;
3994 RDMAControlHeader head = { .len = 0, .repeat = 1 };
3995 int ret = 0;
3996
3997 RCU_READ_LOCK_GUARD();
3998 rdma = qatomic_rcu_read(&rioc->rdmaout);
3999 if (!rdma) {
4000 return -EIO;
4001 }
4002
4003 CHECK_ERROR_STATE();
4004
4005 if (migration_in_postcopy()) {
4006 return 0;
4007 }
4008
4009 qemu_fflush(f);
4010 ret = qemu_rdma_drain_cq(f, rdma);
4011
4012 if (ret < 0) {
4013 goto err;
4014 }
4015
4016 if (flags == RAM_CONTROL_SETUP) {
4017 RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
4018 RDMALocalBlocks *local = &rdma->local_ram_blocks;
4019 int reg_result_idx, i, nb_dest_blocks;
4020
4021 head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
4022 trace_qemu_rdma_registration_stop_ram();
4023
4024
4025
4026
4027
4028
4029
4030
4031
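/*
 * Ask the destination for its RAM block list.  When pinning all
 * memory we register our own blocks while waiting for the reply.
 */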
4032 ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
4033 &reg_result_idx, rdma->pin_all ?
4034 qemu_rdma_reg_whole_ram_blocks : NULL);
4035 if (ret < 0) {
4036 fprintf(stderr, "error receiving remote info!\n");
4037 return ret;
4038 }
4039
4040 nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054 if (local->nb_blocks != nb_dest_blocks) {
4055 fprintf(stderr, "ram blocks mismatch (number of blocks %d vs %d). "
4056 "Your QEMU command line parameters are probably "
4057 "not identical on both the source and destination.\n",
4058 local->nb_blocks, nb_dest_blocks);
4059 rdma->error_state = -EINVAL;
4060 return -EINVAL;
4061 }
4062
4063 qemu_rdma_move_header(rdma, reg_result_idx, &resp);
4064 memcpy(rdma->dest_blocks,
4065 rdma->wr_data[reg_result_idx].control_curr, resp.len);
4066 for (i = 0; i < nb_dest_blocks; i++) {
4067 network_to_dest_block(&rdma->dest_blocks[i]);
4068
4069
4070 if (rdma->dest_blocks[i].length != local->block[i].length) {
4071 fprintf(stderr, "Block %s/%d has a different length %" PRIu64
4072 " vs %" PRIu64 "\n", local->block[i].block_name, i,
4073 local->block[i].length,
4074 rdma->dest_blocks[i].length);
4075 rdma->error_state = -EINVAL;
4076 return -EINVAL;
4077 }
4078 local->block[i].remote_host_addr =
4079 rdma->dest_blocks[i].remote_host_addr;
4080 local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
4081 }
4082 }
4083
4084 trace_qemu_rdma_registration_stop(flags);
4085
4086 head.type = RDMA_CONTROL_REGISTER_FINISHED;
4087 ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
4088
4089 if (ret < 0) {
4090 goto err;
4091 }
4092
4093 return 0;
4094err:
4095 rdma->error_state = ret;
4096 return ret;
4097}
4098
4099static const QEMUFileHooks rdma_read_hooks = {
4100 .hook_ram_load = rdma_load_hook,
4101};
4102
4103static const QEMUFileHooks rdma_write_hooks = {
4104 .before_ram_iterate = qemu_rdma_registration_start,
4105 .after_ram_iterate = qemu_rdma_registration_stop,
4106 .save_page = qemu_rdma_save_page,
4107};
4108
4109
4110static void qio_channel_rdma_finalize(Object *obj)
4111{
4112 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
4113 if (rioc->rdmain) {
4114 qemu_rdma_cleanup(rioc->rdmain);
4115 g_free(rioc->rdmain);
4116 rioc->rdmain = NULL;
4117 }
4118 if (rioc->rdmaout) {
4119 qemu_rdma_cleanup(rioc->rdmaout);
4120 g_free(rioc->rdmaout);
4121 rioc->rdmaout = NULL;
4122 }
4123}
4124
4125static void qio_channel_rdma_class_init(ObjectClass *klass,
4126 void *class_data G_GNUC_UNUSED)
4127{
4128 QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
4129
4130 ioc_klass->io_writev = qio_channel_rdma_writev;
4131 ioc_klass->io_readv = qio_channel_rdma_readv;
4132 ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
4133 ioc_klass->io_close = qio_channel_rdma_close;
4134 ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
4135 ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
4136 ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
4137}
4138
4139static const TypeInfo qio_channel_rdma_info = {
4140 .parent = TYPE_QIO_CHANNEL,
4141 .name = TYPE_QIO_CHANNEL_RDMA,
4142 .instance_size = sizeof(QIOChannelRDMA),
4143 .instance_finalize = qio_channel_rdma_finalize,
4144 .class_init = qio_channel_rdma_class_init,
4145};
4146
4147static void qio_channel_rdma_register_types(void)
4148{
4149 type_register_static(&qio_channel_rdma_info);
4150}
4151
4152type_init(qio_channel_rdma_register_types);
4153
4154static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
4155{
4156 QIOChannelRDMA *rioc;
4157
4158 if (qemu_file_mode_is_not_valid(mode)) {
4159 return NULL;
4160 }
4161
4162 rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4163
4164 if (mode[0] == 'w') {
4165 rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc));
4166 rioc->rdmaout = rdma;
4167 rioc->rdmain = rdma->return_path;
4168 qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
4169 } else {
4170 rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc));
4171 rioc->rdmain = rdma;
4172 rioc->rdmaout = rdma->return_path;
4173 qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
4174 }
4175
4176 return rioc->file;
4177}
4178
4179static void rdma_accept_incoming_migration(void *opaque)
4180{
4181 RDMAContext *rdma = opaque;
4182 int ret;
4183 QEMUFile *f;
4184 Error *local_err = NULL;
4185
4186 trace_qemu_rdma_accept_incoming_migration();
4187 ret = qemu_rdma_accept(rdma);
4188
4189 if (ret) {
4190 fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
4191 return;
4192 }
4193
4194 trace_qemu_rdma_accept_incoming_migration_accepted();
4195
4196 if (rdma->is_return_path) {
4197 return;
4198 }
4199
4200 f = qemu_fopen_rdma(rdma, "rb");
4201 if (f == NULL) {
4202 fprintf(stderr, "RDMA ERROR: could not qemu_fopen_rdma\n");
4203 qemu_rdma_cleanup(rdma);
4204 return;
4205 }
4206
4207 rdma->migration_started_on_destination = 1;
4208 migration_fd_process_incoming(f, &local_err);
4209 if (local_err) {
4210 error_reportf_err(local_err, "RDMA ERROR: ");
4211 }
4212}
4213
4214void rdma_start_incoming_migration(const char *host_port, Error **errp)
4215{
4216 int ret;
4217 RDMAContext *rdma, *rdma_return_path = NULL;
4218 Error *local_err = NULL;
4219
4220 trace_rdma_start_incoming_migration();
4221
4222
4223 if (ram_block_discard_is_required()) {
4224 error_setg(errp, "RDMA: cannot disable RAM discard");
4225 return;
4226 }
4227
4228 rdma = qemu_rdma_data_init(host_port, &local_err);
4229 if (rdma == NULL) {
4230 goto err;
4231 }
4232
4233 ret = qemu_rdma_dest_init(rdma, &local_err);
4234
4235 if (ret) {
4236 goto err;
4237 }
4238
4239 trace_rdma_start_incoming_migration_after_dest_init();
4240
4241 ret = rdma_listen(rdma->listen_id, 5);
4242
4243 if (ret) {
4244 ERROR(errp, "listening on socket!");
4245 goto cleanup_rdma;
4246 }
4247
4248 trace_rdma_start_incoming_migration_after_rdma_listen();
4249
4250 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
4251 NULL, (void *)(intptr_t)rdma);
4252 return;
4253
4254cleanup_rdma:
4255 qemu_rdma_cleanup(rdma);
4256err:
4257 error_propagate(errp, local_err);
4258 if (rdma) {
4259 g_free(rdma->host);
4260 g_free(rdma->host_port);
4261 }
4262 g_free(rdma);
4263 g_free(rdma_return_path);
4264}
4265
4266void rdma_start_outgoing_migration(void *opaque,
4267 const char *host_port, Error **errp)
4268{
4269 MigrationState *s = opaque;
4270 RDMAContext *rdma_return_path = NULL;
4271 RDMAContext *rdma;
4272 int ret = 0;
4273
4274
4275 if (ram_block_discard_is_required()) {
4276 error_setg(errp, "RDMA: cannot disable RAM discard");
4277 return;
4278 }
4279
4280 rdma = qemu_rdma_data_init(host_port, errp);
4281 if (rdma == NULL) {
4282 goto err;
4283 }
4284
4285 ret = qemu_rdma_source_init(rdma,
4286 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4287
4288 if (ret) {
4289 goto err;
4290 }
4291
4292 trace_rdma_start_outgoing_migration_after_rdma_source_init();
4293 ret = qemu_rdma_connect(rdma, errp, false);
4294
4295 if (ret) {
4296 goto err;
4297 }
4298
4299
4300 if (migrate_postcopy()) {
4301 rdma_return_path = qemu_rdma_data_init(host_port, errp);
4302
4303 if (rdma_return_path == NULL) {
4304 goto return_path_err;
4305 }
4306
4307 ret = qemu_rdma_source_init(rdma_return_path,
4308 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4309
4310 if (ret) {
4311 goto return_path_err;
4312 }
4313
4314 ret = qemu_rdma_connect(rdma_return_path, errp, true);
4315
4316 if (ret) {
4317 goto return_path_err;
4318 }
4319
4320 rdma->return_path = rdma_return_path;
4321 rdma_return_path->return_path = rdma;
4322 rdma_return_path->is_return_path = true;
4323 }
4324
4325 trace_rdma_start_outgoing_migration_after_rdma_connect();
4326
4327 s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
4328 migrate_fd_connect(s, NULL);
4329 return;
4330return_path_err:
4331 qemu_rdma_cleanup(rdma);
4332err:
4333 g_free(rdma);
4334 g_free(rdma_return_path);
4335}
4336